[1-6] Text Classification

Spam Classification
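
The example below builds an SMS spam classifier on the SMS Spam Collection corpus, a tab-separated file with one labelled (ham/spam) message per line. The pipeline is: load and clean the raw text, convert it into TF-IDF features, train a naive Bayes model, and print the evaluation metrics and the most relevant features.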

import csv

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report

def modelbuilding(sms_data, sms_labels):
    '''
    This is an example pipeline for building a text classifier:
    1. sampling
    2. TfidfVectorizer conversion
    3. building a naive_bayes model
    4. printing the accuracy and other metrics
    5. printing the most relevant features
    '''

    # sampling step: a 70:30 train/test split
    # (for a real experiment, shuffle the corpus before splitting)
    trainset_size = int(round(len(sms_data) * 0.70))
    print('The training set size for this classifier is ' + str(trainset_size) + '\n')
    x_train = np.array(sms_data[:trainset_size])
    y_train = np.array(sms_labels[:trainset_size])
    x_test = np.array(sms_data[trainset_size:])
    y_test = np.array(sms_labels[trainset_size:])
    print(x_train)
    print(y_train)

    # CountVectorizer: not used by the classifier below, shown only for explanation
    from sklearn.feature_extraction.text import CountVectorizer
    sms_exp = sms_data[:5]  # a few preprocessed messages as a toy corpus
    vectorizer = CountVectorizer(min_df=1)
    X_exp = vectorizer.fit_transform(sms_exp)
    print('||'.join(vectorizer.get_feature_names_out()))
    print(X_exp.toarray())

    # build the TF-IDF vectorizer over unigrams and bigrams
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                                 strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)  # transform only: reuse the fitted vocabulary

    # train a multinomial naive Bayes model and evaluate it on the held-out set
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB().fit(X_train, y_train)
    y_nb_predicted = clf.predict(X_test)
    print(y_nb_predicted)
    print('\n confusion_matrix \n')
    cm = confusion_matrix(y_test, y_nb_predicted)
    print(cm)
    print('\n Here is the classification report:')
    print(classification_report(y_test, y_nb_predicted))
	# print the top features 

    feature_names = vectorizer.get_feature_names_out()
    # feature_log_prob_[1] holds the per-feature log-probabilities of the second
    # class ('spam', since classes_ is sorted); clf.coef_ has been removed from
    # the naive Bayes estimators in recent scikit-learn releases
    coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

def preprocessing(text):
    # in Python 3 the file is read as str, so no .decode('utf8') is needed
    # tokenize into sentences, then into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    # lowercase first, so that stopword removal also catches capitalized forms
    tokens = [word.lower() for word in tokens]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text
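
The tokenizer, stopword list, and lemmatizer used in preprocessing() rely on NLTK data packages that are not installed by default; a one-off download along these lines is needed first:

import nltk
nltk.download('punkt')      # tokenizer models for sent_tokenize/word_tokenize
nltk.download('stopwords')  # the English stopword list
nltk.download('wordnet')    # the lexical database behind WordNetLemmatizer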

def main():
    sms_data = []
    sms_labels = []
    # the corpus is one tab-separated record per line: <label>\t<message>
    with open('SMSSpamCollection', encoding='utf-8') as sms:
        csv_reader = csv.reader(sms, delimiter='\t')
        for line in csv_reader:
            # the first column is the ham/spam label
            sms_labels.append(line[0])
            # the second column is the raw message; clean it with preprocessing()
            sms_data.append(preprocessing(line[1]))
    # call the model-building function defined above
    modelbuilding(sms_data, sms_labels)

if __name__ == '__main__':
    main()

Other Models
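
The TF-IDF features built above can be fed to any other scikit-learn estimator. As a minimal sketch (assuming the X_train, y_train, X_test, y_test variables from the spam pipeline are in scope), a linear model trained with stochastic gradient descent is a common alternative for sparse text:

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# hinge loss makes this a linear SVM trained with SGD
clf_sgd = SGDClassifier(loss='hinge', alpha=1e-4, random_state=42)
clf_sgd.fit(X_train, y_train)
print(classification_report(y_test, clf_sgd.predict(X_test)))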

Text Clustering
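
Clustering groups messages by similarity without looking at the labels at all. A minimal sketch with k-means, assuming the TF-IDF matrix X_train from above (the number of clusters is an arbitrary choice here):

from sklearn.cluster import KMeans

# cluster the TF-IDF vectors into k groups; k=5 is arbitrary
km = KMeans(n_clusters=5, n_init=10, random_state=42)
km.fit(X_train)
print(km.labels_[:20])  # cluster ids assigned to the first 20 messages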

Topic Modeling
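
A topic model describes each document as a mixture of topics and each topic as a distribution over words. A minimal sketch with scikit-learn's LatentDirichletAllocation, assuming the preprocessed sms_data list from above (LDA is usually fit on raw term counts rather than TF-IDF weights):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

count_vec = CountVectorizer(min_df=2, stop_words='english')
X_counts = count_vec.fit_transform(sms_data)

# fit an LDA model with an arbitrary choice of 5 topics
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_counts)

# show the 8 highest-weighted words for each topic
terms = count_vec.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    top_words = [terms[i] for i in topic.argsort()[:-9:-1]]
    print('Topic %d: %s' % (idx, ' '.join(top_words)))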
