[1-6] Text Classification

Spam classification

An end-to-end example that trains a spam classifier on the SMS Spam Collection dataset:

import csv
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def modelbuilding(sms_data, sms_labels):
	'''
	This is an example pipeline for building a text classifier:
	1. sampling
	2. TfidfVectorizer conversion
	3. building a naive_bayes model
	4. print the accuracy and other metrics
	5. print the most relevant features
	'''

	# sampling step: a 70:30 train/test split
	trainset_size = int(round(len(sms_data) * 0.70))
	print 'The training set size for this classifier is ' + str(trainset_size) + '\n'
	x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
	y_train = np.array([el for el in sms_labels[0:trainset_size]])
	x_test = np.array([''.join(el) for el in sms_data[trainset_size:]])
	y_test = np.array([el for el in sms_labels[trainset_size:]])
	print x_train
	print y_train

	# CountVectorizer: not used by the pipeline, shown only for explanation
	from sklearn.feature_extraction.text import CountVectorizer
	vectorizer = CountVectorizer(min_df=1)
	X_exp = vectorizer.fit_transform(sms_data)  # sms_data is already preprocessed text
	print "||".join(vectorizer.get_feature_names())
	print X_exp.toarray()

	# We are building a TFIDF vectorizer here
	from sklearn.feature_extraction.text import TfidfVectorizer
	vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
	                             strip_accents='unicode', norm='l2')
	X_train = vectorizer.fit_transform(x_train)
	X_test = vectorizer.transform(x_test)

	from sklearn.naive_bayes import MultinomialNB
	from sklearn.metrics import classification_report, confusion_matrix
	clf = MultinomialNB().fit(X_train, y_train)
	y_nb_predicted = clf.predict(X_test)
	print y_nb_predicted
	print '\n confusion_matrix \n'
	cm = confusion_matrix(y_test, y_nb_predicted)
	print cm
	print '\n Here is the classification report:'
	print classification_report(y_test, y_nb_predicted)
	# print the top features

	feature_names = vectorizer.get_feature_names()
	coefs = clf.coef_
	intercept = clf.intercept_
	coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
	n = 10
	top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
	for (coef_1, fn_1), (coef_2, fn_2) in top:
		print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))
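With the model fitted, the same vectorizer/classifier pair can label unseen messages. A minimal sketch, assuming vectorizer and clf are kept in scope (the two sample SMS strings are made up for illustration):

new_msgs = ['WINNER!! You have won a free ticket, claim now',
            'ok lor... joking with you only']
X_new = vectorizer.transform(new_msgs)  # reuse the fitted TFIDF vocabulary; do not refit
print clf.predict(X_new)  # expect something like ['spam' 'ham']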

def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]

    # lowercase first, so stopword removal also catches capitalized forms
    tokens = [word.lower() for word in tokens]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text
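A quick sanity check of preprocessing (the input string is made up; exact output depends on your NLTK data):

print preprocessing("Free entry in 2 a wkly comp to win FA Cup final tkts!")
# prints something like: free entry wkly comp win cup final tkts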

def main():
	# check the structure of this file: one "<label>\t<message>" record per line
	smsdata = open('SMSSpamCollection')
	sms_data = []
	sms_labels = []
	csv_reader = csv.reader(smsdata, delimiter='\t')
	for line in csv_reader:
		# adding the label (ham/spam)
		sms_labels.append(line[0])
		# adding the cleaned text; we are calling the preprocessing method
		sms_data.append(preprocessing(line[1]))
	smsdata.close()
	# we are calling the model building function here
	modelbuilding(sms_data, sms_labels)

if __name__ == '__main__':
	main()
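For reference, SMSSpamCollection is a plain-text file with one tab-separated label/message record per line, roughly like this (abridged):

ham	Go until jurong point, crazy.. Available only in bugis n great world...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts...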

Other models

The TFIDF features (X_train, X_test) built above can be fed to other scikit-learn classifiers in exactly the same way:

# SGD, commonly used in practice

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
clf = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, y_pred)
print ' \n confusion_matrix \n '
cm = confusion_matrix(y_test, y_pred)
print cm
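The alpha above is just a starting point; a small sketch of tuning it with grid search (illustrative values, using the old-style sklearn.grid_search import to match the n_iter API used here):

from sklearn.grid_search import GridSearchCV
params = {'alpha': [1e-2, 1e-3, 1e-4, 1e-5]}  # illustrative grid, not tuned
grid = GridSearchCV(SGDClassifier(n_iter=50), params, cv=5)
grid.fit(X_train, y_train)
print grid.best_params_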

# SVM
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, y_svm_predicted)
print '\n confusion_matrix \n'
cm = confusion_matrix(y_test, y_svm_predicted)
print cm
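LinearSVC is a linear model too, so the top-feature inspection used for naive Bayes carries over. A sketch, assuming the fitted TfidfVectorizer is still in scope; with ham/spam labels, classes_ sorts alphabetically, so positive weights lean toward spam:

feature_names = vectorizer.get_feature_names()
coefs_with_fns = sorted(zip(svm_classifier.coef_[0], feature_names))
print "most ham-like:", coefs_with_fns[:10]
print "most spam-like:", coefs_with_fns[:-11:-1]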

# RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
# note: very old scikit-learn versions need dense input here, i.e. X_train.toarray()
RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, predicted)
print '\n confusion_matrix \n'
cm = confusion_matrix(y_test, predicted)
print cm
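A random forest also exposes per-feature importances; a minimal sketch for listing the strongest ones (again assuming feature_names comes from the fitted TfidfVectorizer):

import numpy as np
feature_names = vectorizer.get_feature_names()
top = np.argsort(RF_clf.feature_importances_)[::-1][:10]  # indices of the 10 largest importances
for idx in top:
    print feature_names[idx], RF_clf.feature_importances_[idx]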

Text clustering

When labels are unavailable, the same TFIDF matrix can be grouped with k-means instead:

import collections
from sklearn.cluster import KMeans, MiniBatchKMeans

true_k = 5
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                        init_size=1000, batch_size=1000, verbose=False)
# we are using the same train/test data in TFIDF form as in text classification

km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)

print "For K-means clustering"
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print clustering

print "For K-means mini-batch clustering"
clustering_mini = collections.defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering_mini[label].append(idx)
print clustering_mini
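To see what each cluster is about, sort each centroid's weights in descending order and map the indices back through the vectorizer (a sketch, assuming the TfidfVectorizer from the classification section is in scope):

order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print "Cluster %d:" % i, ", ".join(terms[ind] for ind in order_centroids[i, :10])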

Topic modeling

gensim builds its own dictionary and bag-of-words corpus from the preprocessed SMS texts, then fits LSI and LDA models on top:

from gensim import corpora, models
from nltk.corpus import stopwords

documents = [document for document in sms_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# LSI
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# lsi.print_topics(20)

# LDA
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
for i in range(0, n_topics):
	temp = lda.show_topic(i, 10)
	terms = [term[1] for term in temp]
	print "Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms)
