[1-6] Text classification
Spam classification
import csv
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report

def modelbuilding(sms_data, sms_labels):
    '''
    This is an example pipeline for building a text classifier:
    1. sampling
    2. TfidfVectorizer conversion
    3. building a naive_bayes model
    4. printing the accuracy and other metrics
    5. printing the most relevant features
    '''
    # sampling step: 70:30 train/test split
    trainset_size = int(round(len(sms_data) * 0.70))
    print 'The training set size for this classifier is ' + str(trainset_size) + '\n'
    x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
    y_train = np.array([el for el in sms_labels[0:trainset_size]])
    x_test = np.array([''.join(el) for el in sms_data[trainset_size:len(sms_data)]])
    y_test = np.array([el for el in sms_labels[trainset_size:len(sms_labels)]])
    print x_train
    print y_train

    # CountVectorizer: not used by the model, shown only for explanation
    from sklearn.feature_extraction.text import CountVectorizer
    sms_exp = []
    for line in sms_data:
        sms_exp.append(line)  # sms_data already holds preprocessed text
    vectorizer = CountVectorizer(min_df=1)
    X_exp = vectorizer.fit_transform(sms_exp)
    print "||".join(vectorizer.get_feature_names())
    print X_exp.toarray()

    # the TF-IDF vectorizer that produces the actual features
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                 stop_words='english', strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)

    # multinomial naive Bayes classifier
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB().fit(X_train, y_train)
    y_nb_predicted = clf.predict(X_test)
    print y_nb_predicted
    print '\n confusion_matrix \n'
    cm = confusion_matrix(y_test, y_nb_predicted)
    print cm
    print '\n Here is the classification report:'
    print classification_report(y_test, y_nb_predicted)

    # print the top features: the most indicative terms for each class
    feature_names = vectorizer.get_feature_names()
    coefs = clf.coef_
    intercept = clf.intercept_
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lowercase
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def main():
    # check the structure of this file: each line is "<label>\t<sms text>"
    smsdata = open('SMSSpamCollection')
    sms_data = []
    sms_labels = []
    csv_reader = csv.reader(smsdata, delimiter='\t')
    for line in csv_reader:
        # adding the label (ham/spam)
        sms_labels.append(line[0])
        # adding the cleaned text; we are calling the preprocessing method
        sms_data.append(preprocessing(line[1]))
    smsdata.close()
    # we are calling the model building function here
    modelbuilding(sms_data, sms_labels)

if __name__ == '__main__':
    main()
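Once the TF-IDF vectorizer and the naive Bayes model are fitted, the same objects can score new, unseen messages. Below is a minimal sketch, assuming modelbuilding is adapted to return the fitted vectorizer and clf (the original version only prints its metrics); the example messages are made up.

vectorizer, clf = modelbuilding(sms_data, sms_labels)  # assumes `return vectorizer, clf` is added
new_sms = ["WINNER!! You have been selected for a free prize, call now",
           "are we still meeting for lunch today?"]
new_features = vectorizer.transform([preprocessing(s) for s in new_sms])
print clf.predict(new_features)        # array of predicted labels ('ham' or 'spam')
print clf.predict_proba(new_features)  # per-class probabilities from MultinomialNB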
Other models
# SGDClassifier is widely used for text; it trains a linear model with stochastic gradient descent
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
clf = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, y_pred)
print '\n confusion_matrix \n'
cm = confusion_matrix(y_test, y_pred)
print cm
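With the default hinge loss, SGDClassifier behaves like a linear SVM trained online; switching to logistic loss also gives probability estimates. A small hedged variation of the snippet above (the loss is named 'log' in the scikit-learn releases this code targets; newer releases call it 'log_loss' and rename n_iter to max_iter):

clf_log = SGDClassifier(loss='log', alpha=.0001, n_iter=50).fit(X_train, y_train)
print clf_log.predict_proba(X_test)[:5]  # class probabilities for the first five test messages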
# SVM: a linear support vector classifier
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, y_svm_predicted)
cm = confusion_matrix(y_test, y_svm_predicted)
print cm
# RandomForestClassifier: an ensemble of decision trees
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print '\n Here is the classification report:'
print classification_report(y_test, predicted)
cm = confusion_matrix(y_test, predicted)
print cm
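To choose between these models more systematically than eyeballing a single train/test split, cross-validation on the TF-IDF features is a common next step. A minimal sketch (the 5-fold split and the macro-F1 scoring are choices made here, not part of the original recipe):

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases

candidates = [('MultinomialNB', MultinomialNB()),
              ('SGDClassifier', SGDClassifier(alpha=.0001, n_iter=50)),
              ('LinearSVC', LinearSVC()),
              ('RandomForest', RandomForestClassifier(n_estimators=10))]
for name, model in candidates:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    print name, scores.mean()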
Text clustering
import collections
from sklearn.cluster import KMeans, MiniBatchKMeans
true_k = 5
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                        init_size=1000, batch_size=1000, verbose=True)
# we are using the same train/test data in TF-IDF form as in the text classification example
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print "For K-means clustering"
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print "For K-means mini-batch clustering"
clustering = collections.defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)
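The cluster ids by themselves are hard to interpret; one common way to characterise a K-means cluster is to look at the heaviest TF-IDF terms in its centroid. A minimal sketch, assuming the fitted TfidfVectorizer from the classification example is still in scope as vectorizer:

terms = vectorizer.get_feature_names()
order_centroids = km_model.cluster_centers_.argsort()[:, ::-1]  # term indices sorted by centroid weight
for cluster_id in range(true_k):
    top_terms = [terms[ind] for ind in order_centroids[cluster_id, :10]]
    print "Cluster %d: %s" % (cluster_id, ", ".join(top_terms))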
Topic modeling
from gensim import corpora, models, similarities
from itertools import chain
import nltk
from nltk.corpus import stopwords
from operator import itemgetter
import re

documents = [document for document in sms_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# LSI (latent semantic indexing) on the TF-IDF corpus
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# lsi.print_topics(20)

# LDA (latent Dirichlet allocation)
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
for i in range(0, n_topics):
    temp = lda.show_topic(i, 10)
    terms = []
    for term in temp:
        terms.append(term[1])  # (weight, word) pairs in older gensim; newer releases return (word, weight)
    print "Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms)