Todo

文本摘要

假设摘要其实就是基于句子对读者的重要性和意义,对句子进行排序。

在通常情况下,有相对较多实体和名词的句子比其他句子更重要。

import sys
f = open('nyt.txt', 'r')
news_content = f.read()

import nltk
results=[]
for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)):
    no_of_tokens=len(nltk.word_tokenize(sentence))
    # Let's do POS tagging
    tagged=nltk.pos_tag(nltk.word_tokenize(sentence))
    # Count the no of Nouns in the sentence
    no_of_nouns=len([word for word,pos in tagged if pos in ["NN","NNP"] ])
    #Use NER to tag the named entities.
    ners=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False)
    no_of_ners= len([chunk for chunk in ners if hasattr(chunk, 'node')])
    score=(no_of_ners+no_of_nouns)/float(no_of_toekns)
    results.append((sent_no,no_of_tokens,no_of_ners,\
no_of_nouns,score,sentence))

for sent in sorted(results,key=lambda x: x[4],reverse=True):
    print sent[5]

使用TF-IDF

>>>import nltk
>>>from sklearn.feature_extraction.text import TfidfVectorizer
>>>results=[]
>>>sentences=nltk.sent_tokenize(news_content)
>>>vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
>>>sklearn_binary=vectorizer.fit_transform(sentences)
>>>print countvectorizer.get_feature_names()
>>>print sklearn_binary.toarray()
>>>for sent_no,i in enumerate(sklearn_binary.toarray()):
>>>	results.append(sent_no,i.sum()/float(len(i.nonzero()[0])))

Last updated

Was this helpful?