对应的 Python 解析器有 csv、HTMLParser、SAX Parser、DOM Parser、XMLParser、pyodbc、json、PDFMiner 等。
例如
# csv load
>>>import csv
>>>with open('example.csv','rb') as f:
>>>    reader=csv.reader(f,delimiter=',',quotechar='"')
>>>    for line in reader:
>>>        print line[1] # assuming the second field is the raw string
# json load
>>>import json
>>>jsonfile=open('example.json')
>>>data=json.load(jsonfile)
>>>print data['string']
>>>inputstring = ' This is an example sent. The sentence splitter will split on sent markers. Ohh really !!'
>>>from nltk.tokenize import sent_tokenize
>>>all_sent=sent_tokenize(inputstring)
>>>print all_sent
[' This is an example sent.', 'The sentence splitter will split on sent markers.', 'Ohh really !!']
# stop word
>>>from nltk.corpus import stopwords
>>>stoplist=stopwords.words('english') # config the language name
>>>text = "This is just a test"
>>>cleanwordlist=[word for word in text.split() if word not in stoplist]
# rare word removal
>>>freq_dist=nltk.FreqDist(token)
>>>rarewords =freq_dist.keys()[-50:]
>>>after_rare_words= [ word for word in token if word not in rarewords]