# Toy CFG
>>> import nltk
>>> from nltk import CFG
>>> toy_grammar = CFG.fromstring("""
... # S is the entire sentence
... S -> NP VP
... # VP is a verb phrase; V is a verb (we use only two verbs in this example)
... VP -> V NP
... V -> "eats" | "drinks"
... # NP is a noun phrase (a chunk that contains a noun)
... NP -> Det N
... # Det is a determiner used in the sentences
... Det -> "a" | "an" | "the"
... # N covers some example nouns
... N -> "president" | "Obama" | "apple" | "coke"
... """)
>>> toy_grammar.productions()
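To check that the grammar actually licenses sentences, here is a minimal sketch (the choice of nltk.ChartParser and the sample sentence are ours, not from the original) that parses with the toy grammar:
>>> parser = nltk.ChartParser(toy_grammar)
>>> for tree in parser.parse("the president eats an apple".split()):
...     print(tree)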
# Chunking
>>> from nltk.chunk.regexp import ChunkRule, RegexpChunkParser
>>> test_sent = "The prime minister announced he had asked the chief government whip, Philip Ruddock, to call a special party room meeting for 9am on Monday to consider the spill motion."
>>> test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
>>> rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<PRP>)?', 'Chunk VPs')  # one or more verbs, optionally followed by a pronoun
>>> parser_vp = RegexpChunkParser([rule_vp], chunk_label='VP')
>>> print(parser_vp.parse(test_sent_pos))
>>> rule_np = ChunkRule(r'(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+', 'Chunk NPs')  # optional determiner/adverb, adjectives or cardinals, then one or more nouns
>>> parser_np = RegexpChunkParser([rule_np], chunk_label='NP')
>>> print(parser_np.parse(test_sent_pos))
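The two rules can also be combined into a single chunker with nltk.RegexpParser, so NP and VP chunks come out of one pass; a sketch reusing the same tag patterns:
>>> combined = nltk.RegexpParser(r"""
...     NP: {(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+}
...     VP: {(<VB.*>)?(<VB.*>)+(<PRP>)?}
... """)
>>> print(combined.parse(test_sent_pos))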
Information Extraction
The classic information extraction pipeline: raw text → sentence segmentation → tokenization → POS tagging → entity detection → relation extraction → relations.
Named Entity Recognition (NER)
# NP chunking (NER)
>>> f = open('/path/to/file.txt')  # absolute path of the text file we want NER on
>>> text = f.read()
>>> sentences = nltk.sent_tokenize(text)
>>> tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
>>> tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
>>> for sent in tagged_sentences:
...     print(nltk.ne_chunk(sent))
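nltk.ne_chunk returns a Tree whose entity subtrees carry labels such as PERSON, ORGANIZATION, and GPE; a small sketch (the filtering logic is ours) that prints just the labeled entities:
>>> for sent in tagged_sentences:
...     for subtree in nltk.ne_chunk(sent).subtrees():
...         if subtree.label() != 'S':  # skip the sentence root, keep entity chunks
...             print(subtree.label(), ' '.join(word for word, tag in subtree.leaves()))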
# Relation Extraction
>>> import re
>>> IN = re.compile(r'.*\bin\b(?!\b.+ing)')  # "in", but not followed by a gerund (e.g. "in supervising")
>>> for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
...     for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
...         print(nltk.sem.rtuple(rel))
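The same machinery handles other entity pairs and patterns; for example, a sketch (the role regex is illustrative, not from the original) that looks for person-organization role relations in the same IEER document:
>>> ROLE = re.compile(r'.*\b(president|chairman|director|analyst|spokesman)\b.*')
>>> for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
...     for rel in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLE):
...         print(nltk.sem.rtuple(rel))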