from collections import Counter

import nltk

# settings
n = 3                  # 3-gram
fn = "../data/ptb.train.txt.UNK"
all_words = set()      # every word that appears in the corpus (the vocabulary)
ngrams_list = []       # n-gram tuples (numerators)
prefix_list = []       # (n-1)-gram tuples (denominators)

# build the corpus statistics
with open(fn) as f:
    for line in f:
        sentence = line.split()
        word_list = [x.lower() for x in sentence]
        for word in word_list:
            word = word.strip()
            all_words.add(word)
        ngrams = list(zip(*[sentence[i:] for i in range(n)]))        # list of n-gram tuples in this sentence
        prefix = list(zip(*[sentence[i:] for i in range(n - 1)]))    # list of prefix tuples in this sentence
        ngrams_list += ngrams
        prefix_list += prefix

all_words = list(all_words)
ngrams_counter = Counter(ngrams_list)
prefix_counter = Counter(prefix_list)
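With these two counters, the conditional probability P(w_n | w_1, ..., w_{n-1}) = count(w_1 ... w_n) / count(w_1 ... w_{n-1}) can be estimated by lookup. A minimal sketch of such a lookup; the helper name `ngram_prob` and the add-one smoothing over `all_words` are assumptions, not part of the original code:

def ngram_prob(prefix, word):
    # estimated P(word | prefix) with add-one (Laplace) smoothing over the vocabulary
    num = ngrams_counter[tuple(prefix) + (word,)] + 1
    den = prefix_counter[tuple(prefix)] + len(all_words)
    return num / den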
To understand how the n-grams are generated, consider an example:
sentence = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
print([sentence[i:] for i in range(n)])
print(*[sentence[i:] for i in range(n)])
print(list(zip(*[sentence[i:] for i in range(n)])))
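With n = 3, these prints produce:

[['a', 'b', 'c', 'd', 'e', 'f', 'g'], ['b', 'c', 'd', 'e', 'f', 'g'], ['c', 'd', 'e', 'f', 'g']]
['a', 'b', 'c', 'd', 'e', 'f', 'g'] ['b', 'c', 'd', 'e', 'f', 'g'] ['c', 'd', 'e', 'f', 'g']
[('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e'), ('d', 'e', 'f'), ('e', 'f', 'g')]

The unpacked slices are each shifted by one word, so zipping them pairs every word with its next n-1 successors, which is exactly the list of n-grams in the sentence.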
# load the test passage
fn = "./sentences.txt"
with open(fn) as f:
    line = f.read()

# tokenize
words = nltk.tokenize.word_tokenize(line)
words = [x.lower() for x in words]
print(" ".join(words))

# replace words not present in the corpus with UNK, keeping the blank markers 1-20 intact
blank_ids = [str(k) for k in range(1, 21)]
for i in range(len(words)):
    if words[i] in blank_ids:
        continue
    if words[i] not in all_words:
        words[i] = "UNK"
print(" ".join(words))

sentence = words
# load the answer options for each blank
options = []
with open("./options.txt") as f:
    for line in f:
        ll = line.split()
        words = []
        for s in ll:
            word = s.split(".")[-1].lower()  # e.g. "A.word" -> "word"
            words.append(word)
        options.append(words)
print(options)
# load the test labels (the answer key)
with open("./answers.txt") as f:
    answers = f.readline().strip()
print(answers)
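The accuracy computation below compares the answer key against `choices`, the per-blank predictions, which come from the prediction step not shown in this section. A hypothetical sketch of how such a `choices` list could be built from the counters above, reusing the assumed `ngram_prob` helper and assuming each blank marker appears verbatim in `sentence` and that the answer key uses lower-case option letters a-d:

choices = []
for blank_i, opts in enumerate(options, start=1):
    pos = sentence.index(str(blank_i))              # position of this blank's marker in the passage
    prefix = sentence[pos - (n - 1):pos]            # the n-1 words immediately before the blank
    scores = [ngram_prob(prefix, w) for w in opts]  # score every candidate word
    best = scores.index(max(scores))
    choices.append("abcd"[best])                    # assumed mapping from index to option letter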
def accuracy(answers, choices):
    n = len(answers)
    c = 0
    for i in range(n):
        if answers[i] == choices[i]:
            c += 1
    return c * 1.0 / n

a = accuracy(answers, choices)
print(a)