import re
import string
# Regex character class listing ASCII and CJK/full-width punctuation and
# symbol characters; the trailing ``+`` makes it match whole runs of them.
remove_nota = u'[’·°–!"#$%&\'()*+,-./:;<=>?@,。?★、…【】()《》?“”‘’![\\]^_`{|}~]+'
# ``str.translate`` table that maps every ``string.punctuation`` code point
# to None, i.e. deletes it.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}
def filter_str(sentence):
    """Return ``sentence`` with punctuation and symbol characters removed.

    Runs of characters matched by the ``remove_nota`` pattern are deleted
    first, then every character listed in ``remove_punctuation_map`` is
    dropped, and finally leading/trailing whitespace is trimmed.
    """
    without_symbols = re.compile(remove_nota).sub('', sentence)
    cleaned = without_symbols.translate(remove_punctuation_map)
    return cleaned.strip()
# Detect whether a string is Chinese, Japanese, Korean, or English.
# Ordered (language code, compiled character-class pattern) pairs used by
# judge_language. The order fixes both which script is tested first and the
# order of codes in a mixed-language result.
_LANGUAGE_PATTERNS = (
    ('en', re.compile(u"[a-zA-Z]+")),                       # Latin letters
    ('zh', re.compile(u"[\u4e00-\u9fa5]+")),                # CJK ideographs
    ('ko', re.compile(u"[\uac00-\ud7ff]+")),                # Hangul syllables
    ('ja', re.compile(u"[\u30a0-\u30ff\u3040-\u309f]+")),   # katakana + hiragana
)


def judge_language(s):
    """Guess which of English/Chinese/Korean/Japanese a string is written in.

    Punctuation (via ``filter_str``) and ASCII digits are removed before the
    remaining characters are matched against each script's code-point range.

    Args:
        s: Text to classify. It must be a text (unicode) string; the original
            author noted that Python 2 callers have to convert to unicode
            first.

    Returns:
        * ``'en'``, ``'zh'``, ``'ko'`` or ``'ja'`` when, ignoring whitespace,
          every remaining character belongs to that single script.
        * Otherwise the codes of all detected scripts joined by commas, in
          the fixed order en, zh, ko, ja (e.g. ``'en,zh'``).
        * ``''`` when no supported script is found, including when nothing
          is left after punctuation and digits are removed.

    Note:
        The ``'ja'`` pattern covers only the hiragana and katakana ranges, so
        ideographs in U+4E00-U+9FA5 are always reported as ``'zh'``.
    """
    s = filter_str(s)
    s = re.sub('[0-9]', '', s).strip()

    # Nothing left to classify (empty, digits-only or punctuation-only
    # input). Without this guard the "remainder is empty" test below would
    # trivially succeed for the first script and wrongly report 'en'.
    if not s:
        return ''

    result = []
    for code, pattern in _LANGUAGE_PATTERNS:
        if not pattern.search(s):
            continue
        result.append(code)
        # If removing this script's characters leaves only whitespace, the
        # whole text is in this one script.
        if not pattern.sub('', s).strip():
            return code
    return ','.join(result)