#coding=utf-8import xlrdimport distancefrom sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizerimport numpy as npfrom scipy.linalg import normworkbook = xlrd.open_workbook(u'工程师问答.xls')sheet_names= workbook.sheet_names()ls = []for sheet_name in sheet_names: sheet1 = workbook.sheet_by_name(sheet_name) for i in range(1, 3858): row = sheet1.row_values(i) ls.append(row[0])# print len(ls)target = u'D90的发动机热效率是多少?'print u'目标语句:' + target# 编辑距离计算def edit_distance(s1, s2): return distance.levenshtein(s1, s2)results = list(filter(lambda x: edit_distance(x, target) <= 5, ls))print u'1)编辑距离计算,阈值为5'for i in results: print i# 杰卡德系数计算def jaccard_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 将字中间加入空格 s1, s2 = add_space(s1), add_space(s2) # 转化为TF矩阵 cv = CountVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 求交集 numerator = np.sum(np.min(vectors, axis=0)) # 求并集 denominator = np.sum(np.max(vectors, axis=0)) # 计算杰卡德系数 return 1.0 * numerator / denominatorresults = list(filter(lambda x: jaccard_similarity(x, target) > 0.6, ls))print u'2)杰卡德系数计算,阈值为0.6'for i in results: print i# TF 计算def tf_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 将字中间加入空格 s1, s2 = add_space(s1), add_space(s2) # 转化为TF矩阵 cv = CountVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 计算TF系数 return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))results = list(filter(lambda x: tf_similarity(x, target) > 0.7, ls))print u'3)TF 计算,阈值为0.7'for i in results: print i# TFIDF 系数def tfidf_similarity(s1, s2): def add_space(s): return ' '.join(list(s)) # 将字中间加入空格 s1, s2 = add_space(s1), add_space(s2) # 转化为TF矩阵 cv = TfidfVectorizer(tokenizer=lambda s: s.split()) corpus = [s1, s2] vectors = cv.fit_transform(corpus).toarray() # 计算TF系数 return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))results = list(filter(lambda x: tfidf_similarity(x, target) > 0.6, ls))print u'4)TFIDF 系数,阈值为0.6'for i in results: print i