I. Based on gensim
1. Model class
```python
import os
import jieba
import pickle
import logging
import numpy as np
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq


class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")

    def del_stopwords(self, words):
        """Remove stop words from a single sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment multiple sentences into words (via jieba) or characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train the model."""
        # Build and save the corpus dictionary.
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)
        # Build the TF-IDF model.
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)
        # Build the retrieval index.
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Vectorize the sentence and query the index directly
        # (retrieval is against the training word_list).
        dic = corpora.Dictionary.load(self.dic_path)
        words = sentence
        word_bow = dic.doc2bow(self._seg_word([words])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the label array and sentence array from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")
        # Indices of the 30 largest scores, e.g. [4, 5, 2]
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))
        # The 30 largest score values, e.g. [9, 9, 6]
        print(heapq.nlargest(30, score_list))


if __name__ == '__main__':
    TfIdf().main()
```
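The index returns one similarity score per training sentence, so the indices printed by `heapq.nlargest` map straight back to the sentences and labels from `get_train_data()`. A minimal usage sketch, assuming `train()` has already been run on those same sentences:

```python
# Minimal retrieval sketch (assumes TfIdf().train() was already run on the
# sentences returned by get_train_data(), so index i in the score array
# corresponds to sentences[i]).
model = TfIdf()
labels, sentences = model.get_train_data()
scores = model.predict("我有困难还不了")

# Pair every training sentence with its similarity score and keep the top 3.
top3 = sorted(zip(scores, labels, sentences), key=lambda t: t[0], reverse=True)[:3]
for score, label, sentence in top3:
    print(f"{score:.4f}\t{label}\t{sentence}")
```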
2. Utility class
```python
import os
from root_path import root
import tqdm

stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")


def get_stop_list():
    """Build the stop-word list."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            # Keep only single-character entries.
            if len(line) == 1:
                stop_word_list.append(line)
    return stop_word_list
```
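Because the filter keeps only single-character lines, any multi-word entries in ChineseStopWords.txt are silently dropped; a quick inspection makes that visible. A small smoke test, assuming the stop-word file exists at the path configured above:

```python
from utils.word_process import get_stop_list

# Illustrative check only: count and preview the loaded stop words.
stop_words = get_stop_list()
print(len(stop_words))   # number of single-character stop words kept
print(stop_words[:20])   # e.g. punctuation and common function characters
```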
II. Based on sklearn
```python
import os
import jieba
import pickle
from root_path import root
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer


class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """Read the label array and sentence array from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        feature = tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer.
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer.
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        self.predict(sentence)


if __name__ == '__main__':
    TfIdf().main()
```
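Note that `predict()` above returns only the query's TF-IDF vector; unlike the gensim version, it never scores the query against the training corpus. A sketch of one way to close that gap (not part of the original code): transform the training sentences with the same pickled vectorizer and rank them by cosine similarity.

```python
import jieba
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical retrieval step on top of the class above; assumes train()
# has been run, so the pickled vectorizer exists at model.model_path.
model = TfIdf()
labels, sentences = model.get_train_data()
with open(model.model_path, "rb") as f:
    vectorizer = pickle.load(f)

doc_matrix = vectorizer.transform([" ".join(jieba.cut(s)) for s in sentences])
query_vec = vectorizer.transform([" ".join(jieba.cut("我有困难还不了"))])

# cosine_similarity returns a (1, n_docs) matrix; take row 0.
scores = cosine_similarity(query_vec, doc_matrix)[0]
best = scores.argmax()
print(labels[best], sentences[best], scores[best])
```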