NLP (23): Computing Sentence Vectors with TF-IDF and Measuring Similarity

Abstract: two ways to turn sentences into TF-IDF vectors and compute their similarity, one based on gensim and one based on sklearn.

I. Based on gensim

1. The model class

import os
import jieba
import pickle
import logging
import numpy as np
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq

class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()

        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")


    def del_stopwords(self, words):
        """Remove stop words from a tokenized sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment a list of sentences into words (with jieba) or into single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train the model."""
        # Build and save the corpus dictionary
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)

        # Build the TF-IDF model
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)

        # Build the similarity index used for retrieval
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Turn the sentence into a TF-IDF vector and return its similarity scores
        # against the training corpus (the index was built from word_list).
        dic = corpora.Dictionary.load(self.dic_path)
        words = sentence
        word_bow = dic.doc2bow(self._seg_word([words])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the labels and sentences from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")  # strip line breaks
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")

        # Indices of the 30 most similar training sentences
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

        # The corresponding similarity scores
        print(heapq.nlargest(30, score_list))



if __name__ == '__main__':
    TfIdf().main()
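
The scores returned by predict are aligned with the order of the training sentences, so the indices picked out by heapq.nlargest can be mapped straight back to the most similar sentences. Below is a minimal retrieval sketch built on the class above; the helper name most_similar and the top_k parameter are my own additions for illustration.

import heapq

def most_similar(model, sentences, query, top_k=3):
    """Return the top_k training sentences most similar to the query, with their scores."""
    scores = model.predict(query)  # one similarity score per training sentence
    top_idx = heapq.nlargest(top_k, range(len(scores)), scores.__getitem__)
    return [(sentences[i], float(scores[i])) for i in top_idx]

# Example usage, assuming the model has already been trained on the same sentences:
# tfidf = TfIdf()
# labels, sentences = tfidf.get_train_data()
# tfidf.train(sentences)
# print(most_similar(tfidf, sentences, "我有困难还不了"))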

2. The utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """Build the stop word list."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")  # strip spaces and line breaks
            if len(line) == 1:  # keep only single-character entries
                stop_word_list.append(line)
    return stop_word_list

II. Based on sklearn

import os
import jieba
import pickle
from root_path import root
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """Read the labels and sentences from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")  # strip line breaks
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        feature = tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        self.predict(sentence)

if __name__ == '__main__':
    TfIdf().main()
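
Unlike the gensim version, the sklearn predict above only returns the sentence's TF-IDF vector; a similarity measure still has to be applied to compare two sentences. Here is a minimal sketch using cosine similarity over the saved vectorizer; the helper name sentence_similarity is my own, and the example sentences are only placeholders.

import pickle

import jieba
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(model_path, sentence_a, sentence_b):
    """Cosine similarity of two sentences under the saved TF-IDF vectorizer."""
    with open(model_path, 'rb') as f:
        tfidf_vectorizer = pickle.load(f)
    # Segment with jieba and join with spaces, matching how the vectorizer was trained.
    docs = [" ".join(jieba.cut(sentence_a)), " ".join(jieba.cut(sentence_b))]
    vectors = tfidf_vectorizer.transform(docs)
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

# Example usage with placeholder sentences:
# sim = sentence_similarity(TfIdf().model_path, "我有困难还不了", "我现在没有能力还款")
# print(sim)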
