NLP (23): Computing Sentence Vectors with TF-IDF and Measuring Similarity

Abstract: two ways to turn sentences into TF-IDF vectors and compute their similarity, one based on gensim and one based on sklearn.

I. Based on gensim

1. The model class

import os
import jieba
import pickle
import logging
import numpy as np
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq

class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()

        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")


    def del_stopwords(self, words):
        """Remove stop words from a tokenized sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment a list of sentences into words (with jieba) or into single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train the model."""
        # Build and save the corpus dictionary
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)

        # Build the TF-IDF model
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)

        # Build the similarity index used for retrieval
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Turn the sentence into a TF-IDF vector and return its similarity scores
        # against the training corpus (the index was built from word_list).
        dic = corpora.Dictionary.load(self.dic_path)
        words = sentence
        word_bow = dic.doc2bow(self._seg_word([words])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the labels and sentences from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")  # strip line breaks
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")

        # Indices of the 30 most similar training sentences
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

        # The corresponding similarity scores
        print(heapq.nlargest(30, score_list))



if __name__ == '__main__':
    TfIdf().main()
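
The scores returned by predict are aligned with the order of the training sentences, so the indices picked out by heapq.nlargest can be mapped straight back to the most similar sentences. Below is a minimal retrieval sketch built on the class above; the helper name most_similar and the top_k parameter are my own additions for illustration.

import heapq

def most_similar(model, sentences, query, top_k=3):
    """Return the top_k training sentences most similar to the query, with their scores."""
    scores = model.predict(query)  # one similarity score per training sentence
    top_idx = heapq.nlargest(top_k, range(len(scores)), scores.__getitem__)
    return [(sentences[i], float(scores[i])) for i in top_idx]

# Example usage, assuming the model has already been trained on the same sentences:
# tfidf = TfIdf()
# labels, sentences = tfidf.get_train_data()
# tfidf.train(sentences)
# print(most_similar(tfidf, sentences, "我有困难还不了"))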

2. The utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """Build the stop word list."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")  # strip spaces and line breaks
            if len(line) == 1:  # keep only single-character entries
                stop_word_list.append(line)
    return stop_word_list

II. Based on sklearn

import os
import jieba
import pickle
from root_path import root
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdf(object):
    """Compute sentence similarity with a TF-IDF model."""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """Read the labels and sentences from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")  # strip line breaks
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        feature = tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        self.predict(sentence)

if __name__ == '__main__':
    TfIdf().main()
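
Unlike the gensim version, the sklearn predict above only returns the sentence's TF-IDF vector; a similarity measure still has to be applied to compare two sentences. Here is a minimal sketch using cosine similarity over the saved vectorizer; the helper name sentence_similarity is my own, and the example sentences are only placeholders.

import pickle

import jieba
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(model_path, sentence_a, sentence_b):
    """Cosine similarity of two sentences under the saved TF-IDF vectorizer."""
    with open(model_path, 'rb') as f:
        tfidf_vectorizer = pickle.load(f)
    # Segment with jieba and join with spaces, matching how the vectorizer was trained.
    docs = [" ".join(jieba.cut(sentence_a)), " ".join(jieba.cut(sentence_b))]
    vectors = tfidf_vectorizer.transform(docs)
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

# Example usage with placeholder sentences:
# sim = sentence_similarity(TfIdf().model_path, "我有困难还不了", "我现在没有能力还款")
# print(sim)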
