KNN Algorithm Source Code

Abstract:
End-to-end source code for Chinese text classification with KNN: export articles from Excel into per-category txt files, segment them with jieba and strip stopwords, merge the results into a training set, then vectorize with TF-IDF and classify with scikit-learn's KNeighborsClassifier.
import pandas as pd
import os


def creatcatesdir(data, target):
    # Get the de-duplicated list of categories
    cates = list(data['channelName'].unique())
    # Print the categories
    print(cates)
    # Create one folder per category
    for cate in cates:
        # Build the sub-directory path
        final_path = target + cate
        try:
            os.mkdir(final_path)  # create the directory
        except Exception as e:
            print(str(e))


def excel2txt(data, target):
    # Create the category directories
    creatcatesdir(data, target)
    # Iterate over the Excel rows one by one
    for index, row in data.iterrows():
        # Article body
        content = row['content']
        # File name -> article id
        filename = row['id']
        # Article title
        title = row['title']
        # Sub-directory -> category
        cate = row['channelName']
        # Build the txt file path
        txt_path = target + cate + os.sep
        # Write the title and body into the txt file
        with open(txt_path + str(filename) + ".txt", encoding='utf-8', mode='wt') as f:
            f.write(str(title) + str(content))


def main():
    # Read the Excel workbook with pandas
    targetfile = "../article/"
    # Sheet indices to read
    sheets = [1, 2, 3, 4, 5, 6, 7, 8]
    # Walk every sheet and dump its rows to txt files
    for sheet in sheets:
        data = pd.read_excel('1.xlsx', sheet_name=sheet)
        excel2txt(data, targetfile)


if __name__ == '__main__':
    main()
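Two things are worth checking before running this script: os.mkdir only creates the leaf directory, so the ../article/ parent must already exist, and every sheet must carry the id, title, content and channelName columns used above. A minimal pre-flight sketch (the makedirs call and nrows preview are my additions, not part of the original):

# Hypothetical pre-flight check for the export script above
import os

import pandas as pd

os.makedirs("../article", exist_ok=True)  # ensure the target directory exists
preview = pd.read_excel('1.xlsx', sheet_name=1, nrows=3)  # peek at the first sheet used
print(preview.columns.tolist())  # expect 'id', 'title', 'content', 'channelName'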


****************************************************************************************
# encoding=utf-8
# Walk the article directories and process every file with ProsessofWords
import os

import jieba


def EnumPathFiles(path, callback, stop_words_list):
    if not os.path.isdir(path):
        print('Error: "' + path + '" is not a directory or does not exist.')
        return
    # os.walk already descends into sub-directories,
    # so every file is visited exactly once
    for root, dirs, files in os.walk(path):
        for d in dirs:
            print(d)
        for f in files:
            callback(root, f, stop_words_list)


def ProsessofWords(textpath, stop_words_list):
    # Read the raw article text
    with open(textpath, 'r', encoding='utf-8') as f:
        text = f.read()
    # Segment with jieba (precise mode) and drop stopwords
    outstr = ''
    seg_list = jieba.cut(text, cut_all=False)
    for word in seg_list:
        if word not in stop_words_list:
            if word != ' ':
                outstr += word
                outstr += " "
    # Overwrite the file with the space-separated tokens
    with open(textpath, 'w+', encoding='utf-8') as f:
        f.write(outstr)


def callback1(path, filename, stop_words_list):
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)


if __name__ == '__main__':
    # Load the stopword list, one entry per line
    stopwords_file = "../stopword/stopword.txt"
    stop_words = list()
    with open(stopwords_file, "r", encoding='utf-8') as stop_f:
        for line in stop_f.readlines():
            line = line.strip()
            if not len(line):
                continue
            stop_words.append(line)
    print(len(stop_words))

    EnumPathFiles(r'../article', callback1, stop_words)
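For reference, this is roughly what the segmentation step does to each file; a standalone snippet using jieba's precise mode (the example sentence comes from jieba's README, not from the dataset):

# Demo of jieba.cut in precise mode (cut_all=False), as used in ProsessofWords
import jieba

print("/ ".join(jieba.cut("我来到北京清华大学", cut_all=False)))
# Default-dictionary output: 我/ 来到/ 北京/ 清华大学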

****************************************************************************
# encoding=utf-8
import os


def merge_file(path):
    files = os.listdir(path)
    print(files)
    # Map each category folder name to a numeric label
    label_dict = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
    outfile_train = '../dataset_train/x_train.txt'
    outfile_label = '../dataset_train/y_train.txt'
    result_train = open(outfile_train, 'a', encoding='utf-8')
    result_label = open(outfile_label, 'a', encoding='utf-8')
    for file in files:
        text_dir = os.path.join(path, file)
        if not os.path.isdir(text_dir):
            continue  # skip the output txt files themselves on re-runs
        texts = os.listdir(text_dir)
        for text in texts:
            txt_file_dir = os.path.join(text_dir, text)
            print(txt_file_dir)
            with open(txt_file_dir, 'r', encoding='utf-8') as f:
                content = f.read().replace('\n', ' ')  # keep each sample on one line
            if len(content) > 3000:
                content = content[0:3000]  # truncate overly long articles
            result_train.write(content + '\n')  # merge: one sample per line
            result_label.write(label_dict[file] + '\n')  # matching label per line
    result_label.close()
    result_train.close()


if __name__ == "__main__":
    path = r"../dataset_train"
    merge_file(path)
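The evaluation script further down reads ../dataset_test/x_test.txt and ../dataset_test/y_test.txt, which the post never shows being built; presumably they are merged the same way. A minimal sketch under that assumption (the merge_test name and the dataset_test layout mirroring dataset_train are my own):

# Hypothetical test-set merge, assuming ../dataset_test holds the same
# six category folders as ../dataset_train
import os


def merge_test(path, x_out='../dataset_test/x_test.txt', y_out='../dataset_test/y_test.txt'):
    label_dict = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
    with open(x_out, 'a', encoding='utf-8') as fx, open(y_out, 'a', encoding='utf-8') as fy:
        for folder in os.listdir(path):
            folder_dir = os.path.join(path, folder)
            if not os.path.isdir(folder_dir):
                continue  # skip the output files themselves
            for name in os.listdir(folder_dir):
                with open(os.path.join(folder_dir, name), 'r', encoding='utf-8') as f:
                    content = f.read().replace('\n', ' ')[:3000]
                fx.write(content + '\n')
                fy.write(label_dict[folder] + '\n')


merge_test(r'../dataset_test')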

*********************************************************************************
# coding:utf-8
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

VECTOR_DIR = 'vectors.bin'

MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200
TEST_SPLIT = 0.2

# Load the data: one sample (or label) per line
print('(1) load texts...')
train_texts = open('../dataset_train/x_train.txt', encoding='utf-8').read().split('\n')
train_labels = open('../dataset_train/y_train.txt', encoding='utf-8').read().split('\n')
test_texts = open('../dataset_test/x_test.txt', encoding='utf-8').read().split('\n')
test_labels = open('../dataset_test/y_test.txt', encoding='utf-8').read().split('\n')
all_text = train_texts + test_texts

# Feature extraction
print('(2) doc to var...')

# Build the vocabulary on the full corpus, then count train and test against it
count_v0 = CountVectorizer()
counts_all = count_v0.fit_transform(all_text)
count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_train = count_v1.fit_transform(train_texts)
print("the shape of train is " + repr(counts_train.shape))
count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test_texts)
print("the shape of test is " + repr(counts_test.shape))

# Fit TF-IDF weights on the training counts only, then apply them to both sets
tfidftransformer = TfidfTransformer()
train_data = tfidftransformer.fit_transform(counts_train)
test_data = tfidftransformer.transform(counts_test)

x_train = train_data
y_train = train_labels
x_test = test_data
y_test = test_labels

# Fit and evaluate a KNN model for a range of K values
print('(3) KNN...')
for x in range(1, 15):
    knnclf = KNeighborsClassifier(n_neighbors=x)
    knnclf.fit(x_train, y_train)
    preds = knnclf.predict(x_test)
    num = 0
    preds = preds.tolist()
    for i, pred in enumerate(preds):
        if int(pred) == int(y_test[i]):
            num += 1
    print('K= ' + str(x) + ', accuracy: ' + str(float(num) / len(preds)))
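For comparison, the same K sweep can be written with scikit-learn's built-in accuracy metric; a minimal sketch assuming x_train, y_train, x_test and y_test are built exactly as above:

# Equivalent evaluation loop using sklearn.metrics.accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 15):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(x_train, y_train)
    print('K=%d, accuracy: %.4f' % (k, accuracy_score(y_test, clf.predict(x_test))))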








