KNN Algorithm Source Code

Abstract:
End-to-end source code for Chinese text classification with KNN: export articles from Excel into per-category txt files, segment them with jieba and strip stopwords, merge the results into a training set, then vectorize with TF-IDF and classify with scikit-learn's KNeighborsClassifier.
import pandas as pd
import os


def creatcatesdir(data, target):
    # Get the de-duplicated list of categories
    cates = list(data['channelName'].unique())
    # Print the categories
    print(cates)
    # Create one folder per category
    for cate in cates:
        # Build the sub-directory path
        final_path = target + cate
        try:
            os.mkdir(final_path)  # create the directory
        except Exception as e:
            print(str(e))


def excel2txt(data, target):
    # Create the category directories
    creatcatesdir(data, target)
    # Iterate over the Excel rows one by one
    for index, row in data.iterrows():
        # Article body
        content = row['content']
        # File name -> article id
        filename = row['id']
        # Article title
        title = row['title']
        # Sub-directory -> category
        cate = row['channelName']
        # Build the txt file path
        txt_path = target + cate + os.sep
        # Write the title and body into the txt file
        with open(txt_path + str(filename) + ".txt", encoding='utf-8', mode='wt') as f:
            f.write(str(title) + str(content))


def main():
    # Read the Excel workbook with pandas
    targetfile = "../article/"
    # Sheet indices to read
    sheets = [1, 2, 3, 4, 5, 6, 7, 8]
    # Walk every sheet and dump its rows to txt files
    for sheet in sheets:
        data = pd.read_excel('1.xlsx', sheet_name=sheet)
        excel2txt(data, targetfile)


if __name__ == '__main__':
    main()
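Two things are worth checking before running this script: os.mkdir only creates the leaf directory, so the ../article/ parent must already exist, and every sheet must carry the id, title, content and channelName columns used above. A minimal pre-flight sketch (the makedirs call and nrows preview are my additions, not part of the original):

# Hypothetical pre-flight check for the export script above
import os

import pandas as pd

os.makedirs("../article", exist_ok=True)  # ensure the target directory exists
preview = pd.read_excel('1.xlsx', sheet_name=1, nrows=3)  # peek at the first sheet used
print(preview.columns.tolist())  # expect 'id', 'title', 'content', 'channelName'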


****************************************************************************************
# encoding=utf-8
# Walk the article directories and process every file with ProsessofWords
import os

import jieba


def EnumPathFiles(path, callback, stop_words_list):
    if not os.path.isdir(path):
        print('Error: "' + path + '" is not a directory or does not exist.')
        return
    # os.walk already descends into sub-directories,
    # so every file is visited exactly once
    for root, dirs, files in os.walk(path):
        for d in dirs:
            print(d)
        for f in files:
            callback(root, f, stop_words_list)


def ProsessofWords(textpath, stop_words_list):
    # Read the raw article text
    with open(textpath, 'r', encoding='utf-8') as f:
        text = f.read()
    # Segment with jieba (precise mode) and drop stopwords
    outstr = ''
    seg_list = jieba.cut(text, cut_all=False)
    for word in seg_list:
        if word not in stop_words_list:
            if word != ' ':
                outstr += word
                outstr += " "
    # Overwrite the file with the space-separated tokens
    with open(textpath, 'w+', encoding='utf-8') as f:
        f.write(outstr)


def callback1(path, filename, stop_words_list):
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)


if __name__ == '__main__':
    # Load the stopword list, one entry per line
    stopwords_file = "../stopword/stopword.txt"
    stop_words = list()
    with open(stopwords_file, "r", encoding='utf-8') as stop_f:
        for line in stop_f.readlines():
            line = line.strip()
            if not len(line):
                continue
            stop_words.append(line)
    print(len(stop_words))

    EnumPathFiles(r'../article', callback1, stop_words)
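For reference, this is roughly what the segmentation step does to each file; a standalone snippet using jieba's precise mode (the example sentence comes from jieba's README, not from the dataset):

# Demo of jieba.cut in precise mode (cut_all=False), as used in ProsessofWords
import jieba

print("/ ".join(jieba.cut("我来到北京清华大学", cut_all=False)))
# Default-dictionary output: 我/ 来到/ 北京/ 清华大学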

****************************************************************************
# encoding=utf-8
import os


def merge_file(path):
    files = os.listdir(path)
    print(files)
    # Map each category folder name to a numeric label
    label_dict = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
    outfile_train = '../dataset_train/x_train.txt'
    outfile_label = '../dataset_train/y_train.txt'
    result_train = open(outfile_train, 'a', encoding='utf-8')
    result_label = open(outfile_label, 'a', encoding='utf-8')
    for file in files:
        text_dir = os.path.join(path, file)
        if not os.path.isdir(text_dir):
            continue  # skip the output txt files themselves on re-runs
        texts = os.listdir(text_dir)
        for text in texts:
            txt_file_dir = os.path.join(text_dir, text)
            print(txt_file_dir)
            with open(txt_file_dir, 'r', encoding='utf-8') as f:
                content = f.read().replace('\n', ' ')  # keep each sample on one line
            if len(content) > 3000:
                content = content[0:3000]  # truncate overly long articles
            result_train.write(content + '\n')  # merge: one sample per line
            result_label.write(label_dict[file] + '\n')  # matching label per line
    result_label.close()
    result_train.close()


if __name__ == "__main__":
    path = r"../dataset_train"
    merge_file(path)
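The evaluation script further down reads ../dataset_test/x_test.txt and ../dataset_test/y_test.txt, which the post never shows being built; presumably they are merged the same way. A minimal sketch under that assumption (the merge_test name and the dataset_test layout mirroring dataset_train are my own):

# Hypothetical test-set merge, assuming ../dataset_test holds the same
# six category folders as ../dataset_train
import os


def merge_test(path, x_out='../dataset_test/x_test.txt', y_out='../dataset_test/y_test.txt'):
    label_dict = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
    with open(x_out, 'a', encoding='utf-8') as fx, open(y_out, 'a', encoding='utf-8') as fy:
        for folder in os.listdir(path):
            folder_dir = os.path.join(path, folder)
            if not os.path.isdir(folder_dir):
                continue  # skip the output files themselves
            for name in os.listdir(folder_dir):
                with open(os.path.join(folder_dir, name), 'r', encoding='utf-8') as f:
                    content = f.read().replace('\n', ' ')[:3000]
                fx.write(content + '\n')
                fy.write(label_dict[folder] + '\n')


merge_test(r'../dataset_test')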

*********************************************************************************
# coding:utf-8
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

VECTOR_DIR = 'vectors.bin'

MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200
TEST_SPLIT = 0.2

# Load the data: one sample (or label) per line
print('(1) load texts...')
train_texts = open('../dataset_train/x_train.txt', encoding='utf-8').read().split('\n')
train_labels = open('../dataset_train/y_train.txt', encoding='utf-8').read().split('\n')
test_texts = open('../dataset_test/x_test.txt', encoding='utf-8').read().split('\n')
test_labels = open('../dataset_test/y_test.txt', encoding='utf-8').read().split('\n')
all_text = train_texts + test_texts

# Feature extraction
print('(2) doc to var...')

# Build the vocabulary on the full corpus, then count train and test against it
count_v0 = CountVectorizer()
counts_all = count_v0.fit_transform(all_text)
count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_train = count_v1.fit_transform(train_texts)
print("the shape of train is " + repr(counts_train.shape))
count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test_texts)
print("the shape of test is " + repr(counts_test.shape))

# Fit TF-IDF weights on the training counts only, then apply them to both sets
tfidftransformer = TfidfTransformer()
train_data = tfidftransformer.fit_transform(counts_train)
test_data = tfidftransformer.transform(counts_test)

x_train = train_data
y_train = train_labels
x_test = test_data
y_test = test_labels

# Fit and evaluate a KNN model for a range of K values
print('(3) KNN...')
for x in range(1, 15):
    knnclf = KNeighborsClassifier(n_neighbors=x)
    knnclf.fit(x_train, y_train)
    preds = knnclf.predict(x_test)
    num = 0
    preds = preds.tolist()
    for i, pred in enumerate(preds):
        if int(pred) == int(y_test[i]):
            num += 1
    print('K= ' + str(x) + ', accuracy: ' + str(float(num) / len(preds)))
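For comparison, the same K sweep can be written with scikit-learn's built-in accuracy metric; a minimal sketch assuming x_train, y_train, x_test and y_test are built exactly as above:

# Equivalent evaluation loop using sklearn.metrics.accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 15):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(x_train, y_train)
    print('K=%d, accuracy: %.4f' % (k, accuracy_score(y_test, clf.predict(x_test))))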








