【CUDA并行编程之八】Cuda实现Kmeans算法

摘要：

本文主要介绍如何使用CUDA并行计算框架编程实现机器学习中的Kmeans算法，Kmeans算法的详细介绍在这里，本文重点在并行实现的过程。free_memory()：释放内存空间~KMEANS()：析构函数并行的代码主要三个函数：find_nearest_cluster(...)compute_delta(...)euclid_dist_2(...)首先看一下函数euclid_dist_2(...)：[cpp]viewplaincopy__host____device__inlinestaticfloateuclid_dist_2{inti;floatans=0;for{ans+=*;}returnans;}这段代码实际上就是并行的计算向量objects[objectId]和clusters[clusterId]之间的距离，即第objectId个数据点到第clusterId个中心点的距离。

本文主要介绍如何使用CUDA并行计算框架编程实现机器学习中的Kmeans算法，Kmeans算法的详细介绍在这里，本文重点在并行实现的过程。

当然还是简单的回顾一下kmeans算法的串行过程：

伪代码：

[cpp]view plaincopy
创建k个点作为起始质心(经常是随机选择)
当任意一个点的簇分配结果发生改变时
对数据集中的每个数据点
对每个质心
计算质心与数据点之间的距离
将数据点分配到距其最近的簇
对每一个簇，计算簇中所有点的均值并将均值作为质心

我们可以观察到有两个部分可以并行优化：

①line03-04：将每个数据点到多个质心的距离计算进行并行化

②line05：将数据点到某个执行的距离计算进行并行化

KMEANS类：

[cpp]view plaincopy
classKMEANS
{
private:
intnumClusters;
intnumCoords;
intnumObjs;
int*membership;//[numObjs]
char*filename;
float**objects;//[numObjs][numCoords]dataobjects
float**clusters;//[numClusters][unmCoords]clustercenter
floatthreshold;
intloop_iterations;
public:
KMEANS(intk);
voidfile_read(char*fn);
voidfile_write();
voidcuda_kmeans();
inlineintnextPowerOfTwo(intn);
voidfree_memory();
virtual~KMEANS();
};//KMEANS

成员变量：

numClusters：中心点的个数

numCoords：每个数据点的维度

numObjs：数据点的个数

membership：每个数据点所属类别的数组，维度为numObjs

filename：读入的文件名

objects：所有数据点，维度为[numObjs][numCoords]

clusters：中心点数据，维度为[numObjs][numCoords]

threshold：控制循环次数的一个域值

loop_iterations：循环的迭代次数

成员函数：

KMEANS(int k)：含参构造函数。初始化成员变量

file_read(char *fn)：读入文件数据并初始化object以及membership变量

file_write()：将计算结果写回到结果文件中去

cuda_kmeans()：kmeans计算的入口函数

nextPowerOfTwo(int n)：它计算大于等于输入参数n的第一个2的幂次数。

free_memory()：释放内存空间

~KMEANS()：析构函数

并行的代码主要三个函数：

find_nearest_cluster(...)

compute_delta(...)

euclid_dist_2(...)

首先看一下函数euclid_dist_2(...)：

[cpp]view plaincopy
__host____device__inlinestatic
floateuclid_dist_2(intnumCoords,intnumObjs,intnumClusters,float*objects,float*clusters,intobjectId,intclusterId)
{
inti;
floatans=0;
for(i=0;i<numCoords;i++)
{
ans+=(objects[numObjs*i+objectId]-clusters[numClusters*i+clusterId])*
(objects[numObjs*i+objectId]-clusters[numClusters*i+clusterId]);
}
returnans;
}

这段代码实际上就是并行的计算向量objects[objectId]和clusters[clusterId]之间的距离，即第objectId个数据点到第clusterId个中心点的距离。

再看一下函数compute_delta(...)：

[cpp]view plaincopy
/*
*numIntermediates:Theactualnumberofintermediates
*numIntermediates2:Thenextpoweroftwo
*/
__global__staticvoidcompute_delta(int*deviceIntermediates,intnumIntermediates,intnumIntermediates2)
{
extern__shared__unsignedintintermediates[];
intermediates[threadIdx.x]=(threadIdx.x<numIntermediates)?deviceIntermediates[threadIdx.x]:0;
__syncthreads();
//numIntermediates2*must*beapoweroftwo!
for(unsignedints=numIntermediates2/2;s>0;s>>=1)
{
if(threadIdx.x<s)
{
intermediates[threadIdx.x]+=intermediates[threadIdx.x+s];
}
__syncthreads();
}
if(threadIdx.x==0)
{
deviceIntermediates[0]=intermediates[0];
}
}

这段代码的意义就是将一个线程块中每个线程的对应的intermediates的数据求和最后放到deviceIntermediates[0]中去然后拷贝回主存块中去。这个问题的更好的解释在这里，实际上就是一个数组求和的问题，应用在这里求得的是有改变的membership中所有数据的和，即改变了簇的点的个数。

最后再看函数finid_nearest_cluster(...)：

[cpp]view plaincopy
/*
*objects:[numCoords][numObjs]
*deviceClusters:[numCoords][numClusters]
*membership:[numObjs]
*/
__global__staticvoidfind_nearest_cluster(intnumCoords,intnumObjs,intnumClusters,float*objects,float*deviceClusters,int*membership,int*intermediates)
{
extern__shared__charsharedMemory[];
unsignedchar*membershipChanged=(unsignedchar*)sharedMemory;
float*clusters=deviceClusters;
membershipChanged[threadIdx.x]=0;
intobjectId=blockDim.x*blockIdx.x+threadIdx.x;
if(objectId<numObjs)
{
intindex;
floatdist,min_dist;
/*findtheclusteridthathasmindistancetoobject*/
index=0;
min_dist=euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0);
for(inti=0;i<numClusters;i++)
{
dist=euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i);
/*noneedsquareroot*/
if(dist<min_dist)
{
min_dist=dist;
index=i;
}
}
if(membership[objectId]!=index)
{
membershipChanged[threadIdx.x]=1;
}
//assignthemembershiptoobjectobjectId
membership[objectId]=index;
__syncthreads();//formembershipChanged[]
#if1
//blockDim.x*must*beapoweroftwo!
for(unsignedints=blockDim.x/2;s>0;s>>=1)
{
if(threadIdx.x<s)
{
membershipChanged[threadIdx.x]+=membershipChanged[threadIdx.x+s];//calculateallchangedvaluesandsaveresulttomembershipChanged[0]
}
__syncthreads();
}
if(threadIdx.x==0)
{
intermediates[blockIdx.x]=membershipChanged[0];
}
#endif
}
}//find_nearest_cluster

这个函数计算的就是第objectId个数据点到numClusters个中心点的距离，然后根据情况比较更新membership。

这三个函数将所有能够并行的地方都进行了并行，实现了整体算法的并行化~

在此呈上全部代码：

kmeans.h:

[cpp]view plaincopy
#ifndef_H_KMEANS
#define_H_KMEANS
#include<assert.h>
#definemalloc2D(name,xDim,yDim,type)do{
name=(type**)malloc(xDim*sizeof(type*));
assert(name!=NULL);
name[0]=(type*)malloc(xDim*yDim*sizeof(type));
assert(name[0]!=NULL);
for(size_ti=1;i<xDim;i++)
name[i]=name[i-1]+yDim;
}while(0)
doublewtime(void);
#endif

wtime.cu:

[cpp]view plaincopy
#include<sys/time.h>
#include<stdio.h>
#include<stdlib.h>
doublewtime(void)
{
doublenow_time;
structtimevaletstart;
structtimezonetzp;
if(gettimeofday(&etstart,&tzp)==-1)
perror("Error:callinggettimeofday()notsuccessful. ");
now_time=((double)etstart.tv_sec)+/*inseconds*/
((double)etstart.tv_usec)/1000000.0;/*inmicroseconds*/
returnnow_time;
}

cuda_kmeans.cu：

[cpp]view plaincopy
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<sys/types.h>
#include<sys/stat.h>
#include<unistd.h>
#include<iostream>
#include<cassert>
#include"kmeans.h"
usingnamespacestd;
constintMAX_CHAR_PER_LINE=1024;
classKMEANS
{
private:
intnumClusters;
intnumCoords;
intnumObjs;
int*membership;//[numObjs]
char*filename;
float**objects;//[numObjs][numCoords]dataobjects
float**clusters;//[numClusters][unmCoords]clustercenter
floatthreshold;
intloop_iterations;
public:
KMEANS(intk);
voidfile_read(char*fn);
voidfile_write();
voidcuda_kmeans();
inlineintnextPowerOfTwo(intn);
voidfree_memory();
virtual~KMEANS();
};
KMEANS::~KMEANS()
{
free(membership);
free(clusters[0]);
free(clusters);
free(objects[0]);
free(objects);
}
KMEANS::KMEANS(intk)
{
threshold=0.001;
numObjs=0;
numCoords=0;
numClusters=k;
filename=NULL;
loop_iterations=0;
}
voidKMEANS::file_write()
{
FILE*fptr;
charoutFileName[1024];
//output:thecoordinatesoftheclustercentres
sprintf(outFileName,"%s.cluster_centres",filename);
printf("WritingcoordinatesofK=%dclustercenterstofile"%s" ",numClusters,outFileName);
fptr=fopen(outFileName,"w");
for(inti=0;i<numClusters;i++)
{
fprintf(fptr,"%d",i);
for(intj=0;j<numCoords;j++)
fprintf(fptr,"%f",clusters[i][j]);
fprintf(fptr," ");
}
fclose(fptr);
//output:theclosestclustercentretoeachofthedatapoints
sprintf(outFileName,"%s.membership",filename);
printf("writingmembershipofN=%ddataobjectstofile"%s" ",numObjs,outFileName);
fptr=fopen(outFileName,"w");
for(inti=0;i<numObjs;i++)
{
fprintf(fptr,"%d%d ",i,membership[i]);
}
fclose(fptr);
}
inlineintKMEANS::nextPowerOfTwo(intn)
{
n--;
n=n>>1|n;
n=n>>2|n;
n=n>>4|n;
n=n>>8|n;
n=n>>16|n;
//n=n>>32|n;//for64-bitints
return++n;
}
__host____device__inlinestatic
floateuclid_dist_2(intnumCoords,intnumObjs,intnumClusters,float*objects,float*clusters,intobjectId,intclusterId)
{
inti;
floatans=0;
for(i=0;i<numCoords;i++)
{
ans+=(objects[numObjs*i+objectId]-clusters[numClusters*i+clusterId])*
(objects[numObjs*i+objectId]-clusters[numClusters*i+clusterId]);
}
returnans;
}
/*
*numIntermediates:Theactualnumberofintermediates
*numIntermediates2:Thenextpoweroftwo
*/
__global__staticvoidcompute_delta(int*deviceIntermediates,intnumIntermediates,intnumIntermediates2)
{
extern__shared__unsignedintintermediates[];
intermediates[threadIdx.x]=(threadIdx.x<numIntermediates)?deviceIntermediates[threadIdx.x]:0;
__syncthreads();
//numIntermediates2*must*beapoweroftwo!
for(unsignedints=numIntermediates2/2;s>0;s>>=1)
{
if(threadIdx.x<s)
{
intermediates[threadIdx.x]+=intermediates[threadIdx.x+s];
}
__syncthreads();
}
if(threadIdx.x==0)
{
deviceIntermediates[0]=intermediates[0];
}
}
/*
*objects:[numCoords][numObjs]
*deviceClusters:[numCoords][numClusters]
*membership:[numObjs]
*/
__global__staticvoidfind_nearest_cluster(intnumCoords,intnumObjs,intnumClusters,float*objects,float*deviceClusters,int*membership,int*intermediates)
{
extern__shared__charsharedMemory[];
unsignedchar*membershipChanged=(unsignedchar*)sharedMemory;
float*clusters=deviceClusters;
membershipChanged[threadIdx.x]=0;
intobjectId=blockDim.x*blockIdx.x+threadIdx.x;
if(objectId<numObjs)
{
intindex;
floatdist,min_dist;
/*findtheclusteridthathasmindistancetoobject*/
index=0;
min_dist=euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,0);
for(inti=0;i<numClusters;i++)
{
dist=euclid_dist_2(numCoords,numObjs,numClusters,objects,clusters,objectId,i);
/*noneedsquareroot*/
if(dist<min_dist)
{
min_dist=dist;
index=i;
}
}
if(membership[objectId]!=index)
{
membershipChanged[threadIdx.x]=1;
}
//assignthemembershiptoobjectobjectId
membership[objectId]=index;
__syncthreads();//formembershipChanged[]
#if1
//blockDim.x*must*beapoweroftwo!
for(unsignedints=blockDim.x/2;s>0;s>>=1)
{
if(threadIdx.x<s)
{
membershipChanged[threadIdx.x]+=membershipChanged[threadIdx.x+s];//calculateallchangedvaluesandsaveresulttomembershipChanged[0]
}
__syncthreads();
}
if(threadIdx.x==0)
{
intermediates[blockIdx.x]=membershipChanged[0];
}
#endif
}
}//find_nearest_cluster
voidKMEANS::cuda_kmeans()
{
intindex,loop=0;
int*newClusterSize;//[numClusters]:no.objectsassignedineachnewcluster
floatdelta;//%ofobjectschangestheirclusters
float**dimObjects;//[numCoords][numObjs]
float**dimClusters;
float**newClusters;//[numCoords][numClusters]
float*deviceObjects;//[numCoords][numObjs]
float*deviceClusters;//[numCoords][numclusters]
int*deviceMembership;
int*deviceIntermediates;
//Copyobjectsgivenin[numObjs][numCoords]layouttonew[numCoords][numObjs]layout
malloc2D(dimObjects,numCoords,numObjs,float);
for(inti=0;i<numCoords;i++)
{
for(intj=0;j<numObjs;j++)
{
dimObjects[i][j]=objects[j][i];
}
}
//pickfirstnumClusterselementsofobjects[]asinitialclustercenters
malloc2D(dimClusters,numCoords,numClusters,float);
for(inti=0;i<numCoords;i++)
{
for(intj=0;j<numClusters;j++)
{
dimClusters[i][j]=dimObjects[i][j];
}
}
newClusterSize=newint[numClusters];
assert(newClusterSize!=NULL);
malloc2D(newClusters,numCoords,numClusters,float);
memset(newClusters[0],0,numCoords*numClusters*sizeof(float));
//Tosupportreduction,numThreadsPerClusterBlock*must*beapoweroftwo,andit*must*benolargerthanthenumberofbitsthatwillfitintoanunsignedchar,thetypeusedtokeeptrackofmembershipchangesinthekernel.
constunsignedintnumThreadsPerClusterBlock=32;
constunsignedintnumClusterBlocks=(numObjs+numThreadsPerClusterBlock-1)/numThreadsPerClusterBlock;
constunsignedintnumReductionThreads=nextPowerOfTwo(numClusterBlocks);
constunsignedintclusterBlockSharedDataSize=numThreadsPerClusterBlock*sizeof(unsignedchar);
constunsignedintreductionBlockSharedDataSize=numReductionThreads*sizeof(unsignedint);
cudaMalloc(&deviceObjects,numObjs*numCoords*sizeof(float));
cudaMalloc(&deviceClusters,numClusters*numCoords*sizeof(float));
cudaMalloc(&deviceMembership,numObjs*sizeof(int));
cudaMalloc(&deviceIntermediates,numReductionThreads*sizeof(unsignedint));
cudaMemcpy(deviceObjects,dimObjects[0],numObjs*numCoords*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceMembership,membership,numObjs*sizeof(int),cudaMemcpyHostToDevice);
do
{
cudaMemcpy(deviceClusters,dimClusters[0],numClusters*numCoords*sizeof(float),cudaMemcpyHostToDevice);
find_nearest_cluster<<<numClusterBlocks,numThreadsPerClusterBlock,clusterBlockSharedDataSize>>>(numCoords,numObjs,numClusters,deviceObjects,deviceClusters,deviceMembership,deviceIntermediates);
cudaDeviceSynchronize();
compute_delta<<<1,numReductionThreads,reductionBlockSharedDataSize>>>(deviceIntermediates,numClusterBlocks,numReductionThreads);
cudaDeviceSynchronize();
intd;
cudaMemcpy(&d,deviceIntermediates,sizeof(int),cudaMemcpyDeviceToHost);
delta=(float)d;
cudaMemcpy(membership,deviceMembership,numObjs*sizeof(int),cudaMemcpyDeviceToHost);
for(inti=0;i<numObjs;i++)
{
//findthearrayindexofnestest
index=membership[i];
//updatenewclustercenters:sumofobjectslocatedwithin
newClusterSize[index]++;
for(intj=0;j<numCoords;j++)
{
newClusters[j][index]+=objects[i][j];
}
}
//averagethesumandreplaceoldclustercenterswithnewClusters
for(inti=0;i<numClusters;i++)
{
for(intj=0;j<numCoords;j++)
{
if(newClusterSize[i]>0)
dimClusters[j][i]=newClusters[j][i]/newClusterSize[i];
newClusters[j][i]=0.0;//setbackto0
}
newClusterSize[i]=0;//setbackto0
}
delta/=numObjs;
}while(delta>threshold&&loop++<500);
loop_iterations=loop+1;
malloc2D(clusters,numClusters,numCoords,float);
for(inti=0;i<numClusters;i++)
{
for(intj=0;j<numCoords;j++)
{
clusters[i][j]=dimClusters[j][i];
}
}
cudaFree(deviceObjects);
cudaFree(deviceClusters);
cudaFree(deviceMembership);
cudaFree(deviceMembership);
free(dimObjects[0]);
free(dimObjects);
free(dimClusters[0]);
free(dimClusters);
free(newClusters[0]);
free(newClusters);
free(newClusterSize);
}
voidKMEANS::file_read(char*fn)
{
FILE*infile;
char*line=newchar[MAX_CHAR_PER_LINE];
intlineLen=MAX_CHAR_PER_LINE;
filename=fn;
infile=fopen(filename,"r");
assert(infile!=NULL);
/*findthenumberofobjects*/
while(fgets(line,lineLen,infile))
{
numObjs++;
}
/*findthedimensionofeachobject*/
rewind(infile);
while(fgets(line,lineLen,infile)!=NULL)
{
if(strtok(line," ")!=0)
{
while(strtok(NULL," "))
numCoords++;
break;
}
}
/*allocatespaceforobject[][]andreadallobjcet*/
rewind(infile);
objects=newfloat*[numObjs];
for(inti=0;i<numObjs;i++)
{
objects[i]=newfloat[numCoords];
}
inti=0;
/*readallobject*/
while(fgets(line,lineLen,infile)!=NULL)
{
if(strtok(line," ")==NULL)continue;
for(intj=0;j<numCoords;j++)
{
objects[i][j]=atof(strtok(NULL,", "));
}
i++;
}
/*membership:theclusteridforeachdataobject*/
membership=newint[numObjs];
assert(membership!=NULL);
for(inti=0;i<numObjs;i++)
membership[i]=-1;
}
intmain(intargc,char*argv[])
{
KMEANSkmeans(atoi(argv[1]));
kmeans.file_read(argv[2]);
kmeans.cuda_kmeans();
kmeans.file_write();
return0;
}

makefile：

[cpp]view plaincopy
target:
nvcccuda_kmeans.cu
./a.out4./Image_data/color100.txt

所有代码和文件数据在这里：http://yunpan.cn/cKBZMPAJ8tcAs（提取码：9476）

运行代码：

【CUDA并行编程之八】Cuda实现Kmeans算法第19张

kmeans的cuda实现代码相对复杂，在阅读的过程中可能会有困难，有问题请留言~

Author：忆之独秀

Email：leaguenew@qq.com

注明出处：http://blog.csdn.net/lavorange/article/details/41942323

【CUDA并行编程之八】Cuda实现Kmeans算法

相关文章

ws2s函数

C#中char[]与string之间的转换；byte[]与string之间的转化

C++获取MAC与IP

【转载】C/C++中的char，wchar，TCHAR

DAG优化

CAS原子操作实现无锁及性能分析

最新文章

随机推荐

思享工具箱导航

JSON工具

格式化转换

加解密编码

文本数字

网络

站长

计算

其他

对照列表