Naive Bayesian文本分类器

jopen 9年前

贝叶斯学习方法中实用性很高的一种为朴素贝叶斯学习器,常被称为朴素贝叶斯分类器。在某些领域中,其性能与神经网络和决策树学习相当。虽然朴素贝叶斯分类器忽略单词间的依赖关系,即假设所有单词是条件独立的,但朴素贝叶斯分类在实际应用中有很出色的表现。

朴素贝叶斯文本分类算法伪代码:


朴素贝叶斯文本分类算法流程:


通过计算训练集中每个类别的概率与不同类别下每个单词的概率,然后利用朴素贝叶斯公式计算新文档被分类为各个类别的概率,最终输出概率最大的类别。

C++源码:

/*
  Bayesian classifier for document classification
  15S103182
  Ethan
  2015.12.27
*/
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

/*
  Parses the leading integer of `a`.
  Returns 0 when `a` does not start with a valid integer, instead of an
  indeterminate value.
*/
int stringToInteger(string a){
    stringstream ss;
    ss << a;
    int b = 0;
    ss >> b;
    return b;
}

/*
  Reads every whitespace-separated token of the file `dataset` and appends it
  to `tokens`. Prints "Open File Failed!" and returns false when the file
  cannot be opened.

  The loop is driven by the extraction itself rather than by eof(): testing
  eof() before reading yields one extra, empty token whenever the file ends
  with whitespace.
*/
static bool readTokens(const char* dataset, vector<string>& tokens){
    ifstream file(dataset);
    if(!file){
        cout << "Open File Failed!" << endl;
        return false;
    }
    string token;
    while(file >> token){
        tokens.push_back(token);
    }
    return true;
}

/*
  Loads the class labels of the training documents: one integer per
  whitespace-separated token, in file order.
  Returns an empty vector when the file cannot be opened.
*/
vector<int> openClassificationFile(const char* dataset){
    vector<int> data;
    vector<string> tokens;
    if(!readTokens(dataset, tokens)){
        return data;
    }
    for(size_t i = 0; i < tokens.size(); ++i){
        data.push_back(stringToInteger(tokens[i]));
    }
    return data;
}

/*
  Loads one document as a list of whitespace-separated words and echoes the
  words to stdout, tab-separated, followed by a success message.
  Returns an empty vector when the file cannot be opened.
*/
vector<string> openFile(const char* dataset){
    vector<string> data;
    if(!readTokens(dataset, data)){
        return data;
    }
    for(size_t i = 0; i < data.size(); ++i) cout << data[i] << "\t";
    cout << endl;
    cout << "Open file successfully!" << endl;
    return data;
}

/*
  Loads every file named in `files` with openFile(), preserving order.
  A file that cannot be opened contributes an empty document.
*/
vector<vector<string> > openFiles(const vector<string>& files){
    vector<vector<string> > docs;
    for(size_t i = 0; i < files.size(); ++i){
        docs.push_back(openFile(files[i].c_str()));
    }
    return docs;
}

/*
  Original overload, kept so existing callers that pass vector<char*> still
  compile. Behaves like the vector<string> overload.
*/
vector<vector<string> > openFiles(const vector<char*> files){
    vector<vector<string> > docs;
    for(size_t i = 0; i < files.size(); ++i){
        docs.push_back(openFile(files[i]));
    }
    return docs;
}

/*
  Trains a multinomial naive Bayes model on `docs` and prints the word
  frequencies, the conditional probabilities of the words of `d`, the
  per-class probability of `d`, and finally the predicted class.

  docs : training documents, each a list of words
  c    : class label of each training document; c[i] belongs to docs[i],
         entries beyond docs.size() are ignored
  d    : the document to classify

  Labels may be arbitrary integers; only labels that actually occur in the
  training set are considered. Nothing is classified (an error is written to
  stderr) when there are no training documents, fewer labels than documents,
  or no training words at all.

  NOTE: the class prior is estimated as the share of training *words* that
  belong to the class, not the share of training documents.
*/
void bayesian(vector<vector<string> > docs,vector<int> c,vector<string> d){
    if(docs.empty()){
        cerr << "bayesian: no training documents" << endl;
        return;
    }
    if(c.size() < docs.size()){
        cerr << "bayesian: " << docs.size() << " training documents but only "
             << c.size() << " class labels" << endl;
        return;
    }

    map<string,int> wordFrequency;                  // occurrences of each word over all documents
    map<int,int> cTotalFrequency;                   // number of words per class
    map<int,map<string,int> > cWordTotalFrequency;  // occurrences of each word per class
    long totalWords = 0;

    for(size_t i = 0; i < docs.size(); ++i){
        const int label = c[i];
        totalWords += static_cast<long>(docs[i].size());
        cTotalFrequency[label] += static_cast<int>(docs[i].size());
        map<string,int>& classWords = cWordTotalFrequency[label];
        for(size_t j = 0; j < docs[i].size(); ++j){
            ++wordFrequency[docs[i][j]];
            ++classWords[docs[i][j]];
        }
    }
    if(totalWords == 0){
        cerr << "bayesian: training documents contain no words" << endl;
        return;
    }
    const double vocabularySize = static_cast<double>(wordFrequency.size());

    cout<<"Word Frequency:"<<endl;
    for(map<string,int>::const_iterator it = wordFrequency.begin(); it != wordFrequency.end(); ++it){
        cout<<setw(8)<<it->first<<"\tFrequency:"<<it->second<<endl;
    }

    cout<<"Conditional Probability:"<<endl;
    map<string,int> dtw;  // word counts of the document to classify
    for(size_t i = 0; i < d.size(); ++i) ++dtw[d[i]];

    // cp[word][class] = P(word | class) with add-one (Laplace) smoothing.
    map<string,map<int,double> > cp;
    for(map<string,int>::const_iterator w = dtw.begin(); w != dtw.end(); ++w){
        map<int,double>& perClass = cp[w->first];
        for(map<int,int>::const_iterator cls = cTotalFrequency.begin(); cls != cTotalFrequency.end(); ++cls){
            // Look the count up with find() so that unseen words do not
            // insert entries into the training tables.
            const map<string,int>& classWords = cWordTotalFrequency[cls->first];
            const map<string,int>::const_iterator hit = classWords.find(w->first);
            const int count = (hit == classWords.end()) ? 0 : hit->second;
            const double p = (count + 1.0) / (cls->second + vocabularySize);
            perClass[cls->first] = p;
            cout<<"P("<<w->first<<"|"<<cls->first<<") \t= "<<p<<endl;
        }
    }

    cout<<"Classification Probability:"<<endl;
    bool haveBest = false;
    double bestScore = 0;
    int classification = 0;
    for(map<int,int>::const_iterator cls = cTotalFrequency.begin(); cls != cTotalFrequency.end(); ++cls){
        const double prior = static_cast<double>(cls->second) / static_cast<double>(totalWords);
        double tcp = 1;       // plain product, kept for display only
        double logScore = 0;  // same quantity in log space, used for the decision
        for(size_t j = 0; j < d.size(); ++j){
            const double p = cp[d[j]][cls->first];
            tcp *= p;
            logScore += log(p);
        }
        tcp *= prior;
        // A class whose documents are all empty has prior 0; log(0) is -inf,
        // which correctly ranks it below every class with a positive prior.
        logScore += log(prior);
        cout<<"classification:"<<cls->first<<"\t"<<"Probability:"<<tcp<<endl;
        // The product underflows to 0 for long documents, so the winner is
        // chosen on the log score. Ties keep the smallest label.
        if(!haveBest || logScore > bestScore){
            haveBest = true;
            bestScore = logScore;
            classification = cls->first;
        }
    }
    cout<<"The new document classification is:"<<classification<<endl;
}

/*
  Trains on 1.txt .. 5.txt, with labels taken from classification.txt, and
  classifies new.txt. All files are looked up in the working directory.
*/
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    vector<int> c = openClassificationFile("classification.txt");
    vector<string> files;
    files.push_back("1.txt");
    files.push_back("2.txt");
    files.push_back("3.txt");
    files.push_back("4.txt");
    files.push_back("5.txt");
    cout<<"训练文档集:"<<endl;
    vector<vector<string> > docs = openFiles(files);
    cout<<"待分类文档:"<<endl;
    vector<string> d = openFile("new.txt");
    bayesian(docs,c,d);
    return 0;
}

效果展示:


结论:

朴素贝叶斯分类器用于处理离散型的文本数据,能够有效对文本文档进行分类。在实验过程中,最困难的地方在于数据结构的设计,由于要统计每个文档类别的频数和每个文档类别下单词的概率,这个地方需要用到复杂映射与统计,在编码过程中经过不断的思考,最终通过多级映射的形式储存所需的数据,最终计算出新文档的类别。通过实验,成功将新的未分类文档输入例子分类为期待的文档类型,实验结果较为满意。


来自: http://blog.csdn.net/k76853/article/details/50532195