貝葉斯學習方法中實用性很高的一種為樸素貝葉斯學習器,常被稱為樸素貝葉斯分類器。在某些領域中與神經網絡和決策樹學習相當。雖然樸素貝葉斯分類器忽略單詞間的依賴關系,即假設所有單詞是條件獨立的,但樸素貝葉斯分類器在實際應用中有很出色的表現。
樸素貝葉斯文本分類算法偽代碼:

樸素貝葉斯文本分類算法流程:

通過計算訓練集中每個類別的概率與不同類別下每個單詞的概率,然后利用樸素貝葉斯公式計算新文檔被分類為各個類別的概率,最終輸出概率最大的類別。
C++源碼:
~~~
/*
Bayesian classifier for document classification
15S103182
Ethan
2015.12.27
*/
#include <iostream>
#include <vector>
#include <iterator>
#include <map>
#include <fstream>
#include <iomanip>
#include <sstream>
using namespace std;
/*
 Convert a decimal string to an int via stringstream extraction.
 Fix: `b` was previously uninitialized, so a failed extraction (empty or
 non-numeric input) returned an indeterminate value — undefined behavior.
 It is now zero-initialized, so invalid input yields 0.
*/
int stringToInteger(string a){
    stringstream ss;
    ss<<a;
    int b = 0;  // safe default if `ss>>b` fails
    ss>>b;
    return b;
}
/*
 Read whitespace-separated integer class labels from `dataset`.
 Returns an empty vector (after printing a message) if the file cannot
 be opened.
 Fixes:
  - The original looped on `!file.eof()`, which runs one extra iteration
    after the last token and pushes a stale/garbage value; looping on the
    extraction itself stops cleanly at EOF.
  - Labels are now extracted directly as ints instead of going through a
    string round-trip.
  - Removed the unused local counter `i`.
*/
vector<int> openClassificationFile(const char* dataset){
    fstream file;
    file.open(dataset,ios::in);
    if(!file)
    {
        cout <<"Open File Failed!" <<endl;
        return vector<int>();
    }
    vector<int> data;
    int value;
    while(file>>value){  // extraction as loop condition: no trailing garbage entry
        data.push_back(value);
    }
    file.close();
    return data;
}
/*
 Read a document as whitespace-separated word tokens from `dataset`,
 echo the tokens to stdout, and return them.
 Returns an empty vector (after printing a message) if the file cannot
 be opened.
 Fixes:
  - The original looped on `!file.eof()`, which pushes a duplicate/empty
    trailing token after the final read; looping on the extraction itself
    avoids that.
  - Removed the unused local counter `i`; echo loop uses size_t to avoid
    a signed/unsigned comparison.
*/
vector<string> openFile(const char* dataset){
    fstream file;
    file.open(dataset,ios::in);
    if(!file)
    {
        cout <<"Open File Failed!" <<endl;
        return vector<string>();
    }
    vector<string> data;
    string temp;
    while(file>>temp){  // extraction as loop condition: no trailing garbage token
        data.push_back(temp);
    }
    file.close();
    for(size_t i=0;i<data.size();i++) cout<<data[i]<<"\t";
    cout<<endl;
    cout<<"Open file successfully!"<<endl;
    return data;
}
/*
 Load every training document listed in `files`.
 Each element of the result holds the token list of one document,
 in the same order as the input file names.
*/
vector<vector<string> > openFiles(const vector<char*> files){
    vector<vector<string> > docs;
    for(size_t idx=0;idx<files.size();idx++){
        docs.push_back(openFile(files[idx]));
    }
    return docs;
}
/*
 Naive Bayes text classification.
   docs - training documents, one token vector per document
   c    - class label of each training document (parallel to docs);
          NOTE(review): labels are assumed to be the contiguous range
          0..K-1, because the loops below index the class maps with a
          plain int counter — confirm against classification.txt
   d    - tokens of the new, unclassified document
 Prints every word's training frequency, the Laplace-smoothed conditional
 probability P(word|class) for each word of `d`, the score of each class,
 and finally the class with the highest score.
*/
void bayesian(vector<vector<string> > docs,vector<int> c,vector<string> d){
map<string,int> wordFrequency;//occurrences of each word across the whole training set
map<int,float> cWordProbability;//per-class word-token count; divided later to give P(class)
map<int,int> cTotalFrequency;//total number of word tokens in each class
map<int,map<string,int> > cWordlTotalFrequency;//per-class occurrence count of each word
int totalWords=0;
// First pass: accumulate global, per-class, and per-class-per-word counts.
for(int i=0;i<docs.size();i++){
totalWords += docs[i].size();
cWordProbability[c[i]] = cWordProbability[c[i]] + docs[i].size();
map<string,int> sn;// word counts within this single document
for(int j=0;j<docs[i].size();j++){
wordFrequency[docs[i][j]] = wordFrequency[docs[i][j]] + 1;
sn[docs[i][j]] = sn[docs[i][j]] + 1;
}
map<string,int>::iterator isn;
// Merge this document's counts into its class's word-count map.
for(isn = sn.begin();isn!=sn.end();isn++){
cWordlTotalFrequency[c[i]][isn->first] = cWordlTotalFrequency[c[i]][isn->first] + isn->second;
}
}
int tw = wordFrequency.size();// vocabulary size (currently unused below; wordFrequency.size() is re-read)
map<int,float>::iterator icWordProbability;
// Convert per-class token counts into the class prior P(class) = tokens-in-class / total tokens,
// keeping the raw count in cTotalFrequency for the smoothing denominator.
for(icWordProbability=cWordProbability.begin();icWordProbability!=cWordProbability.end();icWordProbability++){
cTotalFrequency[icWordProbability->first] = icWordProbability->second;
cWordProbability[icWordProbability->first] = icWordProbability->second / totalWords;
}
cout<<"Word Frequency:"<<endl;
map<string,int>::iterator iwordFrequency;
for(iwordFrequency=wordFrequency.begin();iwordFrequency!=wordFrequency.end();iwordFrequency++){
cout<<setw(8)<<iwordFrequency->first<<"\tFrequency:"<<iwordFrequency->second<<endl;
}
cout<<"Conditional Probability:"<<endl;
map<string,int> dtw;//word frequency of the document to classify
for(int i=0;i<d.size();i++) dtw[d[i]] = dtw[d[i]] + 1;
map<string,map<int,float> > cp;//P(word|class) for each word of the new document
map<string,int>::iterator idtw;
for(idtw=dtw.begin();idtw!=dtw.end();idtw++){
map<int,float> cf;
// Indexing by j assumes class ids are exactly 0..K-1 (see note above).
for(int j=0;j<cTotalFrequency.size();j++){
float p=0;
// Laplace (add-one) smoothing: (count(word,class)+1) / (tokens-in-class + vocabulary-size).
// operator[] default-inserts 0 for words the class never saw, which is the intended count.
p = (float)(cWordlTotalFrequency[j][idtw->first] +1) / (cTotalFrequency[j] + wordFrequency.size());
cf[j] = p;
cout<<"P("<<idtw->first<<"|"<<j<<") \t= "<<p<<endl;
}
cp[idtw->first] = cf;
}
cout<<"Classification Probability:"<<endl;
float mp = 0;
int classification=0;
// Score each class: P(class) * product of P(word|class) over the new document's
// tokens (repeated words multiply repeatedly), and keep the argmax.
for(int i=0;i<cTotalFrequency.size();i++){
float tcp=1;
for(int j=0;j<d.size();j++){
tcp = tcp * cp[d[j]][i];
}
tcp = tcp * cWordProbability[i];
cout<<"classification:"<<i<<"\t"<<"Probability:"<<tcp<<endl;
if(mp<tcp) {
mp = tcp;
classification = i;
}
}
cout<<"The new document classification is:"<<classification<<endl;
}
/*
 Entry point: load the per-document class labels, the five training
 documents and the new document, then run the Naive Bayes classifier.
 Fix: string literals were pushed into a vector<char*> via an implicit
 conversion that ISO C++11 and later forbid (literals are const char[]).
 An explicit const_cast keeps the vector<char*> interface that
 openFiles() requires; the pointers are never written through.
*/
int main(int argc, char** argv) {
    vector<vector<string> > docs;
    // One label per training file below, read from classification.txt.
    vector<int> c = openClassificationFile("classification.txt");
    vector<char *> files;
    files.push_back(const_cast<char*>("1.txt"));
    files.push_back(const_cast<char*>("2.txt"));
    files.push_back(const_cast<char*>("3.txt"));
    files.push_back(const_cast<char*>("4.txt"));
    files.push_back(const_cast<char*>("5.txt"));
    cout<<"訓練文檔集:"<<endl;
    docs = openFiles(files);
    vector<string> d;
    cout<<"待分類文檔:"<<endl;
    d = openFile("new.txt");
    bayesian(docs,c,d);
    return 0;
}
~~~
效果展示:

結論:
樸素貝葉斯分類器用于處理離散型的文本數據,能夠有效對文本文檔進行分類。在實驗過程中,最困難的地方在于數據結構的設計,由于要統計每個文檔類別的頻數和每個文檔類別下單詞的概率,這個地方需要用到復雜映射與統計,在編碼過程中經過不斷的思考,最終通過多級映射的形式儲存所需的數據,最終計算出新文檔的類別。通過實驗,成功將新的未分類文檔輸入例子分類為期待的文檔類型,實驗結果較為滿意。
- 前言
- 插入排序
- 歸并排序
- 快速排序
- 最長公共子序列
- 斐波那契數列-臺階問題
- 求n*n階矩陣最大子矩陣階數
- 01背包
- 整數序列合并問題
- 動態規劃算法的一般解題思路
- 01背包-近似算法
- 樹搜索策略
- 求數組中的逆序對
- 并行機器最短調度問題
- 隨機算法
- 判斷兩多項式之積是否等于另一多項式
- 頂點覆蓋問題
- Apriori算法 (Introduction to data mining)
- 聚類算法-DBSCAN-C++實現
- 聚類算法-K-means-C++實現
- 聚類算法-Hierarchical(MIN)-C++
- 爬山法、分支限界法求解哈密頓環問題
- Best-First求解八數碼問題
- Naive Bayesian文本分類器