日志

机器学习与R之朴素贝叶斯分类器

已有 1106 次阅读2016-7-3 06:35

朴素贝叶斯

1联合概率分布

p(x,y)=p(y)P(x|y) 或者p(A交B)=p(A)xp(B) p(A交B)不容易求,假设条件独立拆分成两个事件的乘积

2基本假设条件独立性

3利用贝叶斯定理 p(y|x)=p(x,y)/p(x)=p(y)p(x|y)/Σ_i[p(y_i)p(x|y_i)] (分母对所有类别y_i求和)

y=argmax_y p(y)P(x|y) (取使p(y)P(x|y)最大的类别y)

贝叶斯决策理论要求计算两个概率p1(x，y),p2(x, y):

如果p1(x，y) > p2 (x, y) , 那么属于类别1

如果p2(x, y) > p1(x, y) , 那么属于类别2

拉普拉斯估计--防止概率值为0

给每一个似然函数的分子加1,并给分母加上分子中所加的1的总数

在朴素贝叶斯中使用数值特征时,采用数值特征离散化:找到数据分布的分割点进行切分

1分割词-去大小写-去数字-去停用词-去多余空格符号

2统计每个词在每条短信中出现的频率

3创建频率矩阵:行是每条短信,列是每个词,取值表示该词在短信中是否出现(可以是yes/no);label可以是字符串

# Overview snippet: sms_train, sms_train_labels and sms_test are built
# further down in this file, so this does not run on its own at this point.
library(e1071)

# Fit a naive Bayes classifier from the training features and their labels.
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

# Predict on the test features with the fitted classifier.
sms_test_pred <- predict(sms_classifier, sms_test)

例子 iris[,-5]不含第5项

# Load the iris data set.
data(iris)

# Train on every column except the 5th, using the 5th column as the label.
m <- naiveBayes(iris[,-5], iris[,5])

# Cross-tabulate predictions made on the same data against the 5th column.
table(predict(m, iris), iris[,5])

另一个朴素贝叶斯包 klaR NaiveBayes()

朴素贝叶斯分类器通常有两种实现方式：一种基于贝努利模型实现，一种基于多项式模型实现

这里采用前一种实现方式。该实现方式中并不考虑词在文档中出现的次数，只考虑出不出现，

因此在这个意义上相当于假设词是等权重的

垃圾短信识别

# read the sms data into the sms data frame
# (strings are kept as character vectors, not converted to factors)

sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

# examine the structure of the sms data: junk messages are labelled "spam",
# legitimate ones "ham"; the columns are type + text

str(sms_raw)

# convert spam/ham to factor: string class labels are better stored as a factor

sms_raw$type <- factor(sms_raw$type)

# examine the type variable more carefully

str(sms_raw$type)

# count how many messages fall in each class
table(sms_raw$type)

# build a corpus using the text mining (tm) package

# tm is the text mining package

library(tm)

sms_corpus <- VCorpus(VectorSource(sms_raw$text))# create the corpus; VCorpus stores R text documents

# examine the sms corpus

print(sms_corpus)

inspect(sms_corpus[1:2]) # look at the first two documents in the corpus

# show the text of the first document
as.character(sms_corpus[[1]])

# show the text of the first two documents
lapply(sms_corpus[1:2], as.character)

# clean up the corpus using tm_map(); the disabled line below would
# convert letters to lower case as the first step

#sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))

# show the difference between sms_corpus and corpus_clean

#as.character(sms_corpus[[1]])

#as.character(sms_corpus_clean[[1]])

# Active pipeline: each step takes the previous step's result, so the
# order below (numbers, lower case, stop words, punctuation) is significant.
sms_corpus_clean <- tm_map(sms_corpus, removeNumbers) # remove numbers

sms_corpus_clean <- tm_map(sms_corpus_clean, content_transformer(tolower)) # convert letters to lower case

sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words

sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation

# tip: create a custom function to replace (rather than remove) punctuation

# what tm's removePunctuation() does to a sample string, for comparison
removePunctuation("hello...world")

# Swap every run of one or more punctuation characters in x for a single
# space, so words joined only by punctuation stay separate.
replacePunctuation <- function(x) {
  gsub(pattern = "[[:punct:]]+", replacement = " ", x = x)
}

# same sample string passed through the custom replacePunctuation() helper
replacePunctuation("hello...world")

# illustration of word stemming

library(SnowballC)

wordStem(c("learn", "learned", "learning", "learns"))

# apply stemming to every document in the cleaned corpus
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)

sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace

# examine the final clean corpus

# first three documents before cleaning ...
lapply(sms_corpus[1:3], as.character)

# ... and the same three documents after cleaning
lapply(sms_corpus_clean[1:3], as.character)

# create a document-term sparse matrix from the cleaned corpus

sms_dtm <- DocumentTermMatrix(sms_corpus_clean)

# alternative solution: create a document-term sparse matrix directly from the SMS corpus

# build the sparse matrix straight from the raw corpus, requesting the
# preprocessing steps through the control list

sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(

tolower = TRUE,

removeNumbers = TRUE,

stopwords = TRUE,

removePunctuation = TRUE,

stemming = TRUE

))

# alternative solution: using custom stop words function ensures identical result

# use a custom stop words function (removeWords() with stopwords(), as in
# the tm_map() pipeline) instead of stopwords = TRUE

sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(

tolower = TRUE,

removeNumbers = TRUE,

stopwords = function(x) { removeWords(x, stopwords()) },

removePunctuation = TRUE,

stemming = TRUE

))

# compare the result: print the summary of each of the three matrices

sms_dtm

sms_dtm2

sms_dtm3

# creating training and test datasets
# rows 1-4169 become the training set, rows 4170-5559 the test set

sms_dtm_train <- sms_dtm[1:4169, ]

sms_dtm_test <- sms_dtm[4170:5559, ]

# also save the labels, using the same row ranges of sms_raw
# (assumes sms_dtm rows are in the same order as sms_raw rows -- the corpus
# was built from sms_raw$text)

sms_train_labels <- sms_raw[1:4169, ]$type

sms_test_labels <- sms_raw[4170:5559, ]$type

# check that the proportion of spam is similar in both sets

prop.table(table(sms_train_labels))

prop.table(table(sms_test_labels))

# word cloud visualization

library(wordcloud)

# create the word cloud directly from the cleaned corpus

wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

# subset the data into spam and ham groups
# NOTE(review): this subsets all of sms_raw, not only the training rows

spam <- subset(sms_raw, type == "spam")

ham <- subset(sms_raw, type == "ham")

# one word cloud per class, built from the raw message text
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

# drop sparse terms from the training DTM using a 0.999 sparsity threshold
# NOTE(review): sms_dtm_freq_train is reassigned later in this file from
# sms_freq_words, so this result is only used by the printout just below
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)

sms_dtm_freq_train

# indicator features for frequent words: keep only terms found by
# findFreqTerms() with a lower frequency bound of 5 -- this reduces the
# number of features

findFreqTerms(sms_dtm_train, 5)

# save frequently-appearing terms to a character vector

sms_freq_words <- findFreqTerms(sms_dtm_train, 5)

str(sms_freq_words)

# create DTMs with only the frequent terms
# (the test set is restricted to the terms selected from the training set)

sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]

sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]

# convert counts to "Yes"/"No" presence indicators
#
# x: a vector of term counts (e.g. one column of a document-term matrix).
# Returns a character vector shaped like x: "Yes" where the count is greater
# than 0, "No" otherwise; NA counts stay NA, as with ifelse().
# The ifelse() result is returned directly instead of through a trailing
# assignment, so the value is visible (auto-prints) when the function is
# called interactively.
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# apply() convert_counts() to columns of train/test data
# (MARGIN = 2 means columns; MARGIN = 1 would mean rows)

sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)

sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

## Step 3: Training a model on the data ---- (training)

library(e1071)

# fit the classifier on the Yes/No training features and their labels
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

## Step 4: Evaluating model performance ---- (prediction)

sms_test_pred <- predict(sms_classifier, sms_test)

# ---- evaluation

library(gmodels)

# cross-tabulate predicted against actual test labels; the chi-square,
# table and row proportions are switched off
CrossTable(sms_test_pred, sms_test_labels,

prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,

dnn = c('predicted', 'actual'))

## Step 5: Improving model performance ---- add the Laplace estimator (laplace = 1)

sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)

sms_test_pred2 <- predict(sms_classifier2, sms_test)

# same cross-tabulation for the Laplace-smoothed model
CrossTable(sms_test_pred2, sms_test_labels,

prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,

dnn = c('predicted', 'actual'))

路过

雷人

握手

鲜花

langke93的个人空间 https://www.aboutyun.com/?1415 [收藏] [复制] [分享] [RSS]

日志

机器学习与R之朴素贝叶斯分类器

全部作者的其他最新日志

评论 (0 个评论)

langke93

推荐 /2