library(bitops)
library(httr)
library(RCurl)
library(XML)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
library(NLP)
library(tmcn)
## # tmcn Version: 0.2-12
library(jiebaRD)
library(jiebaR)
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(Matrix)
docs.corpus <- Corpus(DirSource("./data"))
docs.seg <- tm_map(docs.corpus, segmentCN)
docs.tdm <- TermDocumentMatrix(docs.seg)
docs.tf <- apply(as.matrix(docs.tdm), 2, function(word) { word/sum(word) })
idf <- function(doc) {
return ( log2( length(doc)+1 / nnzero(doc)) )
}
docs.idf <- apply(as.matrix(docs.tdm), 1, idf)
docs.tfidf <- docs.tf * docs.idf
inspect(docs.tdm)
## <<TermDocumentMatrix (terms: 1296, documents: 2)>>
## Non-/sparse entries: 1606/986
## Sparsity : 38%
## Maximal term length: 16
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
## and 113 89
## are 22 21
## but 20 13
## for 23 28
## not 16 23
## our 67 76
## that 50 57
## the 129 104
## this 24 13
## will 19 21
docs.tf <- apply(as.matrix(docs.tdm), 2, function(doc) {doc / sum(doc)})
idf.function <- function(word_doc) { log2( (length(word_doc)+1) / nnzero(word_doc) ) }
docs.idf <- apply(docs.tdm, 1, idf.function)
docs.tfidf <- docs.tf * docs.idf
head(docs.tfidf)
## Docs
## Terms 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
## abandoned 0.0008267932 0
## ability 0.0008267932 0
## accept 0.0008267932 0
## account 0.0008267932 0
## achieve 0.0008267932 0
## across 0.0024803795 0
定義查詢函數,查詢nation,Americans, against,economy,threaten五個詞在各篇文章的tfidf值
query.tfidf <- function(q){
q.position <- which(rownames(docs.tfidf) %in% q)
q.tfidf <- docs.tfidf[q.position, ]
return (q.tfidf)
}
query.tfidf(c("nation", "fear", "against", "economy", "threaten"))
## Docs
## Terms 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
## against 0.0003051448 0.0003457225
## economy 0.0009154343 0.0003457225
## fear 0.0006102895 0.0003457225
## nation 0.0036617371 0.0020743351
## threaten 0.0008267932 0.0000000000
分別就單字來看,against在2013年的演講稿(以下簡稱2013年)稍微提高 可能歐巴馬認為在他的任期內面對的危機,不管政治、軍事都有提高 所以在演講稿中提及此字的頻率增加。
最明顯的就是threaten,可以猜得出來,他當初一開始當選可能利用這個威脅這個單字 讓全國人民達到一種同仇敵愾的感覺,對他來說是製造向心力的手法,也是想讓人民表示 他將會有能力領導好大家的感覺。
但在後期就沒出現了…
cos <- function(x, y){
return (x %*% y / sqrt(x %*% x * y %*% y))[1, 1]
}
# compare with first doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 1])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
## 1.0000000 0.8318437
# compare with first doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 2])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
## 0.8318437 1.0000000
library(wordcloud)
## Loading required package: RColorBrewer
f <- sort(rowSums(docs.tfidf), decreasing = T)
docs.df <- data.frame(
word = names(f),
freq = f
)
wordcloud(docs.df$word, docs.df$freq, scale=c(20,0.1),min.freq=180,max.words=40, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : the could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : our could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : and could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : that could not be fit on page. It will not be plotted.
docs.pca <- prcomp(docs.tfidf, scale = T)
fviz_eig(docs.pca)
fviz_pca_ind(docs.pca, geom.ind = c("point"), col.ind = "cos2")
fviz_pca_var(docs.pca, col.var = "contrib")
fviz_pca_biplot(docs.pca, geom.ind = "point")
docs.eig <- get_eig(docs.pca)
docs.var <- get_pca_var(docs.pca)
docs.ind <- get_pca_ind(docs.pca)
ind.coord2 <- docs.ind$coord[, 1:2]
wss <- c()
for (i in 1:10) { wss[i] <- kmeans(ind.coord2, i)$tot.withinss }
plot(wss, type = "b")
km <- kmeans(ind.coord2, 3)
plot(ind.coord2, col = km$cluster)
points(km$centers, col = 1:50, pch = 8, cex = 2)