Analysis of Obama's 2009 and 2013 Inaugural Addresses

TFIDF

Load packages

library(bitops)
library(httr)
library(RCurl)
library(XML)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
library(NLP)
library(tmcn)
## # tmcn Version: 0.2-12
library(jiebaRD)
library(jiebaR)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(Matrix)

Term-Document Matrix

# read every file under ./data into a corpus (one document per speech)
docs.corpus <- Corpus(DirSource("./data"))
# word segmentation with tmcn::segmentCN
docs.seg <- tm_map(docs.corpus, segmentCN)
docs.tdm <- TermDocumentMatrix(docs.seg)
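
Before weighting, it is worth a quick look at the matrix itself. A minimal sketch using tm's built-in helpers (the threshold of 20 is an arbitrary choice, not part of the original analysis):

# dimensions: number of distinct terms x number of documents
dim(docs.tdm)
# terms appearing at least 20 times in total across the corpus
findFreqTerms(docs.tdm, lowfreq = 20)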

TFIDF

docs.tf <- apply(as.matrix(docs.tdm), 2, function(word) { word/sum(word) })
# idf = log2((number of documents + 1) / number of documents containing the term)
idf <- function(doc) {
  return ( log2( (length(doc) + 1) / nnzero(doc) ) )
}
docs.idf <- apply(as.matrix(docs.tdm), 1, idf)
docs.tfidf <- docs.tf * docs.idf

Inspect the Data

inspect(docs.tdm)
## <<TermDocumentMatrix (terms: 1296, documents: 2)>>
## Non-/sparse entries: 1606/986
## Sparsity           : 38%
## Maximal term length: 16
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   and                      113                      89
##   are                       22                      21
##   but                       20                      13
##   for                       23                      28
##   not                       16                      23
##   our                       67                      76
##   that                      50                      57
##   the                      129                     104
##   this                      24                      13
##   will                      19                      21
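
The sample terms above are dominated by English stopwords (and, the, our, that, ...), which carry little topical content; the analysis below keeps them and relies on the idf weighting to damp them. If one wanted to drop them before building the matrix, a sketch with tm's standard transformations (not applied in this analysis) could look like:

# optional cleanup on the raw corpus, before segmentation and the TDM
docs.clean <- tm_map(docs.corpus, content_transformer(tolower))
docs.clean <- tm_map(docs.clean, removeWords, stopwords("english"))
docs.clean <- tm_map(docs.clean, removePunctuation)
# the rest of the pipeline would then be rebuilt from docs.clean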

TFIDF Counting

Term frequency of each word

# term frequency: each word's count divided by the total count in that document
docs.tf <- apply(as.matrix(docs.tdm), 2, function(doc) { doc / sum(doc) })
# inverse document frequency, same formula as above
idf.function <- function(word_doc) { log2( (length(word_doc) + 1) / nnzero(word_doc) ) }
docs.idf <- apply(as.matrix(docs.tdm), 1, idf.function)
docs.tfidf <- docs.tf * docs.idf
head(docs.tfidf)
##            Docs
## Terms       歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   abandoned            0.0008267932                       0
##   ability              0.0008267932                       0
##   accept               0.0008267932                       0
##   account              0.0008267932                       0
##   achieve              0.0008267932                       0
##   across               0.0024803795                       0
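
As a sanity check, the value for a single term can be reproduced by hand straight from the term-document matrix. The names m and term below are just local helpers for the check, using abandoned, one of the rows shown above:

# recompute tf-idf for one term directly from the raw counts
m <- as.matrix(docs.tdm)
term <- "abandoned"
(m[term, ] / colSums(m)) * log2((ncol(m) + 1) / nnzero(m[term, ]))
docs.tfidf[term, ]  # should match the row printed above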

Query of Words

Define a query function and look up the tf-idf values of the five words nation, fear, against, economy, and threaten in each speech.

query.tfidf <- function(q){
  # rows of the tf-idf matrix whose terms match the query
  q.position <- which(rownames(docs.tfidf) %in% q)
  q.tfidf <- docs.tfidf[q.position, ]
  return (q.tfidf)
}
query.tfidf(c("nation", "fear", "against", "economy", "threaten"))
##           Docs
## Terms      歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   against             0.0003051448            0.0003457225
##   economy             0.0009154343            0.0003457225
##   fear                0.0006102895            0.0003457225
##   nation              0.0036617371            0.0020743351
##   threaten            0.0008267932            0.0000000000

Looking at the words individually, against is slightly higher in the 2013 speech (hereafter "2013"). Obama may have felt that the crises he faced during his first term, both political and military, had grown, so the word comes up more often in that address.

The most striking case is threaten. One can guess that when he was first elected he used this word to rally the whole country against a common threat: a way of building solidarity, and of signalling to the people that he would be capable of leading them through it.

But in the later speech it no longer appears at all…

Cosine Similarity

Define a function that computes the cosine between two vectors x and y

# note: this definition masks base::cos for the rest of the session
cos <- function(x, y){
  # %*% returns a 1 x 1 matrix, so take [1, 1] to get back a plain number
  return ( (x %*% y / sqrt((x %*% x) * (y %*% y)))[1, 1] )
}
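
A quick sanity check on toy vectors before applying it to the documents: identical directions should give 1 and orthogonal vectors 0.

cos(c(1, 2, 3), c(1, 2, 3))  # same direction -> 1
cos(c(1, 0), c(0, 1))        # orthogonal -> 0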

Compute the cosine between each document's tf-idf vector and the tf-idf vector of the first document

# compare with first doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 1])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt 
##               1.0000000               0.8318437

Compute the cosine between each document's tf-idf vector and the tf-idf vector of the second document

# compare with second doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 2])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt 
##               0.8318437               1.0000000
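
With only two documents the two apply calls above already cover every pair, but the full pairwise similarity matrix can also be computed in one step. A compact sketch (norms and docs.cos.mat are new helper names, not from the original script):

# cosine similarity between every pair of documents
norms <- sqrt(colSums(docs.tfidf^2))
docs.cos.mat <- crossprod(docs.tfidf) / tcrossprod(norms)
docs.cos.mat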

Word Cloud

library(wordcloud)
## Loading required package: RColorBrewer
f <- sort(rowSums(docs.tfidf), decreasing = T)
docs.df <- data.frame(
  word = names(f),
  freq = f
)
wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
          min.freq = 180, max.words = 40, colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : the could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : our could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : and could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : that could not be fit on page. It will not be plotted.
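
The warnings come from scale = c(20, 0.1): the highest-weighted words are drawn so large that they cannot fit on the device. A hypothetical re-run with a more modest scale (values chosen by eye, not from the original analysis) should let them plot:

# same data, smaller maximum word size so the top terms fit on the page
wordcloud(docs.df$word, docs.df$freq, scale = c(4, 0.5),
          min.freq = 0, max.words = 40, colors = brewer.pal(8, "Dark2"))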

PCA

docs.pca <- prcomp(docs.tfidf, scale = T)

Plots

fviz_eig(docs.pca)

fviz_pca_ind(docs.pca, geom.ind = c("point"), col.ind = "cos2")

fviz_pca_var(docs.pca, col.var = "contrib")

fviz_pca_biplot(docs.pca, geom.ind = "point")

PCA results

docs.eig <- get_eig(docs.pca)
docs.var <- get_pca_var(docs.pca)
docs.ind <- get_pca_ind(docs.pca)
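
get_eig should give, for each principal component, its eigenvalue and the (cumulative) percentage of variance it explains; with only two documents as variables there are at most two components. A quick look, as a sketch with no output reproduced here:

# eigenvalues and percentage of variance explained per component
docs.eig
# coordinates of the first few terms (the PCA individuals) on those components
head(docs.ind$coord)
# contribution of each document (the PCA variables) to the components
docs.var$contrib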

K-means

Choosing K

ind.coord2 <- docs.ind$coord[, 1:2]
# elbow method: total within-cluster sum of squares for k = 1..10
wss <- c()
for (i in 1:10) { wss[i] <- kmeans(ind.coord2, i)$tot.withinss }
plot(wss, type = "b")
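
factoextra, which is already loaded, draws the same elbow plot in one call; a minimal sketch:

# within-cluster sum of squares for k = 1..10, same elbow criterion as above
fviz_nbclust(ind.coord2, kmeans, method = "wss", k.max = 10)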

Clustering

km <- kmeans(ind.coord2, 3)
plot(ind.coord2, col = km$cluster)
# mark the three cluster centres
points(km$centers, col = 1:3, pch = 8, cex = 2)
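
The same clustering can also be drawn with factoextra for a ggplot-based view of the km object; a minimal sketch:

# clusters of terms on the first two principal components
fviz_cluster(km, data = ind.coord2, geom = "point")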