Analysis of Obama's 2009 and 2013 Inaugural Addresses

TFIDF

Load packages

library(bitops)
library(httr)
library(RCurl)
library(XML)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
library(NLP)
library(tmcn)
## # tmcn Version: 0.2-12
library(jiebaRD)
library(jiebaR)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(Matrix)

Term-Document Matrix

# read every file under ./data into a corpus (one document per speech)
docs.corpus <- Corpus(DirSource("./data"))
# word segmentation with tmcn::segmentCN
docs.seg <- tm_map(docs.corpus, segmentCN)
docs.tdm <- TermDocumentMatrix(docs.seg)
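
Before weighting, it is worth a quick look at the matrix itself. A minimal sketch using tm's built-in helpers (the threshold of 20 is an arbitrary choice, not part of the original analysis):

# dimensions: number of distinct terms x number of documents
dim(docs.tdm)
# terms appearing at least 20 times in total across the corpus
findFreqTerms(docs.tdm, lowfreq = 20)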

TFIDF

docs.tf <- apply(as.matrix(docs.tdm), 2, function(word) { word/sum(word) })
# idf = log2((number of documents + 1) / number of documents containing the term)
idf <- function(doc) {
  return ( log2( (length(doc) + 1) / nnzero(doc) ) )
}
docs.idf <- apply(as.matrix(docs.tdm), 1, idf)
docs.tfidf <- docs.tf * docs.idf

Inspect the Data

inspect(docs.tdm)
## <<TermDocumentMatrix (terms: 1296, documents: 2)>>
## Non-/sparse entries: 1606/986
## Sparsity           : 38%
## Maximal term length: 16
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   and                      113                      89
##   are                       22                      21
##   but                       20                      13
##   for                       23                      28
##   not                       16                      23
##   our                       67                      76
##   that                      50                      57
##   the                      129                     104
##   this                      24                      13
##   will                      19                      21
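
The sample terms above are dominated by English stopwords (and, the, our, that, ...), which carry little topical content; the analysis below keeps them and relies on the idf weighting to damp them. If one wanted to drop them before building the matrix, a sketch with tm's standard transformations (not applied in this analysis) could look like:

# optional cleanup on the raw corpus, before segmentation and the TDM
docs.clean <- tm_map(docs.corpus, content_transformer(tolower))
docs.clean <- tm_map(docs.clean, removeWords, stopwords("english"))
docs.clean <- tm_map(docs.clean, removePunctuation)
# the rest of the pipeline would then be rebuilt from docs.clean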

TFIDF Counting

Term frequency of each word

# term frequency: each word's count divided by the total count in that document
docs.tf <- apply(as.matrix(docs.tdm), 2, function(doc) { doc / sum(doc) })
# inverse document frequency, same formula as above
idf.function <- function(word_doc) { log2( (length(word_doc) + 1) / nnzero(word_doc) ) }
docs.idf <- apply(as.matrix(docs.tdm), 1, idf.function)
docs.tfidf <- docs.tf * docs.idf
head(docs.tfidf)
##            Docs
## Terms       歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   abandoned            0.0008267932                       0
##   ability              0.0008267932                       0
##   accept               0.0008267932                       0
##   account              0.0008267932                       0
##   achieve              0.0008267932                       0
##   across               0.0024803795                       0
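
As a sanity check, the value for a single term can be reproduced by hand straight from the term-document matrix. The names m and term below are just local helpers for the check, using abandoned, one of the rows shown above:

# recompute tf-idf for one term directly from the raw counts
m <- as.matrix(docs.tdm)
term <- "abandoned"
(m[term, ] / colSums(m)) * log2((ncol(m) + 1) / nnzero(m[term, ]))
docs.tfidf[term, ]  # should match the row printed above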

Query of Words

Define a query function and look up the tf-idf values of the five words nation, fear, against, economy, and threaten in each speech.

query.tfidf <- function(q){
  # rows of the tf-idf matrix whose terms match the query
  q.position <- which(rownames(docs.tfidf) %in% q)
  q.tfidf <- docs.tfidf[q.position, ]
  return (q.tfidf)
}
query.tfidf(c("nation", "fear", "against", "economy", "threaten"))
##           Docs
## Terms      歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt
##   against             0.0003051448            0.0003457225
##   economy             0.0009154343            0.0003457225
##   fear                0.0006102895            0.0003457225
##   nation              0.0036617371            0.0020743351
##   threaten            0.0008267932            0.0000000000

Looking at the words individually, against is slightly higher in the 2013 speech (hereafter "2013"). Obama may have felt that the crises he faced during his first term, both political and military, had grown, so the word comes up more often in that address.

The most striking case is threaten. One can guess that when he was first elected he used this word to rally the whole country against a common threat: a way of building solidarity, and of signalling to the people that he would be capable of leading them through it.

But in the later speech it no longer appears at all…

Cosine Similarity

Define a function that computes the cosine between two vectors x and y

# note: this definition masks base::cos for the rest of the session
cos <- function(x, y){
  # %*% returns a 1 x 1 matrix, so take [1, 1] to get back a plain number
  return ( (x %*% y / sqrt((x %*% x) * (y %*% y)))[1, 1] )
}
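
A quick sanity check on toy vectors before applying it to the documents: identical directions should give 1 and orthogonal vectors 0.

cos(c(1, 2, 3), c(1, 2, 3))  # same direction -> 1
cos(c(1, 0), c(0, 1))        # orthogonal -> 0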

Compute the cosine between each document's tf-idf vector and the tf-idf vector of the first document

# compare with first doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 1])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt 
##               1.0000000               0.8318437

Compute the cosine between each document's tf-idf vector and the tf-idf vector of the second document

# compare with second doc
docs.cos.sim <- apply(docs.tfidf, 2, cos, y = docs.tfidf[, 2])
docs.cos.sim
## 歐巴馬就職演說-2009.txt 歐巴馬就職演說-2013.txt 
##               0.8318437               1.0000000
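
With only two documents the two apply calls above already cover every pair, but the full pairwise similarity matrix can also be computed in one step. A compact sketch (norms and docs.cos.mat are new helper names, not from the original script):

# cosine similarity between every pair of documents
norms <- sqrt(colSums(docs.tfidf^2))
docs.cos.mat <- crossprod(docs.tfidf) / tcrossprod(norms)
docs.cos.mat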

Word Cloud

library(wordcloud)
## Loading required package: RColorBrewer
f <- sort(rowSums(docs.tfidf), decreasing = T)
docs.df <- data.frame(
  word = names(f),
  freq = f
)
wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
          min.freq = 180, max.words = 40, colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : the could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : our could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : and could not be fit on page. It will not be plotted.
## Warning in wordcloud(docs.df$word, docs.df$freq, scale = c(20, 0.1),
## min.freq = 180, : that could not be fit on page. It will not be plotted.
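
The warnings come from scale = c(20, 0.1): the highest-weighted words are drawn so large that they cannot fit on the device. A hypothetical re-run with a more modest scale (values chosen by eye, not from the original analysis) should let them plot:

# same data, smaller maximum word size so the top terms fit on the page
wordcloud(docs.df$word, docs.df$freq, scale = c(4, 0.5),
          min.freq = 0, max.words = 40, colors = brewer.pal(8, "Dark2"))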

PCA

docs.pca <- prcomp(docs.tfidf, scale = T)

Plots

fviz_eig(docs.pca)

fviz_pca_ind(docs.pca, geom.ind = c("point"), col.ind = "cos2")

fviz_pca_var(docs.pca, col.var = "contrib")

fviz_pca_biplot(docs.pca, geom.ind = "point")

PCA results

docs.eig <- get_eig(docs.pca)
docs.var <- get_pca_var(docs.pca)
docs.ind <- get_pca_ind(docs.pca)
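
get_eig should give, for each principal component, its eigenvalue and the (cumulative) percentage of variance it explains; with only two documents as variables there are at most two components. A quick look, as a sketch with no output reproduced here:

# eigenvalues and percentage of variance explained per component
docs.eig
# coordinates of the first few terms (the PCA individuals) on those components
head(docs.ind$coord)
# contribution of each document (the PCA variables) to the components
docs.var$contrib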

K-means

Choosing K

ind.coord2 <- docs.ind$coord[, 1:2]
# elbow method: total within-cluster sum of squares for k = 1..10
wss <- c()
for (i in 1:10) { wss[i] <- kmeans(ind.coord2, i)$tot.withinss }
plot(wss, type = "b")
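
factoextra, which is already loaded, draws the same elbow plot in one call; a minimal sketch:

# within-cluster sum of squares for k = 1..10, same elbow criterion as above
fviz_nbclust(ind.coord2, kmeans, method = "wss", k.max = 10)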

Clustering

km <- kmeans(ind.coord2, 3)
plot(ind.coord2, col = km$cluster)
# mark the three cluster centres
points(km$centers, col = 1:3, pch = 8, cex = 2)
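
The same clustering can also be drawn with factoextra for a ggplot-based view of the km object; a minimal sketch:

# clusters of terms on the first two principal components
fviz_cluster(km, data = ind.coord2, geom = "point")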