task2

source('pttTestFunction.R')

## # tmcn Version: 0.2-12

#https://www.ptt.cc/bbs/marvel/index.html
# Index pages of the PTT "marvel" board to fetch, the URL of each page, and the
# text file each page is written to (named after the page number).
id <- 2170:2175
URL <- paste0("https://www.ptt.cc/bbs/marvel/index", id, ".html")
filename <- paste0(id, ".txt")
# Call pttTestFunction() (sourced from pttTestFunction.R) once per
# URL/filename pair. The first page is not fetched separately beforehand, so
# it is requested only once instead of twice.
mapply(pttTestFunction,
       URL = URL, filename = filename)

## $`https://www.ptt.cc/bbs/marvel/index2170.html`
## NULL
## 
## $`https://www.ptt.cc/bbs/marvel/index2171.html`
## NULL
## 
## $`https://www.ptt.cc/bbs/marvel/index2172.html`
## NULL
## 
## $`https://www.ptt.cc/bbs/marvel/index2173.html`
## NULL
## 
## $`https://www.ptt.cc/bbs/marvel/index2174.html`
## NULL
## 
## $`https://www.ptt.cc/bbs/marvel/index2175.html`
## NULL

library(jiebaR)

## Loading required package: jiebaRD

# Create a jiebaR segmentation worker with default settings, then print it so
# its configuration (method, encoding, dictionary paths) appears in the output.
cutter <- worker()
cutter

## Worker Type:  Jieba Segment
## 
## Default Method  :  mix
## Detect Encoding :  TRUE
## Default Encoding:  UTF-8
## Keep Symbols    :  FALSE
## Output Path     :  
## Write File      :  TRUE
## By Lines        :  FALSE
## Max Word Length :  20
## Max Read Lines  :  1e+05
## 
## Fixed Model Components:  
## 
## $dict
## [1] "C:/Users/tuan i  king/Documents/R/win-library/3.5/jiebaRD/dict/jieba.dict.utf8"
## 
## $user
## [1] "C:/Users/tuan i  king/Documents/R/win-library/3.5/jiebaRD/dict/user.dict.utf8"
## 
## $hmm
## [1] "C:/Users/tuan i  king/Documents/R/win-library/3.5/jiebaRD/dict/hmm_model.utf8"
## 
## $stop_word
## NULL
## 
## $user_weight
## [1] "max"
## 
## $timestamp
## [1] 1531642891
## 
## $default $detect $encoding $symbol $output $write $lines $bylines can be reset.

# Quick check of the worker: segment one sample sentence and print the tokens.
segment("他就在這裡", cutter)

## [1] "他"   "就"   "在"   "這裡"

# Register additional vocabulary on the `cutter` worker, one word per call,
# each with the tag "n". The `## [1] TRUE` lines are the recorded return
# values of the individual calls.
new_user_word(cutter,'好毛',"n")

## [1] TRUE

new_user_word(cutter,'媽佛',"n")

## [1] TRUE

new_user_word(cutter,'獵奇',"n")

## [1] TRUE

new_user_word(cutter,'敲碗',"n")

## [1] TRUE

new_user_word(cutter,'就在',"n")

## [1] TRUE

new_user_word(cutter,'自殺',"n")

## [1] TRUE

# Packages for corpus handling, segmentation and plotting.
# The workspace is deliberately not cleared at this point: wiping it with
# rm(list = ls(all.names = TRUE)) would also delete the `cutter` worker and
# the user words registered on it.
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
# `pattern` is a regular expression, not a shell glob, so match file names
# that end in ".txt" explicitly.
filenames <- list.files(getwd(), pattern = "\\.txt$")
# Read each file as a vector of lines and build one corpus entry per file.
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files))
# Remove symbols that may cause problems.
# toSpace is a tm content transformer: every match of the regular expression
# `pattern` inside `x` is replaced by a single space.
toSpace <- content_transformer(
  function(x, pattern) gsub(pattern = pattern, replacement = " ", x = x)
)
# Replace each of the following strings / regular expressions with a space,
# one tm_map() pass per pattern, in the order listed.
# In the recorded run every tm_map() call on this SimpleCorpus emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c(
    "※", "◆", "‧", "的", "我", "是", "看板", "作者", "發信站",
    "批踢踢實業坊", "[a-zA-Z]", "推", "敲碗", "了"
  ),
  docs
)

# Remove punctuation, then digits, then collapse white space.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

# Print a summary of the cleaned corpus.
docs

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 6

# Worker used to segment the corpus. The custom vocabulary is registered on
# this worker because it is the one passed to segment(); words added to any
# other worker object have no effect on the tokens produced here.
# NOTE: frequency figures recorded from runs made without this registration
# may differ from what this code now produces.
mixseg <- worker()
invisible(new_user_word(
  mixseg,
  c("好毛", "媽佛", "獵奇", "敲碗", "就在", "自殺"),
  rep("n", 6L)
))

# Tokenise one corpus element.
#   d          : one element of `docs`, as handed over by lapply(); only its
#                first component (d[[1]]) is segmented.
#   seg_worker : jiebaR worker to segment with; defaults to the global `mixseg`.
# Returns the unlisted result of segment().
jieba_tokenizer <- function(d, seg_worker = mixseg) {
  unlist(segment(d[[1]], seg_worker))
}

# One token vector per document.
seg <- lapply(docs, jieba_tokenizer)
# Count every token across all documents (columns Var1 / Freq), sort the
# table from most to least frequent, and show the top six rows as markdown.
freqFrame <- as.data.frame(table(unlist(seg)))
freqFrame <- freqFrame[with(freqFrame, order(Freq, decreasing = TRUE)), ]
library(knitr)
kable(head(freqFrame, n = 6L), format = "markdown")

|      |Var1 | Freq|
|:-----|:----|----:|
|4900  |在   | 2099|
|14994 |就   | 1575|
|3068  |他   | 1388|
|1608  |不   | 1301|
|5458  |有   | 1260|
|6250  |你   | 1234|

# Draw the word cloud from the frequency table: only tokens seen at least 50
# times, at most 150 words, 10% of words rotated, coloured with the 8-colour
# "Dark2" palette.
wordcloud(
  words = freqFrame$Var1,
  freq = freqFrame$Freq,
  scale = c(5, 0.1),
  min.freq = 50,
  max.words = 150,
  random.order = TRUE,
  random.color = FALSE,
  rot.per = .1,
  colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE,
  use.r.layout = FALSE,
  fixed.asp = TRUE
)