EUC-KR의 text file 읽기

# Collect the EUC-KR encoded text files (names ending in "e.txt") in `path`.
path <- "."
# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor the match at the end of the file name.
# full.names = TRUE prepends `path` to each name and returns an empty vector
# when nothing matches (paste() would have produced the bogus path "./").
story <- list.files(path = path, pattern = "e\\.txt$", full.names = TRUE)
story

#' Read every line of an EUC-KR encoded text file.
#'
#' @param filename Path of the file to read.
#' @return A character vector with one element per line of the file.
myReadLines <- function(filename) {
    # Open the connection explicitly and register its cleanup, so it is
    # closed even when reading fails part-way through.
    con <- file(filename, open = "rt", encoding = "euc-kr")
    on.exit(close(con), add = TRUE)
    readLines(con)
}

# Read every file and concatenate the lines into one unnamed character vector.
# lapply() always returns a list, so the result stays a flat vector even when
# all files have the same number of lines (sapply() would return a matrix).
story2 <- unlist(lapply(story, myReadLines), use.names = FALSE)

Warning message in readLines(file(filename, encoding = "euc-kr")):
“invalid input found on input connection './1204-e.txt'”
Warning message in readLines(file(filename, encoding = "euc-kr")):
“incomplete final line found on './1204-e.txt'”

# Preview the first six lines that were read.
head(story2, n = 6L)

UTF8의 text file 읽기

# Collect the UTF-8 encoded text files (names ending in "u.txt") in `path`.
path <- "."
# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor the match at the end of the file name.
# full.names = TRUE prepends `path` to each name and returns an empty vector
# when nothing matches (paste() would have produced the bogus path "./").
story <- list.files(path = path, pattern = "u\\.txt$", full.names = TRUE)
story

# Read every file and concatenate the lines into one unnamed character vector.
# lapply() keeps the result flat even when all files have the same number of
# lines (sapply() would return a matrix). encoding = "UTF-8" marks the strings
# as UTF-8 so they are interpreted correctly in a non-UTF-8 locale
# (assumes the files really are UTF-8, as the section title says).
story2 <- unlist(lapply(story, readLines, encoding = "UTF-8"), use.names = FALSE)

head(story2)

Vector source

library(tm)
library(KoNLP)

Loading required package: NLP
Checking user defined dictionary!

# Treat every element of story2 as one document and build a corpus from it.
vs <- VectorSource(x = story2)
corp <- VCorpus(x = vs)

# Load KoNLP's Sejong dictionary before any noun extraction is done.
useSejongDic()

Backup was just finished!
370957 words dictionary was built.

#' Tokenize one document into nouns for use as a tm `tokenize` control.
#'
#' @param corp Text of a document; its elements are joined with single spaces
#'   before noun extraction.
#' @return The character vector returned by extractNoun(), with every
#'   character matched by [[:print:]] or [[:space:]] removed. A token made up
#'   only of such characters becomes "".
konlp_tokenize <- function(corp) {
    nouns <- extractNoun(paste(corp, collapse = " "))
    # A single pass is enough: [[:digit:]] and [[:graph:]] are subsets of
    # [[:print:]], so stripping them separately first changes nothing.
    # NOTE(review): with perl = TRUE these POSIX classes are expected to match
    # ASCII characters only, which is why Hangul survives -- confirm on the
    # target locale.
    gsub("[[:print:][:space:]]", "", nouns, perl = TRUE)
}

# Custom Korean stopword list passed to TermDocumentMatrix(); entries are kept
# in their original order.
stopwords <- c(
    "그", "수", "이", "영화", "있는", "더", "당시", "한",
    "위해", "아직도", "그리고", "것", "대한", "잘", "영화를", "왜",
    "있다", "영화는", "많은", "너무", "이런", "사람들이", "하는", "대해",
    "할", "영화가", "정말", "없는", "한다", "것이다", "알고", "우리가",
    "것을", "택시", "보고", "있었다", "전", "아닌", "그런", "많이",
    "얼마나", "아니라", "꼭", "것이", "하지만", "같은", "내가", "하고",
    "않고", "좀", "다", "본", "어떻게", "것은", "때문에", "된",
    "그렇게", "바로", "영화였다", "는", "살고", "큰", "있다는", "의",
    "만섭의", "듯", "통해", "보는", "이렇게", "또", "라는", "그저",
    "사람이", "합니다", "볼", "되는", "때", "자신의", "있을", "게",
    "영화의", "잊지", "한번", "있습니다", "영화다", "일이", "된다", "같다",
    "모습을", "아니다", "저는", "보면서", "위한", "없었다면", "만든", "우리의",
    "나는", "없이", "화가", "그냥", "사람들의", "수도", "18의", "만섭은",
    "알게", "를", "내", "참", "않는", "말이", "가", "그러나",
    "그래서", "하지", "누가", "없다", "것도", "18을", "안", "있었고",
    "잘살고", "모르는", "다룬", "사람들은", "장면은", "있고", "일을", "영화에서",
    "그를", "독일", "있던", "보며", "아주", "못하고", "더욱", "아니고",
    "될", "너무나", "푸른", "광주에", "광주는", "광주로", "광주에서", "광주",
    "광주의", "광주를", "18"
)

# Build the term-document matrix from the corpus, using konlp_tokenize() as
# the tokenizer, the custom stopword list, and a minimum term length of 2.
tdm <- TermDocumentMatrix(
    x = corp,
    control = list(
        tokenize = konlp_tokenize,
        stopwords = stopwords,
        wordLengths = c(2, Inf)
    )
)

# Print a summary and a sample of the matrix.
inspect(tdm)

<<TermDocumentMatrix (terms: 3069, documents: 509)>>
Non-/sparse entries: 7010/1555111
Sparsity           : 100%
Maximal term length: 18
Weighting          : term frequency (tf)
Sample             :
            Docs
Terms        143 147 155 160 161 165 219 220 248 273
  생각         1   0   0   1   0   0   1   0   0   0
  역사         1   0   1   0   1   1   0   0   0   0
  시민         3   8   2   0   3   0   2   9   2   0
  운동         2   0   0   1   0   0   0   3   1   0
  사람         3   0   0   0   2   0   1   1   0   0
  들이         2   1   1   0   0   0   2   3   2   2
  진실         0   0   0   0   0   0   0   0   0   1
  우리         0   0   1   0   0   0   0   0   0   0
  민주화       0   0   0   0   1   0   1   4   1   0
  택시운전사   2   0   1   0   0   1   0   0   0   0

# Terms that occur at least 10 times in total.
findFreqTerms(x = tdm, lowfreq = 10)
# Terms whose correlation with the given term is at least 0.5.
findAssocs(x = tdm, terms = "민주항쟁", corlimit = 0.5)

# Create the frequency table that feeds the word clouds.
m <- as.matrix(tdm)

# Total count of each term over all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Overall term frequencies for the '민주항쟁' keyword: show the top 20.
head(d, n = 20)

# Keep only the terms that occur at least 10 times.
df <- d[d$freq >= 10, ]

library(wordcloud)

Loading required package: RColorBrewer

# Static word cloud of the frequent terms: most frequent words are placed
# first, and 35% of the words are rotated.
wordcloud(
    words = df$word,
    freq = df$freq,
    min.freq = 10,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0.35,
    scale = c(4, 0.5),
    colors = brewer.pal(n = 8, name = "Set2")
)

library(wordcloud2)

# Interactive word cloud of the same frequency table.
# NOTE(review): minRotation and maxRotation are both -pi/4, so every rotated
# word gets the same angle -- confirm that maxRotation = pi/4 was not intended.
wordcloud2(
    data = df,
    size = 1.6,
    shape = "circle",
    minRotation = -pi / 4,
    maxRotation = -pi / 4,
    rotateRatio = 0.4,
    ellipticity = 0.7,
    color = "random-light",
    backgroundColor = "grey",
    figPath = NULL
)

# Dense term x document count matrix (this overwrites the tm object `tdm`).
tdm <- as.matrix(tdm)
# row.names(tdm)

# Total frequency of every term, most frequent first.
tdmrs <- sort(rowSums(tdm), decreasing = TRUE)
head(tdmrs, 20)
tdmrs <- as.matrix(tdmrs)
# row.names(tdmrs)

# Frequencies of the 27 most frequent terms: as a named vector (tdmrs2) and
# as a one-column matrix with the terms as row names (tdmrs3).
tdmrs2 <- tdmrs[1:27, ]
tdmrs3 <- as.matrix(tdmrs2)

# Term-term co-occurrence matrix for those 27 terms: entry [i, j] is the
# number of documents that contain both term i and term j.
# The previous tdmrs3 %*% t(tdmrs3) was the outer product of the total
# frequencies, which links every pair of terms whether or not they ever
# appear together, so the resulting graph was always complete.
top_docs <- tdm[rownames(tdmrs3), , drop = FALSE]
top_docs[top_docs >= 1] <- 1  # presence/absence per document
termMatrix <- top_docs %*% t(top_docs)

library(igraph)

# Build an undirected, weighted graph from the term-term matrix. The graph is
# built once with graph_from_adjacency_matrix(); the deprecated
# graph.adjacency() call produced the same object and was overwritten at once.
g <- graph_from_adjacency_matrix(termMatrix, weighted = TRUE, mode = "undirected")
# Remove self-loops (the matrix diagonal) and multiple edges.
g <- simplify(g)

Attaching package: ‘igraph’

The following objects are masked from ‘package:stats’:

    decompose, spectrum

The following object is masked from ‘package:base’:

    union

# Vertex attributes for the first network plot. Sizes are derived from the
# actual vertex count instead of a hard-coded 27, so the attribute vectors
# always match the graph.
n_terms <- vcount(g)
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
V(g)$label.cex <- seq(0.8, 0.2, length.out = n_terms)
V(g)$label.font <- 2
V(g)$size <- seq(30, 2, length.out = n_terms)

# Fixed seed so the force-directed layout is reproducible.
# layout_with_fr() is the current name of layout.fruchterman.reingold().
set.seed(3952)
layout1 <- layout_with_fr(g)

# Edge widths: log-scaled weights, normalised so the widest edge is 1.
edge_strength <- log(E(g)$weight) + 1.5
egam <- edge_strength / max(edge_strength)
plot(g, edge.width = egam,
     edge.arrow.size = 0.1, layout = layout1)

# Make it look better: scale label size by vertex degree, drop the vertex
# frames and use a translucent dark-blue label colour.
V(g)$label.cex <- 0.3 * V(g)$degree / max(V(g)$degree) + 0.3
V(g)$label.color <- rgb(0, 0, 0.2, 0.8)
# Use the actual vertex count instead of a hard-coded 27 so the size vector
# always matches the graph.
V(g)$size <- seq(25, 1, length.out = vcount(g))
V(g)$frame.color <- NA
V(g)$label.font <- 2

# Edge widths: log-scaled weights, normalised so the widest edge is 1.
edge_strength <- log(E(g)$weight) + 1.5
egam <- edge_strength / max(edge_strength)

plot(g, edge.width = egam,
     edge.arrow.size = 0.1, layout = layout1)

	word	freq
민주화	민주화	138
역사	역사	101
시민	시민	99
운동	운동	98
사람	사람	94
택시운전사	택시운전사	73
들이	들이	70
진실	진실	68
생각	생각	66
우리	우리	61
하게	하게	60
폭동	폭동	43
국민	국민	41
이야기	이야기	40
사실	사실	37
장면	장면	37
광주시민	광주시민	35
사람들	사람들	34
빨갱이	빨갱이	34
민국	민국	33