# Collect the EUC-KR encoded input files: every file directly under `path`
# whose name ends in "e.txt".
path <- "."
# `pattern` is a regular expression, not a shell glob, so the glob is
# translated explicitly (anchored, with a literal dot). `full.names = TRUE`
# prepends `path` and, unlike paste(path, files, sep = "/"), returns an empty
# vector rather than "./" when nothing matches.
story <- list.files(
  path = path,
  pattern = glob2rx("*e.txt"),
  full.names = TRUE
)
story
# Read all lines of an EUC-KR encoded text file.
#
# filename: path of the file to read.
# Returns a character vector with one element per line.
myReadLines <- function(filename) {
  con <- file(filename, encoding = "euc-kr")
  # readLines() opens and closes an unopened connection but does not destroy
  # it; close() is required to release the connection slot.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
# Read every EUC-KR file and flatten all lines into one unnamed character
# vector. lapply() always returns a list, whereas sapply() simplifies to a
# matrix when every file happens to have the same number of lines.
story2 <- unlist(lapply(story, myReadLines), use.names = FALSE)
head(story2)
# Collect and read the UTF-8 encoded input files (names ending in "u.txt").
# These assignments replace any earlier values of `story` and `story2`.
path <- "."
# `pattern` is a regular expression, so the glob is translated explicitly.
# `full.names = TRUE` yields an empty vector (not "./") when nothing matches.
story <- list.files(
  path = path,
  pattern = glob2rx("*u.txt"),
  full.names = TRUE
)
story
# lapply() keeps the result shape stable (sapply() may return a matrix).
# `encoding = "UTF-8"` marks the lines as UTF-8 so they are not misread in a
# non-UTF-8 locale. as.character() turns the "no files" result (NULL) into an
# empty character vector.
story2 <- as.character(unlist(
  lapply(story, readLines, encoding = "UTF-8"),
  use.names = FALSE
))
head(story2)
library(tm)
library(KoNLP)
# Load the Sejong dictionary used by KoNLP, then wrap the text lines in a
# volatile tm corpus built from a vector source over `story2`.
useSejongDic()
vs <- VectorSource(x = story2)
corp <- VCorpus(x = vs)
# Tokenizer passed to tm: extract nouns with KoNLP and strip characters
# matched by the POSIX print/space classes from each noun.
#
# corp: the text to tokenize; its elements are joined with single spaces
#       before noun extraction.
# Returns a character vector of the tokens that are still non-empty after
# stripping.
konlp_tokenize <- function(corp) {
  nouns <- extractNoun(paste(corp, collapse = " "))
  # A single pass replaces the four sequential gsub() calls: [:digit:] and
  # [:graph:] are subsets of [:print:], so only [:print:] and [:space:]
  # contribute. With perl = TRUE these classes match ASCII only by default
  # (see ?regex), which presumably is what keeps the Hangul text intact.
  nouns <- gsub("[[:print:][:space:]]", "", nouns, perl = TRUE)
  # Tokens made up solely of stripped characters are now "", so drop them.
  # The result is returned visibly instead of via a trailing assignment.
  nouns[nzchar(nouns)]
}
# Custom stop-word list handed to TermDocumentMatrix() through its control
# list; terms equal to any of these strings are excluded from the matrix.
stopwords <- c(
  "그", "수", "이", "영화", "있는", "더", "당시", "한", "위해", "아직도",
  "그리고", "것", "대한",
  "잘", "영화를", "왜", "있다", "영화는", "많은", "너무", "이런", "사람들이",
  "하는", "대해", "할",
  "영화가", "정말", "없는", "한다", "것이다", "알고", "우리가", "것을", "택시",
  "보고", "있었다",
  "전", "아닌", "그런", "많이", "얼마나", "아니라", "꼭", "것이", "하지만",
  "같은", "내가", "하고",
  "않고", "좀", "다", "본", "어떻게", "것은", "때문에", "된", "그렇게", "바로",
  "영화였다", "는",
  "살고", "큰", "있다는", "의", "만섭의", "듯", "통해", "보는", "이렇게", "또",
  "라는", "그저",
  "사람이", "합니다", "볼", "되는", "때", "자신의", "있을", "게", "영화의",
  "잊지", "한번", "있습니다",
  "영화다", "일이", "된다", "같다", "모습을", "아니다", "저는", "보면서", "위한",
  "없었다면", "만든",
  "우리의", "나는", "없이", "화가", "그냥", "사람들의", "수도", "18의", "만섭은",
  "알게", "를", "내", "참",
  "않는", "말이", "가", "그러나", "그래서", "하지", "누가", "없다", "것도",
  "18을", "안", "있었고",
  "잘살고", "모르는", "다룬", "사람들은", "장면은", "있고", "일을", "영화에서",
  "그를", "독일", "있던",
  "보며", "아주", "못하고", "더욱", "아니고", "될", "너무나", "푸른", "광주에",
  "광주는", "광주로",
  "광주에서", "광주", "광주의", "광주를", "18"
)
# Build the term-document matrix from the corpus: tokenize with
# konlp_tokenize, remove the custom stop words, and restrict term length to
# the range given by `wordLengths` (minimum 2, no upper bound).
tdm <- TermDocumentMatrix(
  x = corp,
  control = list(
    tokenize = konlp_tokenize,
    stopwords = stopwords,
    wordLengths = c(2, Inf)
  )
)
inspect(x = tdm)
# Terms with a frequency of at least 10.
findFreqTerms(x = tdm, lowfreq = 10)
# Terms associated with the query term at a correlation limit of 0.5.
findAssocs(x = tdm, terms = "민주항쟁", corlimit = 0.5)
# Create a word cloud.
# Total frequency of every term (row sums of the dense matrix), sorted from
# most to least frequent, as a word/freq data frame.
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Overall term frequencies for the "democratic uprising" keyword data:
# show the top 20.
head(d, n = 20)
# Keep only the terms with a frequency of at least 10.
df <- d[d$freq >= 10, ]
library(wordcloud)
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 10,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  scale = c(4, 0.5),
  colors = brewer.pal(8, "Set2")
)
# Draw the same frequency table as an HTML widget word cloud.
library(wordcloud2)
# NOTE(review): minRotation and maxRotation are both -pi/4, so the rotation
# range is a single angle - confirm this is intended.
wordcloud2(
  data = df,
  size = 1.6,
  shape = "circle",
  minRotation = -pi/4,
  maxRotation = -pi/4,
  rotateRatio = 0.4,
  ellipticity = .7,
  color = "random-light",
  backgroundColor = "grey",
  figPath = NULL
)
# Build a term-by-term co-occurrence matrix for the most frequent terms.
tdm <- as.matrix(tdm)
# row.names(tdm)
tdmrs <- sort(rowSums(tdm), decreasing = TRUE)
head(tdmrs, 20)
tdmrs <- as.matrix(tdmrs)
# row.names(tdmrs)
# Keep at most the 27 most frequent terms. Capping the index at nrow() avoids
# a "subscript out of bounds" error when the corpus has fewer than 27 terms.
tdmrs2 <- tdmrs[seq_len(min(27L, nrow(tdmrs))), ]
tdmrs3 <- as.matrix(tdmrs2)
# Count, for every pair of top terms, the documents that contain both.
# The previous tdmrs3 %*% t(tdmrs3) was the outer product of the frequency
# totals: every pair got weight freq_i * freq_j regardless of whether the two
# terms ever appeared together, so the resulting graph was always complete.
termMatrix <- local({
  # 0/1 presence of each top term in each document (rows keep term names).
  presence <- (tdm[rownames(tdmrs3), , drop = FALSE] > 0) * 1
  presence %*% t(presence)
})
library(igraph)
# Build an undirected, weighted term graph from the term matrix.
# The deprecated graph.adjacency() call was dropped: its result was
# immediately overwritten by this equivalent call.
g <- graph_from_adjacency_matrix(termMatrix, weighted = TRUE, mode = "undirected")
# Drop self-loops and multi-edges.
g <- simplify(g)
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Scale label size and vertex size from first to last vertex. vcount(g)
# replaces the hard-coded 27 so the vectors always match the vertex count.
V(g)$label.cex <- seq(0.8, 0.2, length.out = vcount(g))
V(g)$label.font <- 2
V(g)$size <- seq(30, 2, length.out = vcount(g))
# Fixed seed so the force-directed layout is reproducible.
set.seed(3952)
# layout_with_fr() is the current name of layout.fruchterman.reingold().
layout1 <- layout_with_fr(g)
# Edge widths: log-scaled weights, shifted by 1.5 and normalised to max 1.
egam <- (log(E(g)$weight) + 1.5) / max(log(E(g)$weight) + 1.5)
plot(g,
  edge.width = egam,
  edge.arrow.size = 0.1,
  layout = layout1
)
# Restyle the graph and plot it again with the same layout.
# Label size now grows with vertex degree (range 0.3 to 0.6).
V(g)$label.cex <- 0.3 * V(g)$degree / max(V(g)$degree) + 0.3
V(g)$label.color <- rgb(0, 0, .2, .8)
# vcount(g) replaces the hard-coded 27 so the size vector always matches the
# number of vertices.
V(g)$size <- seq(25, 1, length.out = vcount(g))
V(g)$frame.color <- NA
V(g)$label.font <- 2
# Edge widths: log-scaled weights, shifted by 1.5 and normalised to max 1.
egam <- (log(E(g)$weight) + 1.5) / max(log(E(g)$weight) + 1.5)
plot(g,
  edge.width = egam,
  edge.arrow.size = 0.1,
  layout = layout1
)