
#http://tidytextmining.com/

#####Relationships between words: n-grams and correlations

# 투빅스 (ToBigs) 4th cohort, 연다인


# So far, our analyses have mostly worked with individual words, e.g. for sentiment analysis and frequency analysis.

# In this seminar we analyze relationships between consecutive words,

# exploring how words relate to each other based on how often word X appears together with word Y.


# An n-gram with n = 2 is commonly called a bigram.

# n = 2 means we look at two consecutive words at a time.

# n = 1: unigram, n = 2: bigram, n = 3: trigram
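
# A quick illustrative sketch (my own toy sentence, not from the book): the same
# text tokenized as unigrams, bigrams, and trigrams with unnest_tokens().
library(dplyr)
library(tidytext)

example <- data.frame(text = "it is a truth universally acknowledged",
                      stringsAsFactors = FALSE)

example %>% unnest_tokens(word, text)                              # n = 1, unigrams
example %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)   # n = 2, bigrams
example %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)  # n = 3, trigrams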


#4.1 Tokenizing by n-gram

#Counting and filtering n-grams

library(dplyr)

library(tidytext)

library(janeaustenr)


austen_bigrams <- austen_books() %>%

  unnest_tokens(bigram, text, token = "ngrams", n = 2)

austen_bigrams

# A tibble: 725,048 × 2

                  book          bigram

                <fctr>           <chr>

1  Sense & Sensibility       sense and

2  Sense & Sensibility and sensibility

3  Sense & Sensibility  sensibility by

4  Sense & Sensibility         by jane

5  Sense & Sensibility     jane austen

6  Sense & Sensibility     austen 1811

7  Sense & Sensibility    1811 chapter

8  Sense & Sensibility       chapter 1

9  Sense & Sensibility           1 the

10 Sense & Sensibility      the family

# ... with 725,038 more rows


austen_bigrams %>%

  count(bigram, sort = TRUE)

# A tibble: 211,237 × 2

     bigram     n

      <chr> <int>

1    of the  3017

2     to be  2787

3    in the  2368

4    it was  1781

5      i am  1545

6   she had  1472

7    of her  1445

8    to the  1387

9   she was  1377

10 had been  1299

# ... with 211,227 more rows

# to remove stop words, first split each bigram into two columns (tidyr::separate)

library(tidyr)


bigrams_separated <- austen_bigrams %>%

  separate(bigram, c("word1", "word2"), sep = " ")


bigrams_filtered <- bigrams_separated %>%

  filter(!word1 %in% stop_words$word) %>%

  filter(!word2 %in% stop_words$word)


# new bigram counts:

bigram_counts <- bigrams_filtered %>% 

  count(word1, word2, sort = TRUE)

bigram_counts # mostly the names of the main characters

Source: local data frame [33,421 x 3]

Groups: word1 [6,711]


     word1     word2     n

     <chr>     <chr> <int>

1      sir    thomas   287

2     miss  crawford   215

3  captain wentworth   170

4     miss woodhouse   162

5    frank churchill   132

6     lady   russell   118

7     lady   bertram   114

8      sir    walter   113

9     miss   fairfax   109

10 colonel   brandon   108

# ... with 33,411 more rows

bigrams_united <- bigrams_filtered %>%

  unite(bigram, word1, word2, sep = " ")

bigrams_united # the bigrams with stop words removed

# A tibble: 44,784 × 2

                  book                   bigram

*               <fctr>                    <chr>

1  Sense & Sensibility              jane austen

2  Sense & Sensibility              austen 1811

3  Sense & Sensibility             1811 chapter

4  Sense & Sensibility                chapter 1

5  Sense & Sensibility             norland park

6  Sense & Sensibility surrounding acquaintance

7  Sense & Sensibility               late owner

8  Sense & Sensibility             advanced age

9  Sense & Sensibility       constant companion

10 Sense & Sensibility             happened ten

# ... with 44,774 more rows

#trigram

austen_books() %>%

  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%

  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%

  filter(!word1 %in% stop_words$word,

         !word2 %in% stop_words$word,

         !word3 %in% stop_words$word) %>%

  count(word1, word2, word3, sort = TRUE)

Source: local data frame [8,757 x 4]

Groups: word1, word2 [7,462]


       word1     word2     word3     n

       <chr>     <chr>     <chr> <int>

1       dear      miss woodhouse    23

2       miss        de    bourgh    18

3       lady catherine        de    14

4  catherine        de    bourgh    13

5       poor      miss    taylor    11

6        sir    walter    elliot    11

7        ten  thousand    pounds    11

8       dear       sir    thomas    10

9     twenty  thousand    pounds     8

10   replied      miss  crawford     7

# ... with 8,747 more rows




#Analyzing bigrams

#most common streets

bigrams_filtered %>%

  filter(word2 == "street") %>%

  count(book, word1, sort = TRUE)

Source: local data frame [34 x 3]
Groups: book [6]

                  book       word1     n
                <fctr>       <chr> <int>
1  Sense & Sensibility    berkeley    16
2  Sense & Sensibility      harley    16
3     Northanger Abbey    pulteney    14
4     Northanger Abbey      milsom    11
5       Mansfield Park     wimpole    10
6    Pride & Prejudice gracechurch     9
7  Sense & Sensibility     conduit     6
8  Sense & Sensibility        bond     5
9           Persuasion      milsom     5
10          Persuasion      rivers     4
# ... with 24 more rows

bigram_tf_idf <- bigrams_united %>%

  count(book, bigram) %>%

  bind_tf_idf(bigram, book, n) %>%

  arrange(desc(tf_idf))

bigram_tf_idf # tf-idf analysis with bigrams as the terms

Source: local data frame [36,217 x 6]

Groups: book [6]


                  book            bigram     n         tf

                <fctr>             <chr> <int>      <dbl>

1           Persuasion captain wentworth   170 0.02985599

2       Mansfield Park        sir thomas   287 0.02873160

3       Mansfield Park     miss crawford   215 0.02152368

4           Persuasion      lady russell   118 0.02072357

5           Persuasion        sir walter   113 0.01984545

6                 Emma    miss woodhouse   162 0.01700966

7     Northanger Abbey       miss tilney    82 0.01594400

8  Sense & Sensibility   colonel brandon   108 0.01502086

9                 Emma   frank churchill   132 0.01385972

10   Pride & Prejudice    lady catherine   100 0.01380453

# ... with 36,207 more rows, and 2 more variables: idf <dbl>,

#   tf_idf <dbl>

library(ggplot2)

bigram_tf_idf %>%

  arrange(desc(tf_idf)) %>%

  group_by(book) %>%

  top_n(12, tf_idf) %>%

  ungroup() %>%

  mutate(bigram = reorder(bigram, tf_idf)) %>%

  ggplot(aes(bigram, tf_idf, fill = book)) +

  geom_col(show.legend = FALSE) +

  facet_wrap(~ book, ncol = 2, scales = "free") +

  coord_flip() +

  labs(y = "tf-idf of bigram to novel", x = "") 

# Besides names, we can now see verb-name combinations such as "replied elinor" and

# "cried marianne" in Sense & Sensibility, or "cried emma" in Emma,

# combinations that never showed up in the earlier word-level tf-idf analysis.

# (plot: bigram_tf_idf.png)


#sentiment analysis by using bigrams

bigrams_separated %>%

  filter(word1 == "not") %>%

  count(word1, word2, sort = TRUE) # bigrams that start with "not"

Source: local data frame [1,246 x 3]

Groups: word1 [1]


   word1 word2     n

   <chr> <chr> <int>

1    not    be   610

2    not    to   355

3    not  have   327

4    not  know   252

5    not     a   189

6    not think   176

7    not  been   160

8    not   the   147

9    not    at   129

10   not    in   118

# ... with 1,236 more rows

AFINN <- get_sentiments("afinn")

AFINN # each word gets a sentiment score (+: positive, -: negative)

# A tibble: 2,476 × 2
         word score
        <chr> <int>
1     abandon    -2
2   abandoned    -2
3    abandons    -2
4    abducted    -2
5   abduction    -2
6  abductions    -2
7       abhor    -3
8    abhorred    -3
9   abhorrent    -3
10     abhors    -3
# ... with 2,466 more rows

not_words <- bigrams_separated %>%

  filter(word1 == "not") %>%

  inner_join(AFINN, by = c(word2 = "word")) %>%

  count(word2, score, sort = TRUE) %>%

  ungroup()

not_words

# A tibble: 245 × 3

     word2 score     n

     <chr> <int> <int>

1     like     2    99

2     help     2    82

3     want     1    45

4     wish     1    39

5    allow     1    36

6     care     2    23

7    sorry    -1    21

8    leave    -1    18

9  pretend    -1    18

10   worth     2    17

# ... with 235 more rows

not_words %>%

  mutate(contribution = n * score) %>%

  arrange(desc(abs(contribution))) %>%

  head(20) %>%

  mutate(word2 = reorder(word2, contribution)) %>%

  ggplot(aes(word2, n * score, fill = n * score > 0)) +

  geom_col(show.legend = FALSE) +

  xlab("Words preceded by \"not\"") +

  ylab("Sentiment score * number of occurrences") +

  coord_flip()

# For bigrams such as "not like" and "not help",

# the actual meaning is negative, but the individual words look positive, so the scores are easy to misread.

# Likewise, for "not afraid" and "not fail",

# the actual meaning is positive, but the words afraid and fail make them look negative.

# (plot: not_words.png)

negation_words <- c("not", "no", "never", "without")


negated_words <- bigrams_separated %>%

  filter(word1 %in% negation_words) %>%

  inner_join(AFINN, by = c(word2 = "word")) %>%

  count(word1, word2, score, sort = TRUE) %>%

  ungroup()


negated_words %>%

  mutate(contribution = n * score,

         abscontribution = abs(contribution),

         word2 = reorder(paste(word2, word1, sep = "__"), contribution)) %>%

  group_by(word1) %>%

  top_n(12, abscontribution) %>%

  ggplot(aes(word2, contribution, fill = n * score > 0)) +

  geom_col(show.legend = FALSE) +

  facet_wrap(~ word1, scales = "free") +

  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +

  xlab("Words preceded by negation term") +

  ylab("Sentiment score * # of occurrences") +

  coord_flip()

# (plot: negation_words.png)

#Visualizing a network of bigrams with ggraph

library(igraph)


# original counts

bigram_counts

Source: local data frame [33,421 x 3]

Groups: word1 [6,711]


     word1     word2     n

     <chr>     <chr> <int>

1      sir    thomas   287

2     miss  crawford   215

3  captain wentworth   170

4     miss woodhouse   162

5    frank churchill   132

6     lady   russell   118

7     lady   bertram   114

8      sir    walter   113

9     miss   fairfax   109

10 colonel   brandon   108

# ... with 33,411 more rows

# filter for only relatively common combinations

bigram_graph <- bigram_counts %>%

  filter(n > 20) %>%

  graph_from_data_frame()

bigram_graph

IGRAPH DN-- 91 77 -- 

+ attr: name (v/c), n (e/n)

+ edges (vertex names):

 [1] sir      ->thomas     miss     ->crawford   captain  ->wentworth  miss     ->woodhouse  frank    ->churchill 

 [6] lady     ->russell    lady     ->bertram    sir      ->walter     miss     ->fairfax    colonel  ->brandon   

[11] miss     ->bates      lady     ->catherine  sir      ->john       jane     ->fairfax    miss     ->tilney    

[16] lady     ->middleton  miss     ->bingley    thousand ->pounds     miss     ->dashwood   miss     ->bennet    

[21] john     ->knightley  miss     ->morland    captain  ->benwick    dear     ->miss       miss     ->smith     

[26] miss     ->crawford's henry    ->crawford   miss     ->elliot     dr       ->grant      miss     ->bertram   

[31] sir      ->thomas's   ten      ->minutes    miss     ->price      miss     ->taylor     sir      ->william   

[36] john     ->dashwood   de       ->bourgh     dear     ->sir        dear     ->fanny      miss     ->darcy     

+ ... omitted several edges

library(ggraph)

set.seed(2017)


ggraph(bigram_graph, layout = "fr") +

  geom_edge_link() +

  geom_node_point() +

  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Titles used together with names, such as "miss", "lady", "sir", and "colonel", sit at the center of the network

# Common phrases such as "half hour", "thousand pounds", and "short time/pause" sit on the outer edges of the network

# (plot: bigram_graph.png)

set.seed(2016)


a <- grid::arrow(type = "closed", length = unit(.15, "inches"))


ggraph(bigram_graph, layout = "fr") +

  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,

                 arrow = a, end_cap = circle(.07, 'inches')) +

  geom_node_point(color = "lightblue", size = 5) +

  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +

  theme_void()

# (plot: bigram_graph2.png)

 

#Visualizing bigrams in other texts

library(dplyr)

library(tidyr)

library(tidytext)

library(ggplot2)

library(igraph)

library(ggraph)


count_bigrams <- function(dataset) {

  dataset %>%

    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%

    separate(bigram, c("word1", "word2"), sep = " ") %>%

    filter(!word1 %in% stop_words$word,

           !word2 %in% stop_words$word) %>%

    count(word1, word2, sort = TRUE)

}


visualize_bigrams <- function(bigrams) {

  set.seed(2016)

  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

  

  bigrams %>%

    graph_from_data_frame() %>%

    ggraph(layout = "fr") +

    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +

    geom_node_point(color = "lightblue", size = 5) +

    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +

    theme_void()

}


# the King James version is book 10 on Project Gutenberg:
#install.packages("gutenbergr")
library(gutenbergr)
kjv <- gutenberg_download(10)

library(stringr)

kjv_bigrams <- kjv %>%
  count_bigrams()

# filter out rare combinations, as well as digits
kjv_bigrams %>%
  filter(n > 40,
         !str_detect(word1, "\\d"),
         !str_detect(word2, "\\d")) %>%
  visualize_bigrams()
# Something like a draft or "blueprint" of the Bible's language?
# thy (your) and thou (you) sit at the center of the network

# (plot: gutenberg.png)

#4.2 Counting and correlating pairs of words with the widyr package
# Beyond adjacent words, we can also look at relationships between words that co-occur within a particular document or chapter, even when they are not adjacent
#Counting and correlating among sections
austen_section_words <- austen_books() %>%
  filter(book == "Pride & Prejudice") %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
austen_section_words
# Divide Pride & Prejudice into 10-line sections
# and look at which words appear in each section
# (a small sketch of how row_number() %/% 10 forms the sections follows the output below)
# A tibble: 37,240 × 3
                book section         word
              <fctr>   <dbl>        <chr>
1  Pride & Prejudice       1        truth
2  Pride & Prejudice       1  universally
3  Pride & Prejudice       1 acknowledged
4  Pride & Prejudice       1       single
5  Pride & Prejudice       1   possession
6  Pride & Prejudice       1      fortune
7  Pride & Prejudice       1         wife
8  Pride & Prejudice       1     feelings
9  Pride & Prejudice       1        views
10 Pride & Prejudice       1     entering
# ... with 37,230 more rows
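
# A quick toy illustration (my own example, not from the book) of how
# row_number() %/% 10 assigns rows to sections: integer division groups every
# ten consecutive lines together.
data.frame(line = 1:25) %>%
  mutate(section = row_number() %/% 10)
# lines 1-9   -> section 0 (dropped above by filter(section > 0))
# lines 10-19 -> section 1, lines 20-25 -> section 2, and so on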

library(devtools)

install_github("dgrtwo/widyr")

library(widyr)


# count words co-occurring within sections

word_pairs <- austen_section_words %>%

  pairwise_count(word, section, sort = TRUE)

word_pairs

# pairwise_count(): counts, for each pair of words, how often they appear together (here, within a section)

# The pair that appears together most often, darcy and elizabeth, are the names of the two protagonists

# A tibble: 796,008 × 3

       item1     item2     n

       <chr>     <chr> <dbl>

1      darcy elizabeth   144

2  elizabeth     darcy   144

3       miss elizabeth   110

4  elizabeth      miss   110

5  elizabeth      jane   106

6       jane elizabeth   106

7       miss     darcy    92

8      darcy      miss    92

9  elizabeth   bingley    91

10   bingley elizabeth    91

# ... with 795,998 more rows

word_pairs %>%

  filter(item1 == "darcy")

# words that frequently appear together with darcy

# A tibble: 2,930 × 3
   item1     item2     n
   <chr>     <chr> <dbl>
1  darcy elizabeth   144
2  darcy      miss    92
3  darcy   bingley    86
4  darcy      jane    46
5  darcy    bennet    45
6  darcy    sister    45
7  darcy      time    41
8  darcy      lady    38
9  darcy    friend    37
10 darcy   wickham    37
# ... with 2,920 more rows

#Pairwise correlation

# Above, elizabeth and darcy appear together more often than any other pair,

# but they are also the most frequent words individually,

# so their co-occurrence by itself is not particularly meaningful.

# Instead, let's look at the correlation between words (the phi coefficient)
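
# A small hand-computed sketch of the phi coefficient for one illustrative pair
# ("elizabeth" / "darcy"); the object names below (n_sections, pair, n11, ...) are
# my own for this illustration, not from the book. pairwise_cor() further down does
# the same for every pair of words at once.
# phi = (n11*n00 - n10*n01) / sqrt((n11+n10) * (n01+n00) * (n11+n01) * (n10+n00))
# where n11 = sections containing both words, n10 / n01 = sections containing only
# one of them, and n00 = sections containing neither.

n_sections <- n_distinct(austen_section_words$section)

pair <- austen_section_words %>%
  distinct(section, word) %>%
  filter(word %in% c("elizabeth", "darcy")) %>%
  mutate(present = TRUE) %>%
  spread(word, present, fill = FALSE)

n11 <- as.numeric(sum( pair$elizabeth &  pair$darcy))  # both words in the section
n10 <- as.numeric(sum( pair$elizabeth & !pair$darcy))  # only "elizabeth"
n01 <- as.numeric(sum(!pair$elizabeth &  pair$darcy))  # only "darcy"
n00 <- n_sections - n11 - n10 - n01                    # neither word

(n11 * n00 - n10 * n01) /
  sqrt((n11 + n10) * (n01 + n00) * (n11 + n01) * (n10 + n00))
# this should agree with the elizabeth/darcy correlation from pairwise_cor() below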

# we need to filter for at least relatively common words first

word_cors <- austen_section_words %>%

  group_by(word) %>%

  filter(n() >= 20) %>%

  pairwise_cor(word, section, sort = TRUE)

word_cors

# pairwise_cor(): computes the phi coefficient for each pair of words

# A tibble: 154,842 × 3

       item1     item2 correlation

       <chr>     <chr>       <dbl>

1     bourgh        de   0.9508501

2         de    bourgh   0.9508501

3     pounds  thousand   0.7005808

4   thousand    pounds   0.7005808

5    william       sir   0.6644719

6        sir   william   0.6644719

7  catherine      lady   0.6633048

8       lady catherine   0.6633048

9    forster   colonel   0.6220950

10   colonel   forster   0.6220950

# ... with 154,832 more rows

# Which words are most strongly correlated with "pounds"?

word_cors %>%

  filter(item1 == "pounds")

# A tibble: 393 × 3
    item1     item2 correlation
    <chr>     <chr>       <dbl>
1  pounds  thousand  0.70058081
2  pounds       ten  0.23057580
3  pounds   fortune  0.16386264
4  pounds   settled  0.14946049
5  pounds wickham's  0.14152401
6  pounds  children  0.12900011
7  pounds  mother's  0.11905928
8  pounds  believed  0.09321518
9  pounds    estate  0.08896876
10 pounds     ready  0.08597038
# ... with 383 more rows

word_cors %>%

  filter(item1 %in% c("elizabeth", "pounds", "married", "pride")) %>%

  group_by(item1) %>%

  top_n(6) %>%

  ungroup() %>%

  mutate(item2 = reorder(item2, correlation)) %>%

  ggplot(aes(item2, correlation)) +

  geom_bar(stat = "identity") +

  facet_wrap(~ item1, scales = "free") +

  coord_flip()

# (plot: word_cors.png)

# The correlations can be visualized as a network in the same way as the bigrams above

set.seed(2016)


word_cors %>%

  filter(correlation > .15) %>%

  graph_from_data_frame() %>%

  ggraph(layout = "fr") +

  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +

  geom_node_point(color = "lightblue", size = 5) +

  geom_node_text(aes(label = name), repel = TRUE) +

  theme_void()

# (plot: word_cors2.png)

