TF-IDF Analysis of Trump's Pre-Election Speeches

1. Download from Kaggle the transcripts of the 64 speeches Trump gave in the three months before the presidential election, and analyze how frequently words are used in them.

library(NLP)
library(tm)
library(stats)
library(proxy)
library(dplyr)
library(readtext)
library(slam)
library(Matrix)
library(tidytext)
library(ggplot2)

Use readtext to read in multiple .txt files at once.

setwd("~/Documents/GitHub/CSX_RProject_Fall_2018/week_6,7,8/trump")
rawData <- readtext("*.txt")
rawData 
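
As a quick sanity check, readtext returns a data.frame with a doc_id column and a text column, one row per file; a sketch (the expected count comes from the 64 speeches described above):

nrow(rawData)    # should be 64, one row per speech transcript
names(rawData)   # "doc_id" "text"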

To make the doc_id column of the raw data (rawData$doc_id) cleaner, use gsub to strip the redundant text from the file names.

rawData$doc_id <- gsub("Trump_", "", rawData$doc_id)   # drop the common prefix
print(rawData$doc_id)
rawData$doc_id <- gsub("-16.txt", "", rawData$doc_id)  # drop the year/extension suffix
print(rawData$doc_id)

Build the corpus structure and do basic text cleaning: remove punctuation, numbers, and common English stopwords.

docs <- Corpus(VectorSource(rawData$text))
# Helper that maps an arbitrary pattern to a space, e.g. tm_map(docs, toSpace, "/")
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, content_transformer(tolower))  # wrap tolower so tm treats it as a transformation
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stripWhitespace)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)

mixseg = worker()                        # jiebaR segmenter (splits on whitespace for English text)
jieba_tokenizer = function(d)
{
  unlist( segment(d[[1]], mixseg) )      # tokenize one document into a character vector
}

count_token = function(d)
{
  as.data.frame(table(d))                # term counts for one document: columns d (term) and Freq
}

idfCal <- function(word_doc, n)
{ 
  log2( n / nnzero(word_doc) )           # inverse document frequency of one term row
}
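
idfCal implements the usual inverse document frequency: for a term's count vector across the n speeches,

$$\mathrm{idf}(t) = \log_2 \frac{n}{n_t},$$

where n_t (nnzero(word_doc)) is the number of speeches in which term t appears at least once.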
###
seg_date = lapply(docs, jieba_tokenizer)     # tokenize every cleaned speech
tokens_date = lapply(seg_date, count_token)  # per-speech term counts
###
n_date = length(seg_date)                    # number of speeches (64)
TDM_date = tokens_date[[1]]
dateNames <- rawData[,"doc_id"]
for(doc_id in c(2:n_date))
{
  TDM_date = merge(TDM_date, tokens_date[[doc_id]], by="d", all = TRUE)  # outer join on term
  names(TDM_date) = c('d', dateNames[1:doc_id])
}
TDM_date[is.na(TDM_date)] <- 0  # fill NAs (terms absent from a speech) with 0
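
The merge loop grows the table one speech at a time and renames every column on each pass; an equivalent one-shot construction (a sketch; tokens_named and TDM_date2 are illustrative names) first labels each per-speech count column and then folds everything together with Reduce:

# Sketch: name each per-speech count column, then fold with Reduce().
tokens_named <- Map(function(tab, nm) setNames(tab, c("d", nm)), tokens_date, dateNames)
TDM_date2 <- Reduce(function(x, y) merge(x, y, by = "d", all = TRUE), tokens_named)
TDM_date2[is.na(TDM_date2)] <- 0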
###
counts_date <- as.matrix(TDM_date[, 2:(n_date + 1)])              # raw counts, terms x speeches
tf_date <- apply(counts_date, 2, sum)                             # total tokens per speech (column sums)
idf_date <- apply(counts_date, 1, idfCal, n = n_date)             # idf of each term
doc.tfidf_date <- TDM_date
doc.tfidf_date[, 2:(n_date + 1)] <- sweep(counts_date, 2, tf_date, "/") * idf_date

###
input.date <- "AshevilleNC_Sep-12"   # label for the speech in column 2 of TDM_date
words_count_date = TDM_date[, 1:2]   # term and raw count for that speech
colnames(words_count_date) = c('word', 'count')
words_count_date = words_count_date[rev(order(words_count_date$count)), ]  # sort descending by count
rownames(words_count_date) = NULL

ggplot(words_count_date[1:20,], aes(x = reorder(word, count), y = count)) + 
  geom_bar(stat = "identity", fill = 'lightblue') + 
  coord_flip() +
  labs(x = 'word', y = 'count', title = paste('Speech: ', input.date)) +
  theme(panel.background = element_blank(),
        axis.title = element_text(color = '#2d2d2d'),
        axis.text.x = element_text(hjust = 1, size = 15),
        axis.text.y = element_text(hjust = 1, size = 15),
        strip.text.x = element_text(color = '#2d2d2d', face = 'bold', size = 10),
        plot.title = element_text(hjust = 0.5, face = 'bold', size = 15))

###
rownames(doc.tfidf_date) = doc.tfidf_date$d         # use terms as row names
doc.tfidf_date <- doc.tfidf_date[, 2:(n_date + 1)]  # drop the term column, keep the 64 speech columns
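
With terms as row names, each speech's most distinctive words can be read off directly; a sketch (tfidf_col is an illustrative name, assuming the cleaned doc_id values above):

tfidf_col <- setNames(doc.tfidf_date[[input.date]], rownames(doc.tfidf_date))  # one speech's weights, named by term
head(sort(tfidf_col, decreasing = TRUE), 10)                                   # ten highest TF-IDF terms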

Build a term-document matrix with TermDocumentMatrix.

tdm <- TermDocumentMatrix(docs)
tdm
tdm.m <- as.matrix(tdm)   # dense term-document count matrix (too large to print in full)
DF <- tidy(tdm.m)
DF <- DF[-1, ]
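
tidytext can also tidy the sparse TermDocumentMatrix directly into one row per (term, document) pair, skipping the dense as.matrix() step; a sketch (DF_long is an illustrative name):

DF_long <- tidy(tdm)   # columns: term, document, count (zero cells omitted)
head(DF_long)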

Store the 64 speech names from rawData$doc_id as a vector, then use them to rename the columns of the new data frame (the counts of every term in each speech).

speech_data <- rawData$doc_id             # the 64 speech names
print(speech_data)
colnames(DF) <- c("term", speech_data)    # name the term column explicitly
print(colnames(DF))

Convert the TDM we just built into TF-IDF weights.
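
The code below implements the weighting

$$\mathrm{tfidf}(t,d) = \frac{f_{t,d}}{\sum_{t'} f_{t',d}} \times \log_2\frac{N+1}{n_t},$$

where f_{t,d} is the count of term t in speech d, the denominator is speech d's total token count (tf[j]), N is the number of speeches, and n_t is the number of speeches containing t. Note the +1 smoothing, which differs from the idfCal defined earlier.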

tf <- apply(tdm, 2, sum)      # total token count per speech (column sums)
idfCal <- function(word_doc){ log2((length(word_doc) + 1) / nnzero(word_doc)) }  # smoothed idf (note the +1)
idf <- apply(tdm, 1, idfCal)  # idf weight of each term
doc.tfidf <- as.matrix(tdm)
for(i in 1:nrow(tdm)){
  for(j in 1:ncol(tdm)){
    doc.tfidf[i,j] <- (doc.tfidf[i,j] / tf[j]) * idf[i]  # weight each count
  }
}
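
The nested loop visits every cell one at a time, which is slow in interpreted R; an equivalent vectorized form (a sketch that should produce the same matrix; doc.tfidf.vec is an illustrative name):

doc.tfidf.vec <- sweep(as.matrix(tdm), 2, tf, "/") * idf  # divide columns by totals, scale rows by idf
# all.equal(doc.tfidf, doc.tfidf.vec)                     # expected TRUE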

findZeroId <- as.matrix(apply(doc.tfidf, 1, sum))  # row sums of the tf-idf weights
tfidfnn <- doc.tfidf[findZeroId != 0, ]            # drop all-zero term rows (safe even when none exist)

write.csv(tfidfnn, "show.csv")


colnames(doc.tfidf) <- speech_data
print(colnames(doc.tfidf))

Sum each term's counts across all speeches, keep terms that appear at least 10 times, and inspect the first ten and last ten entries.

termFrequency = rowSums(as.matrix(tdm))                     # total count of each term across all speeches
termFrequency = subset(termFrequency, termFrequency >= 10)  # keep terms appearing at least 10 times

df = data.frame(term = names(termFrequency), freq = termFrequency)
head(termFrequency, 10)
tail(termFrequency, 10)

Find the 30 terms with the highest counts across all speeches.

high.freq = tail(sort(termFrequency), n = 30)  # the 30 largest totals
hfp.df = as.data.frame(sort(high.freq))
hfp.df$names <- rownames(hfp.df)

Plot a bar chart showing the counts of the 30 most frequent terms.

library(knitr)
# png('Trump_speeches.png')

ggplot(hfp.df, aes(reorder(names,high.freq), high.freq)) +
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Terms") + ylab("Frequency") + 
  ggtitle("Term frequencies")

Next, find the 50 most frequent terms and plot them the same way.

high.freq_1 = tail(sort(termFrequency), n = 50)  # the 50 largest totals
hfp.df_1 = as.data.frame(sort(high.freq_1))
hfp.df_1$names <- rownames(hfp.df_1)

ggplot(hfp.df_1, aes(reorder(names,high.freq_1), high.freq_1)) +
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Terms") + ylab("Frequency") + 
  ggtitle("Term frequencies")