TF-IDF Analysis of Trump's Pre-Election Speeches

1. Download from Kaggle the transcripts of the 64 speeches Trump gave in the three months before the presidential election, and analyze the frequency of the words used in them.

library(NLP)
library(tm)
library(stats)
library(proxy)
library(dplyr)
library(readtext)
library(slam)
library(Matrix)
library(tidytext)
library(ggplot2)

Use readtext to read in multiple txt files at once.

setwd("~/Documents/GitHub/CSX_RProject_Fall_2018/week_6,7,8/trump")
rawData <- readtext("*.txt")
rawData 
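A quick sanity check (not part of the original script) that all 64 transcripts were read:

nrow(rawData)   # expected: 64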

To make the doc_id column of the raw data (rawData$doc_id) more concise, use gsub to strip the extra text from the file names.

rawData$doc_id <- gsub("Trump_"," ",rawData$doc_id)
print(rawData$doc_id)
rawData$doc_id <- gsub("-16.txt"," ",rawData$doc_id)
print(rawData$doc_id)

Build the corpus data structure and do basic text cleaning: remove punctuation, numbers, and common English stop words.

docs <- Corpus(VectorSource(rawData$text))
# content transformer that replaces a matched pattern with a space (defined for optional use)
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, " ", x))
})
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, content_transformer(tolower))   # wrap tolower so the corpus structure is preserved
docs <- tm_map(docs, removeWords, stopwords("english")) 
docs <- tm_map(docs, stripWhitespace)
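To spot-check the cleaning, one can print the start of the first cleaned document (a quick inspection added here for illustration, not in the original script):

writeLines(substr(as.character(docs[[1]]), 1, 300))   # first 300 characters of the first speech after cleaning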
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)

mixseg = worker()   # jiebaR tokenizer

# tokenize one document into a vector of words
jieba_tokenizer = function(d)
{
  unlist( segment(d[[1]], mixseg) )
}

# count how many times each token appears in one document
count_token = function(d)
{
  as.data.frame(table(d))
}

# IDF: log2 of (number of documents / number of documents containing the word)
idfCal <- function(word_doc, n)
{ 
  log2( n / nnzero(word_doc) ) 
}
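A toy sanity check of idfCal (an illustrative example, not from the speech data): a term that appears in 2 of 4 documents gets IDF log2(4 / 2) = 1.

idfCal(c(0, 2, 0, 1), n = 4)   # nnzero() finds 2 nonzero entries, so this returns log2(4/2) = 1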
### tokenize every speech, then count token frequencies per speech
seg_date = lapply(docs, jieba_tokenizer)
tokens_date = lapply(seg_date, count_token)
### merge the per-speech counts into a single term-document table
n_date = length(seg_date)
TDM_date = tokens_date[[1]]
dateNames <- rawData[,"doc_id"]
for(doc_id in c(2:n_date))
{
  TDM_date = merge(TDM_date, tokens_date[[doc_id]], by="d", all = TRUE)
  names(TDM_date) = c('d', dateNames[1:doc_id])
}
TDM_date[is.na(TDM_date)] <- 0   # fill NAs (term absent from a speech) with 0
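For reference, the same merged table can be built without the explicit loop; a minimal sketch using Reduce(), assuming tokens_date and dateNames as defined above:

TDM_alt <- Reduce(function(x, y) merge(x, y, by = "d", all = TRUE), tokens_date)
names(TDM_alt) <- c("d", dateNames)   # term column plus one column per speech
TDM_alt[is.na(TDM_alt)] <- 0          # fill missing counts with 0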
###
# tf_date <- apply(as.matrix(TDM_date[,2:(n_date + 1)]), 2, sum) #直向相加計算總數
# idf_date <- apply(as.matrix(TDM_date[,2:(n_date + 1)]), 1, idfCal, n=n_date)
doc.tfidf_date <- TDM_date

### word counts for the first speech
input.date <- "AshevilleNC_Sep-12"
words_count_date = TDM_date[,1:2]                 # term column + counts for the first speech
colnames(words_count_date) = c('word', 'count')
words_count_date = words_count_date[rev(order(words_count_date$count)),]   # sort by count, descending
# words_count_date$count = sort(words_count_date$count, decreasing = T)
rownames(words_count_date)=NULL

ggplot(words_count_date[1:20,], aes(x = reorder(word, count), y =count)) + 
  geom_bar(stat = "identity", fill='lightblue') + 
  coord_flip() +
  labs(x='word', y='count', title=paste('Speech: ', input.date)) +
  theme(panel.background = element_blank(),
        axis.title = element_text(color = '#2d2d2d'),
        axis.text.x = element_text(hjust = 1, size=15),
        axis.text.y = element_text(hjust = 1, size=15),
        strip.text.x = element_text(color='#2d2d2d',face='bold',size=10),
        plot.title = element_text(hjust=0.5,face='bold',size=15))

### use the terms as row names and keep only the count columns
rownames(doc.tfidf_date) = doc.tfidf_date$d
doc.tfidf_date <- doc.tfidf_date[, 2:(n_date + 1)]   # drop the term column

Build the term-document matrix with TermDocumentMatrix.

tdm <- TermDocumentMatrix(docs)   
tdm
print(tf <- as.matrix(tdm))   # dense term-by-document count matrix
DF <- tidy(tf)                # convert the matrix to a data frame (terms in the first column)
DF <- DF[-1, ]                # drop the first row

Store the 64 speech names from the doc_id column of the raw data (rawData$doc_id) as a vector, then use those names to rename the columns of the new data frame (the counts of every term in each speech).

speech_data <- c(rawData$doc_id)
print(speech_data)
colnames(DF) <- c("", speech_data)
print(colnames(DF))

Convert the TDM we built into TF-IDF weights: each count is divided by its document's total token count, then multiplied by idf = log2((N + 1) / df), where N is the number of documents and df is the number of documents containing the term.

tf <- apply(tdm, 2, sum)          # total token count of each document
idfCal <- function(word_doc){log2((length(word_doc)+1) / nnzero(word_doc))}   # IDF with +1 smoothing in the numerator
idf <- apply(tdm, 1, idfCal)      # IDF of each term
doc.tfidf <- as.matrix(tdm)
doc.tfidf[ ,-1]
for(i in 1:nrow(tdm)){
  for(j in 1:ncol(tdm)){
    doc.tfidf[i,j] <- (doc.tfidf[i,j] / tf[j]) * idf[i]   # normalized TF times IDF
  }
}
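The nested loop above produces the intended weights but is slow for a large vocabulary; an equivalent vectorized sketch (assuming the tf and idf vectors computed above):

doc.tfidf_vec <- sweep(as.matrix(tdm), 2, tf, "/") * idf   # divide each column by its total, then scale each row by its IDF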

findZeroId <- as.matrix(apply(doc.tfidf, 1, sum))   # row sums of the tf-idf matrix
tfidfnn <- doc.tfidf[-which(findZeroId == 0),]      # drop terms whose tf-idf is zero everywhere

write.csv(tfidfnn, "show.csv")


colnames(doc.tfidf) <- speech_data
print(colnames(doc.tfidf))

Compute how many times each term appears across all the speeches, keep the terms that appear at least 10 times, and look at the first ten and last ten entries.

termFrequency = rowSums(as.matrix(tdm))                    # total count of each term across all speeches
termFrequency = subset(termFrequency, termFrequency>=10)   # keep terms appearing at least 10 times

df = data.frame(term=names(termFrequency), freq=termFrequency)
head(termFrequency,10)
tail(termFrequency,10)

Find the 30 most frequent terms across all speeches.

high.freq=tail(sort(termFrequency),n=30)
hfp.df=as.data.frame(sort(high.freq))
hfp.df$names <- rownames(hfp.df)

Plot a bar chart showing the 30 most frequent terms and their counts.

library(knitr)
library(ggplot2)
# png('Trump_speeches.png')

ggplot(hfp.df, aes(reorder(names,high.freq), high.freq)) +
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Terms") + ylab("Frequency") + 
  ggtitle("Term frequencies")

Next, find the 50 most frequent terms.

high.freq_1=tail(sort(termFrequency),n=50)
hfp.df_1=as.data.frame(sort(high.freq_1))
hfp.df_1$names <- rownames(hfp.df_1)
library(knitr)
library(ggplot2)
# png('Trump_speeches.png')

ggplot(hfp.df_1, aes(reorder(names,high.freq_1), high.freq_1)) +
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Terms") + ylab("Frequency") + 
  ggtitle("Term frequencies")

Conclusion: Setting aside common filler words such as will, going, one, and just, the terms Trump used most often in these speeches include people, country, hillary/clinton, jobs, and america/american, exactly the kinds of words candidates tend to repeat on the campaign trail. That covers the 30 most frequent terms; widening the range to the top 50 adds words such as trade, government, money, and plan. The most noteworthy of these is probably trade: the US-China trade war that has dominated headlines in recent months arguably echoes the trade policies Trump talked about in these speeches.

DF_1 <- t(DF)   # transpose: speeches as rows, terms as columns
DF_1

DF_1.df <- as.data.frame(DF_1)
DF_1.df

colnames(DF_1.df) <- c(DF_1.df[1, ])   # use the term row as column names
DF_1.df <- DF_1.df[-1, ]
# scale(DF)
pcs <- prcomp(DF[ ,-1], center = T, scale = T)
pcs
plot(pcs)
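To see how much of the variance the leading components capture (a quick check added here, not in the original analysis):

summary(pcs)$importance[, 1:5]   # standard deviation, proportion of variance, cumulative proportion for PC1-PC5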

Use PCA and plot the relationship between the first two principal components.

library(magrittr)
pcs$x

pcs$x %>%
  as.data.frame() %>%
  ggplot(aes(PC1, PC2)) + geom_point()

apply(DF[ ,2:65], 1, sum)                            # total count of each term across the 64 speeches
total_amount <- as.vector(apply(DF[ ,2:65], 1, sum))
DF.df <- cbind(DF, total_amount)
DF.df.sort <- DF.df[order(DF.df$total_amount, decreasing = T), ]   # sort terms by total count
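A quick peek at the most frequent terms after sorting (assuming the layout built above: column 1 holds the term and column 66 holds total_amount):

head(DF.df.sort[, c(1, 66)], 10)   # ten most frequent terms and their totals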

Because the speeches contain many words that do not help the analysis, filter out the unimportant terms and group the remaining important ones into five categories by topic: "name", "military/safety", "ethnics/human rights", "country/state/city", and "policy/economy".

row.names(DF.df.sort) <- c(DF.df.sort[,1])
DF.df.sort_latest <- as.data.frame(DF.df.sort[c("hillary","clinton","clintons","obama","hillarys","obamaclinton","illegal","crime","criminal","corruption","violence","killed","violent","crime","weapons","isis","military","terrorism","terrorists","crisis","nuclear","borders","africanamerican","africanamericans","african","children","school","schools","education","childcare","women","woman","workers","refugees","immigrant","refugee","veterans","lobbyist","religious","justice","justices","mexico","china","islamic","iraq","syria","russia","americas","libya","korea","haiti","pennsylvania","ohio","florida","carolina","hampshire","detroit","chicago","baltimore","arizona","orlando","hispanic","poverty","lowincome","jobs","money","prosperity","wealthy","jobkilling","nafta","tpp","transpacific","economic","economy","deficit","debt","currency","infrastructure","trade","obamacare","reform","reforms","jobkilling"),])
row.names(DF.df.sort_latest) <- c(1:83)    

type <- c(rep("name",times = 6),rep("military/safety",times = 16),rep("ethnics/human rights",times = 19),           rep("country/state/city",times = 21),rep("policy/economy",times = 21))
type <- as.data.frame(type)
DF.df.sort_latest <- cbind(DF.df.sort_latest, type)



data_latest <- DF.df.sort_latest[ ,c(-1,-66,-67)]   # keep only the 64 count columns (drop term, total_amount, type)



E.dist <- dist(data_latest, method = "euclidean")   # Euclidean distances between terms
# par(mfrow=c(1,2))

hc.s <- hclust(E.dist, method = "single")
hc.c <- hclust(E.dist, method = "complete")
hc.a <- hclust(E.dist, method = "average")
hc.w <- hclust(E.dist, method = "ward.D") 

par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(hc.s, labels = FALSE, main = "single", xlab = " ")
plot(hc.c, labels = FALSE, main = "complete", xlab = " ")
plot(hc.a, labels = FALSE, main = "average", xlab = " ")
plot(hc.w, labels = FALSE, main = "ward.D", xlab = " ")

# abline(h=9, col="red")


# op <- par(mar = c(1, 4, 4, 1))
par(mar = c(1, 4, 4, 1), mfrow = c(1, 1))
plot(hc.w, labels = DF.df.sort_latest$type, cex = 0.6, main = "ward.D showing 5 clusters")
rect.hclust(hc.w, k = 5)
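As an extra check (not in the original script), the five ward.D clusters can be cross-tabulated against the five hand-assigned categories:

hc.w_clusters <- cutree(hc.w, k = 5)            # cluster label of each term
table(DF.df.sort_latest$type, hc.w_clusters)    # categories vs. hierarchical clusters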

Clustering with K-Means: each cluster is represented by the mean of its points.

library(xtable)
library(factoextra)

set.seed(20)
data_latest.km <- kmeans(data_latest, centers = 5, nstart = 50)

# We can look at the within sum of squares of each cluster
data_latest.km$withinss

data_latest.km_Table <- data.frame(group = DF.df.sort_latest$type, cluster = data_latest.km$cluster)
data_latest_Table <- xtable(with(data_latest.km_Table, table(group, cluster)), 
                caption = "Number of samples from each experimental group within each k-means cluster")


fviz_cluster(data_latest.km,           # clustering result
             data = data_latest,       # the data
             geom = c("point","text"), # show points and labels
             frame.type = "norm")      # type of ellipse drawn around each cluster

Clustering with K-Medoids: each cluster is represented by a medoid, an actual data point near its center.

library(cluster)

data_latest.kmedoid <- pam(data_latest, k = 5)     # pam = Partitioning Around Medoids
data_latest.kmedoid$objective                      # objective function value after the build and swap phases

data_latest.kmedoid_Table <- data.frame(group = DF.df.sort_latest$type, 
                                        cluster = data_latest.kmedoid$clustering)
data_latest_Table_1 <- xtable(with(data_latest.kmedoid_Table, table(group, cluster)), 
                       caption = "Number of samples from each experimental group within each PAM cluster")


par(mar = c(5, 1, 4, 4))
plot(data_latest.kmedoid, main = "Silhouette Plot for 5 clusters")
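For reference, the overall average silhouette width can also be read straight from the pam object:

data_latest.kmedoid$silinfo$avg.width   # average silhouette width over all points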

fviz_cluster(data_latest.kmedoid,   # clustering result
             data = data_latest,    # the data
             geom = c("point"),     # show points only
             frame.type = "norm")   # type of ellipse drawn around each cluster

Elbow method with hierarchical clustering: the optimal number of clusters is 5.

fviz_nbclust(data_latest, 
             FUNcluster = hcut,  # hierarchical clustering
             method = "wss",     # total within sum of squares
             k.max = 12          # max number of clusters to consider
             ) + 
  labs(title = "Elbow Method for HC") +
  geom_vline(xintercept = 5,     # at x = 5
             linetype = 2)       # draw a dashed line

Elbow method with K-Means: the optimal number of clusters is 5.

fviz_nbclust(data_latest, 
             FUNcluster = kmeans, # K-Means
             method = "wss",      # total within sum of squares
             k.max = 12           # max number of clusters to consider
             ) + 
  labs(title = "Elbow Method for K-Means") +
  geom_vline(xintercept = 5,      # at x = 5
             linetype = 2)        # draw a dashed line

Elbow method with K-Medoids: the optimal number of clusters is 5.

fviz_nbclust(data_latest, 
             FUNcluster = pam,   # K-Medoids
             method = "wss",     # total within sum of squares
             k.max = 12          # max number of clusters to consider
             ) + 
  labs(title = "Elbow Method for K-Medoid") +
  geom_vline(xintercept = 5,     # at x = 5
             linetype = 2)       # draw a dashed line

Average Silhouette Method: use the average silhouette width over all data points to choose the number of clusters; by this criterion the optimum is 2 clusters.

fviz_nbclust(data_latest, 
             FUNcluster = kmeans,   # K-Means
             method = "silhouette", # average silhouette width
             k.max = 12             # max number of clusters
             ) +
  labs(title = "Avg.Silhouette Method for K-Means")