-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpca_plot.R
More file actions
executable file
·81 lines (69 loc) · 2.66 KB
/
pca_plot.R
File metadata and controls
executable file
·81 lines (69 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
## Call libraries
# suppressMessages(library(easyPubMed))
# suppressMessages(library(parallel))
# suppressMessages(library(foreach))
# suppressMessages(library(doParallel))
suppressMessages(library(dplyr))
suppressMessages(library(workflows))
suppressMessages(library(tune))
suppressMessages(library(stopwords))
suppressMessages(library(tidymodels))
suppressMessages(library(tidytext))
suppressMessages(library(textrecipes))
suppressMessages(library(discrim))
#' Plot a PCA of the TF-IDF representation of the training abstracts
#'
#' Tokenizes the `abstract` column of `data$train_data`, removes stop words,
#' stems the tokens, keeps at most 500 tokens and weights them by TF-IDF.
#' The resulting term columns are normalized and fed to a PCA, and one ggplot
#' describing that PCA is returned.
#'
#' @param data A list holding a `train_data` data frame with at least the
#'   columns `pmid` (document id), `class` (outcome) and `abstract` (text).
#'   The columns `year` and `title` are dropped before the PCA when present.
#' @param plot_type Which plot to return:
#'   * `"scores"` (default, the historical return value): PC1 vs PC2 scatter
#'     of the documents, coloured by `class`.
#'   * `"loadings"`: term loadings of PC1-PC5, one facet per component.
#'   * `"top_terms"`: the 8 terms with the largest absolute loading for each
#'     of PC1-PC4, coloured by the sign of the loading.
#' @return A ggplot object. Nothing is printed; print the result to draw it.
pca_plot <- function(data,
                     plot_type = c("scores", "loadings", "top_terms")) {
  plot_type <- match.arg(plot_type)

  # Validate early so that failures point at the input rather than at a
  # recipe step deep inside the pipeline.
  if (!is.list(data)) {
    stop("`data` must be a list containing `train_data`.", call. = FALSE)
  }
  train_data <- data[["train_data"]]
  if (!is.data.frame(train_data)) {
    stop("`data$train_data` must be a data frame.", call. = FALSE)
  }
  missing_cols <- setdiff(c("pmid", "class", "abstract"), names(train_data))
  if (length(missing_cols) > 0) {
    stop(
      "`data$train_data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Recipe turning the abstracts into TF-IDF weighted term columns; `pmid` is
  # kept as an id so it is carried through without being used as a predictor.
  train_rec <- recipe(class ~ ., data = train_data) %>%
    update_role(pmid, new_role = "id") %>%
    step_tokenize(abstract) %>%
    step_stopwords(abstract) %>%
    step_stem(abstract) %>%
    step_tokenfilter(abstract, max_tokens = 500) %>%
    step_tfidf(abstract)

  # `any_of()` tolerates training sets that lack `year` or `title`.
  dtm_tfidf <- train_rec %>%
    prep() %>%
    juice() %>%
    dplyr::select(-any_of(c("class", "year", "title")))

  # Strip the prefix added by step_tfidf() so the plots show the bare terms,
  # then keep only columns whose name contains a letter (drops tokens made
  # solely of digits/punctuation; `pmid` is retained).
  colnames(dtm_tfidf) <- sub("^tfidf_abstract_", "", colnames(dtm_tfidf))
  has_letter <- grepl("[a-z]", colnames(dtm_tfidf), ignore.case = TRUE)
  dtm_tfidf <- dtm_tfidf[, has_letter, drop = FALSE]

  pca_prep <- recipe(~., data = dtm_tfidf) %>%
    update_role(pmid, new_role = "id") %>%
    step_normalize(all_predictors()) %>%
    step_pca(all_predictors()) %>%
    prep()

  if (plot_type == "scores") {
    # Re-attach the outcome, which was removed before the PCA, via `pmid`.
    scores <- juice(pca_prep) %>%
      inner_join(dplyr::select(train_data, pmid, class), by = "pmid")

    return(
      ggplot(scores, aes(PC1, PC2)) +
        geom_point(aes(color = class), alpha = 0.7, size = 2) +
        ggtitle("PCA plot of the positive and negative articles") +
        theme(
          plot.title = element_text(
            size = 15, family = "Helvetica", hjust = 0.5,
            margin = margin(t = 20, r = 0, b = 20, l = 0)
          )
        )
    )
  }

  # Step 1 of the PCA recipe is step_normalize(), step 2 is step_pca();
  # tidying step 2 yields one row per (term, component) loading.
  tidied_pca <- tidy(pca_prep, number = 2)

  if (plot_type == "loadings") {
    loadings <- tidied_pca %>%
      filter(component %in% paste0("PC", 1:5)) %>%
      # Order the facets by first appearance (PC1, PC2, ...) using base R so
      # no package beyond those attached by this file is required.
      mutate(component = factor(component, levels = unique(component)))

    return(
      ggplot(loadings, aes(value, terms, fill = terms)) +
        geom_col(show.legend = FALSE) +
        facet_wrap(~component, nrow = 1) +
        labs(y = NULL)
    )
  }

  # plot_type == "top_terms"
  top_terms <- tidied_pca %>%
    filter(component %in% paste0("PC", 1:4)) %>%
    group_by(component) %>%
    slice_max(abs(value), n = 8) %>%
    ungroup() %>%
    # reorder_within() + scale_y_reordered() sort terms inside each facet.
    mutate(terms = reorder_within(terms, abs(value), component))

  ggplot(top_terms, aes(abs(value), terms, fill = value > 0)) +
    geom_col() +
    facet_wrap(~component, scales = "free_y") +
    scale_y_reordered() +
    labs(
      x = "Absolute value of contribution",
      y = NULL, fill = "Positive?"
    )
}