-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.R
More file actions
40 lines (31 loc) · 1.16 KB
/
preprocessing.R
File metadata and controls
40 lines (31 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Load the red wine data set (physical characteristics measured per wine)
# straight from its GitHub URL.
dat <- read.csv(
  file = "https://raw.githubusercontent.com/peterkabai/dataScience/master/data/wine.csv"
)

# Build a small working subset: 20 rows drawn at random, without replacement.
small <- dat[sample(seq_len(nrow(dat)), size = 20), ]

# Z-score normalization of the subset. With its default arguments scale()
# subtracts each column's mean and divides by its standard deviation.
scaled_small <- scale(small)

# Smallest value in every column of the standardized subset, as a vector.
apply(scaled_small, MARGIN = 2, FUN = min)

# Largest value in every column of the standardized subset, as a vector.
apply(scaled_small, MARGIN = 2, FUN = max)
# Scale the original data using unit interval (min-max) normalization, so
# every column is mapped onto [0, 1]:  (x - min) / (max - min).
# scale() with only center = TRUE performs z-score normalization, so the
# per-column minimum is supplied as the centre and the per-column range as
# the divisor. na.rm = TRUE mirrors scale()'s own handling of missing values.
# A constant column has a range of 0 and therefore yields NaN.
col_mins <- vapply(dat, min, numeric(1), na.rm = TRUE)
col_ranges <- vapply(dat, max, numeric(1), na.rm = TRUE) - col_mins
scaled_large <- scale(dat, center = col_mins, scale = col_ranges)

# Find the column mins and maxes; after min-max scaling these are expected to
# be 0 and 1 respectively for every non-constant column.
apply(scaled_large, 2, min)
apply(scaled_large, 2, max)
# Use cor() on the large data set to see how the features are correlated.
# The result is a square matrix with one row and one column per feature.
correlation <- cor(scaled_large)

# Display the correlations as a heat map.
# - Rowv = NA / Colv = NA keep the features in their original order and
#   suppress the dendrograms.
# - scale = "none" colours the raw correlation values. Scaling by column would
#   re-standardize every column, so equal correlations in different columns
#   would get different colours and the symmetric matrix would be drawn
#   asymmetrically.
heatmap(
  correlation,
  Rowv = NA, Colv = NA,
  col = blues9,
  scale = "none",
  margins = c(10, 10)
)
# Show the three features most positively correlated with 'quality'.
# The 'quality' entry is removed by name instead of zeroing every cell equal
# to 1: an exact floating-point comparison is unreliable, and applied to the
# whole matrix it would also erase a genuinely perfect correlation between two
# different features. Sorting once and taking the head replaces three
# identical sorts. Ranking uses the signed value, so strongly negative
# correlations are not reported here.
quality_cor <- correlation[, "quality"]
quality_cor <- quality_cor[names(quality_cor) != "quality"]
head(sort(quality_cor, decreasing = TRUE), 3)