Skip to content
This repository was archived by the owner on Sep 28, 2021. It is now read-only.

pinkyfox/System-Analysis

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

13 Commits
 
 
 
 

Repository files navigation

System-Analysis

INDEX

  1. Regression | LW#1
  2. Mathematical expectation, dispersion, standard deviation | LW#2
  3. Descriptive statistics for images | LW#3
  4. Cluster Analysis | LW#4
  5. Kohonen SOM | LW#5

1. Regression

Upload Data

# Load the echocardiogram dataset (sep and dec are the read.csv defaults, listed explicitly)
data <- read.csv('echocardiogram_1.csv', sep = ',', dec = '.')

Preparing Data Frame

# Keep only columns 5 and 8 (fractionalshortening, wallmotionscore) and drop incomplete rows
data <- subset(data, select = c(5, 8))
data <- na.omit(data)
head(data)
A data.frame: 6 × 2
fractionalshorteningwallmotionscore
<dbl><dbl>
10.26014
20.38014
30.26014
40.25316
50.16018
60.26012

Data Plot

# Scatter plot: wall motion score (x) against fractional shortening (y)
plot(data$wallmotionscore, data$fractionalshortening, pch = 16, col = 'blue')

png

Calculate Linear Regression

# Fit a simple linear regression of fractionalshortening on wallmotionscore
regression <- lm(fractionalshortening ~ wallmotionscore, data = data)
summary(regression)
# Subtitle showing the fitted equation (coef[2] = slope, coef[1] = intercept)
sub <- paste('fractionalshortening = ', coef(regression)[2], ' * wallmotionscore + ', coef(regression)[1])
plot(data$fractionalshortening ~ data$wallmotionscore, pch = 16, col = 'blue', sub = sub)
# Overlay the fitted regression line
abline(coef(regression)[1:2])
Call:
lm(formula = fractionalshortening ~ wallmotionscore, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.18453 -0.07267 -0.00884  0.05202  0.38753 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.273106   0.031100   8.782 1.34e-14 ***
wallmotionscore -0.003895   0.002042  -1.908   0.0588 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1046 on 120 degrees of freedom
Multiple R-squared:  0.02943,	Adjusted R-squared:  0.02134 
F-statistic: 3.639 on 1 and 120 DF,  p-value: 0.05883

png

Calculate Correlation

# Pearson correlation between the two variables
cor(data$fractionalshortening, data$wallmotionscore)

-0.171557734476155

Calculate Polynomial Regression

# Fit a degree-2 polynomial regression using raw (non-orthogonal) polynomial terms
polynomial_regression <- lm(fractionalshortening ~ poly(wallmotionscore, 2, raw = TRUE), data = data)
coef(summary(polynomial_regression))
# Subtitle showing the fitted quadratic equation
sub <- paste("fractionalshortening = ", coef(polynomial_regression)[3], " * wallmotionscore^2 + ", coef(polynomial_regression)[2],  " * wallmotionscore + ",coef(polynomial_regression)[1])
plot(data$fractionalshortening ~ data$wallmotionscore, pch = 16, col = 'blue', sub = sub)
# NOTE(review): this draws the LINEAR model's line (`regression` from above), not the
# polynomial fit — presumably for comparison; confirm this is intentional.
abline(coef(regression)[1:2])
# Red curve: the polynomial fit, plotted in x-sorted order so the line is drawn left to right
lines(sort(data$wallmotionscore), fitted(polynomial_regression)[order(data$wallmotionscore)], col='red', type='l')
A matrix: 3 × 4 of type dbl
EstimateStd. Errort valuePr(>|t|)
(Intercept) 0.19179372930.0684112224 2.80354190.005903814
poly(wallmotionscore, 2, raw = TRUE)1 0.00604215910.0077255844 0.78209740.435710788
poly(wallmotionscore, 2, raw = TRUE)2-0.00027091610.0002031897-1.33331620.184974802

png


2. Mathematical expectation, dispersion, standard deviation

Load Data

library(repr)

# Enlarge the default notebook plot size
options(repr.plot.width = 16, repr.plot.height = 12)

data_frame <- read.csv('echocardiogram.csv', sep = ',', dec = '.')
# Keep columns 5 and 8 (fractionalshortening, wallmotionscore)
data_frame <- subset(data_frame, select = c(5, 8))
data_frame_without_na <- na.omit(data_frame)
# Drop NAs per column (the two vectors may end up with different lengths)
fractionalshortening <- na.omit(data_frame$fractionalshortening)
wallmotionscore <- na.omit(data_frame$wallmotionscore)

Create Histograms

par(mfrow = c(1, 2)) # place the two histograms side by side
hist(fractionalshortening, freq = TRUE, right = F, col = 'seagreen', main = 'Fractional shortening')
hist(wallmotionscore, freq = TRUE, right = F, col = 'slateblue1', main = 'Wallmotion score')

png

Calculate Mean Values, Dispersions and Standard Deviations

# Mean, variance (dispersion) and standard deviation of both variables,
# arranged as a 2 x 3 table (rows = variables, columns = statistics).
statistics_params <- matrix(c(mean(fractionalshortening), mean(wallmotionscore), # mean values
                        var(fractionalshortening), var(wallmotionscore),         # variance (dispersion)
                        sd(fractionalshortening), sd(wallmotionscore)), ncol = 3, nrow = 2) # standard deviation
colnames(statistics_params) <- c('Mean value', 'Dispersion', 'Standard deviation') # fixed label spelling
rownames(statistics_params) <- c('Fractional shortening', 'Wallmotion score')
statistics_params
A matrix: 2 × 3 of type dbl
Mean valueDespersionStandart deviation
Fractional shortening 0.2167339 0.011559010.1075128
Wallmotion score14.438125025.186009065.0185664

Confidence Intervals for Mean Values and Dispersions

library(DescTools) # provides MeanCI() and VarCI() for confidence intervals

# 95% confidence intervals for the means (MeanCI returns estimate, lower, upper)
ci_for_means <- matrix(c(MeanCI(fractionalshortening),
                         MeanCI(wallmotionscore)), ncol = 3, byrow = TRUE)
colnames(ci_for_means) <- c("Mean value", "Lower CI", "Upper CI")
rownames(ci_for_means) <- c("Fractional shortening", "Wallmotion score")

# 95% confidence intervals for the variances
ci_for_vars <- matrix(c(VarCI(fractionalshortening),
                        VarCI(wallmotionscore)), ncol = 3, byrow = TRUE)
colnames(ci_for_vars) <- c("Dispersion", "Lower CI", "Upper CI") # fixed label spelling
rownames(ci_for_vars) <- c("Fractional shortening", "Wallmotion score")

ci_for_means
ci_for_vars
A matrix: 2 × 3 of type dbl
Mean valueLower CIUpper CI
Fractional shortening 0.2167339 0.1976225 0.2358452
Wallmotion score14.438125013.560354715.3158953
A matrix: 2 × 3 of type dbl
DespersionLower CIUpper CI
Fractional shortening 0.01155901 0.009137898 0.01509378
Wallmotion score25.1860090619.98067850332.73974143

Student

  • Dispersions are known
t.testk <- function(x, y, v_x, v_y, m0 = 0, alpha = 0.05, alternative = 'two.sided') {
    # Two-sample test for a difference of means when the variances are KNOWN
    # (a z-test: the statistic is compared against normal quantiles).
    #
    # x, y        - numeric samples
    # v_x, v_y    - known variances of x and y
    # m0          - hypothesised difference of means (default 0)
    # alpha       - significance level used for the confidence limits
    # alternative - 'two.sided', 'less', or anything else for 'greater'
    #
    # Returns a 1 x 11 matrix; the 'alternative' string in the last column
    # coerces the whole matrix to character.
    n_x <- length(x)
    n_y <- length(y)
    mx <- mean(x)
    my <- mean(y)
    
    se <- sqrt(v_x / n_x + v_y / n_y)  # standard error of the mean difference
    z <- (mx - my - m0) / se
    
    p <- switch(alternative,
                two.sided = 2 * pnorm(abs(z), lower.tail = FALSE),
                less      = pnorm(z, lower.tail = TRUE),
                pnorm(z, lower.tail = FALSE))  # default case: 'greater'
    
    # Two-sided (1 - alpha) confidence limits for the mean difference
    half_width <- se * qnorm(1 - alpha / 2)
    LCL <- mx - my - half_width
    UCL <- mx - my + half_width
    
    out <- matrix(c(mx, my, m0, sqrt(v_x), sqrt(v_y), se, z, p, LCL, UCL, alternative),
                  ncol = 11, byrow = TRUE)
    colnames(out) <- c('mean_x', 'mean_y', 'm0', 'sd_x', 'sd_y', 
                       's', 'statistic', 'p.value', 'LCL', 'UCL', 'alternative')
    out
}

# Known-variance test with assumed variances 1 and 0.78
t.testk(fractionalshortening, wallmotionscore, 1, 0.78)
A matrix: 1 × 11 of type chr
mean_xmean_ym0sd_xsd_ysstatisticp.valueLCLUCLalternative
0.21673387096774214.438125010.8831760866327850.118988512592738-119.5190259895740-14.454604328288-13.9881779297765two.sided
  • Dispersions are unknown
# Unknown variances: Welch two-sample t-test (variances not assumed equal)
t.test(fractionalshortening, wallmotionscore)
	Welch Two Sample t-test

data:  fractionalshortening and wallmotionscore
t = -32.053, df = 127.12, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -15.09936 -13.34342
sample estimates:
 mean of x  mean of y 
 0.2167339 14.4381250 

3. Descriptive statistics for images

Load Images

library(imager)
library(repr)

options(repr.plot.width = 16, repr.plot.height = 12)

# NOTE(review): hard-coded absolute Windows paths — not portable; consider
# paths relative to the project root.
picture_1_path <- file.path('C:', 'Users', 'pinky', 'Desktop', '6sem', 'ml', 'lab3', 'picture-1.jpg') 
picture_2_path <- file.path('C:', 'Users', 'pinky', 'Desktop', '6sem', 'ml', 'lab3', 'picture-2.jpg')
picture_1 <- load.image(picture_1_path)
picture_2 <- load.image(picture_2_path)

Convert Images into Grayscale Pictures

par(mfrow = c(1,2)) # show original and converted versions side by side

plot(picture_1)
picture_1 <- grayscale(picture_1) # replace with single-channel grayscale version
plot(picture_1)

plot(picture_2)
picture_2 <- grayscale(picture_2)
plot(picture_2)

png

png

Pictures Histograms

par(mfrow = c(1,2))
# Histograms of the grayscale pixel (luminance) values of each picture
picture_1 %>% hist(main = 'Luminance values in picture_1')
picture_2 %>% hist(main = 'Luminance values in picture_2')

png

Mean Values, Standard Deviations, Modes and Medians for Histograms

mode <- function(v) {
   # Statistical mode: the most frequent value in v
   # (on ties, the value that appears first wins).
   # NOTE(review): this masks base::mode(), which reports an object's
   # storage mode — consider renaming to e.g. stat_mode().
   vals <- unique(v)
   counts <- tabulate(match(v, vals))
   vals[which.max(counts)]
}
# Summary table: mean, standard deviation, mode and median of the pixel
# luminance values of each grayscale picture (2 x 4, rows = pictures).
statistics_params_for_pictures <- matrix(c(mean(picture_1), mean(picture_2), 
                                           sd(picture_1), sd(picture_2), 
                                           mode(picture_1), mode(picture_2), 
                                           median(picture_1), median(picture_2)), ncol = 4, nrow = 2)
colnames(statistics_params_for_pictures) <- c('Mean value', 'Standard deviation', 'Mode', 'Median') # fixed label spelling
rownames(statistics_params_for_pictures) <- c('Picture-1', 'Picture-2')
statistics_params_for_pictures <- as.table(statistics_params_for_pictures)
statistics_params_for_pictures
          Mean value Standart deviation      Mode    Median
Picture-1  0.5100096          0.2356856 0.9007059 0.4631373
Picture-2  0.5879227          0.2512607 0.9976863 0.5405882

Pearson Test

Так как статистика Пирсона измеряет разницу между эмпирическим и теоретическим распределениями, то чем больше ее наблюдаемое значение Kнабл, тем сильнее довод против основной гипотезы.

Hypothesises:

  • (H0): The data are normally distributed
  • (HA): The data are not normally distributed
library(nortest) # pearson.test()
# Pearson chi-square normality test on each picture's pixel values;
# adjust = TRUE subtracts 2 degrees of freedom for the estimated mean and sd
pearson.test(picture_1, adjust = TRUE)
pearson.test(picture_2, adjust = TRUE)
	Pearson chi-square normality test

data:  picture_1
P = 88688, p-value < 2.2e-16





	Pearson chi-square normality test

data:  picture_2
P = 125823, p-value < 2.2e-16

The low p-value (< 0.05) indicates that we reject the null hypothesis, i.e. the distribution is not normal.

According to the df (degrees of freedom) calculation formula

# Excerpt from pearson.test's implementation showing how the degrees of
# freedom are derived (dfd = 2 when adjust = TRUE); not meant to be run here.
x <- x[complete.cases(x)]
n <- length(x)
if (adjust) {
    dfd <- 2
} else {
    dfd <- 0
}
df <- n.classes - dfd - 1 

we can get the df values for the two data frames. Because the pictures are the same size, we get df = 260. Using the Excel function ХИ2ОБР (CHIINV) we can calculate the critical value Pc; it equals 205.02.

Result:

P1 > Pc and P2 > Pc, hence picture_1 and picture_2 ARE NOT NORMALLY DISTRIBUTED.


4. Cluster Analysis

Load Libraries

library(scatterplot3d)
library(heplots)
library(cluster)
library(MVN)
library(plotly)
library(klaR)
library(Morpho)
library(caret)
library(mclust)
library(ggplot2)
library(GGally)
library(plyr)
library(psych)
library(factoextra)
library(repr)

options(repr.plot.width = 16, repr.plot.height = 12)

Load Data

# Load the full dataset and drop rows with missing values
dataframe <- read.table('data.csv', header = TRUE,  sep = ',')
dataframe <- na.omit(dataframe)
head(dataframe)
A data.frame: 6 × 12
ABCDEFGHIJKName
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
157.6023821.7132251.6518263.1549147.5025959.35368 8.75338320.603756 6.49143665.02040 8.078522Dum
268.8313424.3147457.2268762.4269143.1883951.6318214.51357573.93534820.22459574.94872 8.554909Dum
326.3912933.5438797.8155062.1877312.4691261.8045233.145005 9.06149926.64690346.1895011.804890Albukerke
467.0704378.4268649.0976348.8777922.9888147.4117185.11171741.58710018.94121664.5046113.156304Eugene
572.7610548.3648468.3673044.1690919.6851668.1071883.20604752.037685 5.54796174.6859012.098432Eugene
659.5459578.8865665.1574721.7116922.0390464.9301554.31572171.587874 7.97914673.8083812.670125Eugene

Prepare Data for Further Analysis

1st class 2nd class 3rd class 1st symptom 2nd symptom 3rd symptom
Dum Eugene Baxan E J D
# Keep the three chosen classes and the three chosen symptom columns (D, E, J)
dataframe <- subset(dataframe, Name %in% c('Dum', 'Eugene', 'Baxan'), select = c(D, E, J, Name))
dataframe$Name <- as.factor(dataframe$Name) # class label as factor
head(dataframe)
A data.frame: 6 × 4
DEJName
<dbl><dbl><dbl><fct>
163.1549147.50258965.02040Dum
262.4269143.18838574.94872Dum
448.8777922.98881464.50461Eugene
544.1690919.68516274.68590Eugene
621.7116922.03904273.80838Eugene
778.19632 8.67190849.24287Baxan

Calculate Standard Statistics Metrics

summary(dataframe)
       D                  E                  J             Name    
 Min.   : 0.03765   Min.   : 0.01439   Min.   :35.01   Baxan :749  
 1st Qu.:44.71768   1st Qu.:10.82736   1st Qu.:56.32   Dum   :749  
 Median :62.92686   Median :18.06381   Median :64.80   Eugene:749  
 Mean   :56.61332   Mean   :19.20471   Mean   :63.92               
 3rd Qu.:71.65353   3rd Qu.:24.29174   3rd Qu.:72.14               
 Max.   :99.98825   Max.   :49.99397   Max.   :86.99               

Cluster Analysis

Get Optimal Numbers of Clusters

# Elbow method: within-cluster sum of squares vs number of clusters;
# the vertical line marks the chosen elbow at k = 2
fviz_nbclust(dataframe[,1:3], kmeans, method = 'wss') + geom_vline(xintercept = 2, linetype = 1)

png

Оптимальное число кластеров соответствует точке перегиба графика. В данном случае лучше разделить записи на 2 кластера. Теперь выполним непосредственно кластерный анализ методом k-средних для 2 кластеров с помощью функции kmeans:

# k-means with k = 2 and 1000 random starts; cross-tabulate clusters vs true class
km <- kmeans(dataframe[,1:3], 2, nstart = 1000)
table(km$cluster, dataframe$Name)
    Baxan Dum Eugene
  1     6   0    599
  2   743 749    150

Rough Accuracy Estimate

Total number of correctly classified instances are: 743 + 749 + 599 = 2091. Total number of incorrectly classified instances are: 6 + 150 = 156. Accuracy = 2091/(2091 + 156) = 0.93 i.e our model has achieved 93% accuracy.

Чтобы убедиться в результатах анализа определим средние значениях всех анализируемых параметров в каждом из кластеров:

# Cluster centroids: per-cluster mean of each of the three features
centroids <- aggregate(dataframe[,1:3], by = list(km$cluster),FUN = mean)
centroids
A data.frame: 2 × 4
Group.1DEJ
<int><dbl><dbl><dbl>
123.9118919.8042973.94881
268.6622618.9838060.22417

Из полученной таблицы различие между записями в разных кластерах уже видно. Теперь присвоим номера кластеров каждой из записей исходного набора данных:

# Append the assigned cluster number (column km.cluster) to each record
dataframe <- data.frame(dataframe, km$cluster)
head(dataframe)
A data.frame: 6 × 5
DEJNamekm.cluster
<dbl><dbl><dbl><fct><int>
163.1549147.50258965.02040Dum 2
262.4269143.18838574.94872Dum 2
448.8777922.98881464.50461Eugene2
544.1690919.68516274.68590Eugene1
621.7116922.03904273.80838Eugene1
778.19632 8.67190849.24287Baxan 2

3D Plot Based on Cluster Group

# 3-D scatter plot of the records, coloured and shaped by cluster;
# the two centroids are overplotted as enlarged points (cex = 3)
colors <- c('rosybrown3', 'paleturquoise4')
point_shapes <- c(15, 16)
s3d <- scatterplot3d(dataframe[,1:3], pch = point_shapes[dataframe$km.cluster], color = colors[dataframe$km.cluster])
s3d$points3d(centroids$D[1], centroids$E[1], centroids$J[1], pch = point_shapes[1], cex = 3)
s3d$points3d(centroids$D[2], centroids$E[2], centroids$J[2], pch = point_shapes[2], cex = 3)
legend('right', legend = unique(dataframe$km.cluster), 
       col = colors[unique(dataframe$km.cluster)], pch = point_shapes[unique(dataframe$km.cluster)])

png

Validation of Clustering Results

We'll use the silhouette coefficient (silhouette width) to evaluate the goodness of our clustering.

The silhouette coefficient is calculated as follows:

For each observation i, we calculate the average dissimilarity between i and all the other points within the cluster to which i belongs; call this value Di.
We then calculate the average dissimilarity between i and each of the other clusters and take the lowest of those values — that is, the dissimilarity between i and the cluster closest to i after its own; call this value Ci.
The silhouette width (Si) is the difference between Ci and Di divided by the greater of the two values (max(Di, Ci)).
Si = (Ci — Di) / max(Di, Ci)

So, the interpretation of the silhouette width is the following:

  • Si > 0 means that the observation is well clustered. The closer it is to 1, the better it is clustered.
  • Si < 0 means that the observation was placed in the wrong cluster.
  • Si = 0 means that the observation is between two clusters.
# Silhouette widths from the cluster assignment and Euclidean distances;
# fviz_silhouette also prints the average width per cluster
silhouette_width <- silhouette(dataframe$km.cluster, dist(dataframe[,1:3]))
fviz_silhouette(silhouette_width)
  cluster size ave.sil.width
1       1  605          0.57
2       2 1642          0.44

png

The silhouette plot above gives us evidence that our clustering using two groups is good because there’s no negative silhouette width and most of the values are bigger than 0.5.


5. Kohonen SOM

Load Libraries

Kohonen SOM Tutorial

library(scatterplot3d)
library(repr)
library(kohonen)
library(heplots)
library(cluster)
library(MVN)
library(plotly)
library(klaR)
library(Morpho)
library(caret)
library(mclust)
library(ggplot2)
library(GGally)
library(plyr)
library(psych)
library(factoextra)
library(GPArotation)
library(RColorBrewer)
library(ggpubr)

options(repr.plot.width = 16, repr.plot.height = 12)

Load Data

# Reload the full dataset and drop rows with missing values
dataframe <- read.table('data.csv', header = TRUE,  sep = ',')
dataframe <- na.omit(dataframe)
head(dataframe)
A data.frame: 6 × 12
ABCDEFGHIJKName
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
157.6023821.7132251.6518263.1549147.5025959.35368 8.75338320.603756 6.49143665.02040 8.078522Dum
268.8313424.3147457.2268762.4269143.1883951.6318214.51357573.93534820.22459574.94872 8.554909Dum
326.3912933.5438797.8155062.1877312.4691261.8045233.145005 9.06149926.64690346.1895011.804890Albukerke
467.0704378.4268649.0976348.8777922.9888147.4117185.11171741.58710018.94121664.5046113.156304Eugene
572.7610548.3648468.3673044.1690919.6851668.1071883.20604752.037685 5.54796174.6859012.098432Eugene
659.5459578.8865665.1574721.7116922.0390464.9301554.31572171.587874 7.97914673.8083812.670125Eugene

Prepare Data for Further Analysis

1st class 2nd class 3rd class 1st symptom 2nd symptom 3rd symptom 4th symptom 5th symptom
Baxan Choluteco Eugene K J G B A
# Keep the three chosen classes and the five chosen symptom columns (K, J, G, B, A)
dataframe <- subset(dataframe, Name %in% c('Choluteco', 'Eugene', 'Baxan'), select = c(K, J, G, B, A, Name))
dataframe$Name <- as.factor(dataframe$Name) # class label as factor
head(dataframe)
A data.frame: 6 × 6
KJGBAName
<dbl><dbl><dbl><dbl><dbl><fct>
413.15630464.5046185.1117278.4268667.07043Eugene
512.09843274.6859083.2060548.3648472.76105Eugene
612.67012573.8083854.3157278.8865659.54595Eugene
7 9.06447949.2428755.8642032.1814443.06241Baxan
811.27695985.9316071.5999663.6894436.47152Eugene
9 9.24931241.7956848.3035939.1419813.87603Baxan

Calculate Standard Statistics Metrics

summary(dataframe)
       K                J               G               B        
 Min.   : 9.002   Min.   :35.01   Min.   :45.01   Min.   :30.03  
 1st Qu.:10.487   1st Qu.:44.90   1st Qu.:55.73   1st Qu.:40.23  
 Median :11.610   Median :55.78   Median :58.37   Median :43.18  
 Mean   :11.490   Mean   :57.96   Mean   :62.34   Mean   :47.45  
 3rd Qu.:12.504   3rd Qu.:70.02   3rd Qu.:64.07   3rd Qu.:53.21  
 Max.   :14.000   Max.   :86.99   Max.   :99.93   Max.   :79.98  
       A                Name    
 Min.   :10.06   Baxan    :749  
 1st Qu.:40.69   Choluteco:749  
 Median :57.08   Eugene   :749  
 Mean   :56.91                  
 3rd Qu.:77.44                  
 Max.   :84.99                  

Prepare Training Data

# Standardise the five features (zero mean, unit variance) for SOM training
dataframe_train_matrix <- as.matrix(scale(dataframe[,1:5]))

Train SOM

set.seed(100) # reproducible SOM training
# 6x6 hexagonal map; learning rate decays from 0.05 to 0.01 over 5000 iterations
som_grid = somgrid(xdim = 6, ydim = 6, topo = 'hexagonal')
som_model <- som(dataframe_train_matrix,
                 grid = som_grid,
                 rlen = 5000,
                 alpha = c(0.05, 0.01),
                 keep.data = TRUE)
plot(som_model, type = 'changes') # training progress curve

png

set.seed(100)
# Elbow plot for clustering the SOM codebook vectors
clusterdata <- getCodes(som_model) # codebook (unit weight) vectors
wss <- (nrow(clusterdata) - 1) * sum(apply(clusterdata, 2, var)) # WCSS for k = 1

for (i in 2:35) { # i must stay below the 6*6 = 36 grid units defined above
  wss[i] <- sum(kmeans(clusterdata, centers = i)$withinss)
}

par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(wss, type = 'l',
     xlab = 'Number of Clusters',
     ylab = 'Within groups sum of squares',
     main = 'Within cluster sum of squares (WCSS)')
abline(v = 3, col = 'red') # chosen number of clusters, k = 3

png

set.seed(100)
# Cluster the SOM codebook vectors with k-means (k = 3), then map every
# observation to the cluster of its best-matching SOM unit
fit_kmeans <- kmeans(clusterdata[,1:5], 3)
cl_assignment_k <- fit_kmeans$cluster[som_model$unit.classif] 
#The above is to assign units to clusters based on their class-id (code-id) in the SOM model.
dataframe$clustersKm <- cl_assignment_k #back to original data.
# Encode the true class as 1/2/3 so it can be compared against the cluster labels
dataframe$trueN <- ifelse(dataframe$Name == 'Baxan',
                          1,
                          ifelse(dataframe$Name == 'Choluteco',
                                 2,
                                 ifelse(dataframe$Name == 'Eugene',
                                        3,
                                        NA
                                       )
                                )
                         )
head(dataframe)
dataframe$testkm <- 1 #make everything "wrong" (i.e. 1).
#Cluster assignments can only be right in one way, but wrong in k-1 ways,
#so it is easier to mark the "rights" (0) than the "wrongs".
dataframe$testkm[dataframe$clustersKm == 2 & dataframe$trueN == 1] <- 0 #cluster 2 corresponds to true class 1
dataframe$testkm[dataframe$clustersKm == 3 & dataframe$trueN == 2] <- 0 #cluster 3 corresponds to true class 2
dataframe$testkm[dataframe$clustersKm == 1 & dataframe$trueN == 3] <- 0 #cluster 1 corresponds to true class 3
testKM <- sum(dataframe$testkm) # count of misclassified rows
# NOTE(review): the denominator 36 is the SOM grid size, but testKM counts
# misclassified ROWS of dataframe — presumably this should divide by
# nrow(dataframe); confirm the intended accuracy definition. Also note the
# misspelled output label "accurancy".
as.matrix(list(accurancy = 1 - (testKM / 36)))
A data.frame: 6 × 8
KJGBANameclustersKmtrueN
<dbl><dbl><dbl><dbl><dbl><fct><int><dbl>
413.15630464.5046185.1117278.4268667.07043Eugene13
512.09843274.6859083.2060548.3648472.76105Eugene13
612.67012573.8083854.3157278.8865659.54595Eugene13
7 9.06447949.2428755.8642032.1814443.06241Baxan 21
811.27695985.9316071.5999663.6894436.47152Eugene13
9 9.24931241.7956848.3035939.1419813.87603Baxan 21
A matrix: 1 × 1
accurancy0.9444444

Clustering

ACCORDING TO THE GRAPHICS PRINTED ABOVE, THE OPTIMAL VALUE OF k EQUALS 3

# Hierarchical clustering of the codebook vectors, cut into k = 3 clusters
som_cluster <- cutree(hclust(dist(clusterdata)), k = 3)

palette <- c('#2ca02c', '#d62728', '#9467bd')

par(mfrow = c(1,2)) # plain codes map next to the cluster-coloured map
plot(som_model, type = 'codes') 
plot(som_model, type = 'codes', bgcol = palette[som_cluster], main = 'Clusters') 
add.cluster.boundaries(som_model, som_cluster, col = 'darkblue') # outline cluster borders

png

About

SA using R

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors