Skip to content
This repository was archived by the owner on Sep 28, 2021. It is now read-only.

pinkyfox/System-Analysis

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

13 Commits
 
 
 
 

Repository files navigation

System-Analysis

INDEX

  1. Regression | LW#1
  2. Mathematical expectation, dispersion, standard deviation | LW#2
  3. Descriptive statistics for images | LW#3
  4. Cluster Analysis | LW#4
  5. Kohonen SOM | LW#5

1. Regression

Upload Data

# Load the echocardiogram dataset (sep and dec are the read.csv defaults, listed explicitly)
data <- read.csv('echocardiogram_1.csv', sep = ',', dec = '.')

Preparing Data Frame

# Keep only columns 5 and 8 (fractionalshortening, wallmotionscore) and drop incomplete rows
data <- subset(data, select = c(5, 8))
data <- na.omit(data)
head(data)
A data.frame: 6 × 2
fractionalshorteningwallmotionscore
<dbl><dbl>
10.26014
20.38014
30.26014
40.25316
50.16018
60.26012

Data Plot

# Scatter plot: wall motion score (x) against fractional shortening (y)
plot(data$wallmotionscore, data$fractionalshortening, pch = 16, col = 'blue')

png

Calculate Linear Regression

# Fit a simple linear regression of fractionalshortening on wallmotionscore
regression <- lm(fractionalshortening ~ wallmotionscore, data = data)
summary(regression)
# Subtitle showing the fitted equation (coef[2] = slope, coef[1] = intercept)
sub <- paste('fractionalshortening = ', coef(regression)[2], ' * wallmotionscore + ', coef(regression)[1])
plot(data$fractionalshortening ~ data$wallmotionscore, pch = 16, col = 'blue', sub = sub)
# Overlay the fitted regression line
abline(coef(regression)[1:2])
Call:
lm(formula = fractionalshortening ~ wallmotionscore, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.18453 -0.07267 -0.00884  0.05202  0.38753 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.273106   0.031100   8.782 1.34e-14 ***
wallmotionscore -0.003895   0.002042  -1.908   0.0588 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1046 on 120 degrees of freedom
Multiple R-squared:  0.02943,	Adjusted R-squared:  0.02134 
F-statistic: 3.639 on 1 and 120 DF,  p-value: 0.05883

png

Calculate Correlation

# Pearson correlation between the two variables
cor(data$fractionalshortening, data$wallmotionscore)

-0.171557734476155

Calculate Polynomial Regression

# Fit a degree-2 polynomial regression using raw (non-orthogonal) polynomial terms
polynomial_regression <- lm(fractionalshortening ~ poly(wallmotionscore, 2, raw = TRUE), data = data)
coef(summary(polynomial_regression))
# Subtitle showing the fitted quadratic equation
sub <- paste("fractionalshortening = ", coef(polynomial_regression)[3], " * wallmotionscore^2 + ", coef(polynomial_regression)[2],  " * wallmotionscore + ",coef(polynomial_regression)[1])
plot(data$fractionalshortening ~ data$wallmotionscore, pch = 16, col = 'blue', sub = sub)
# NOTE(review): this draws the LINEAR model's line (`regression` from above), not the
# polynomial fit — presumably for comparison; confirm this is intentional.
abline(coef(regression)[1:2])
# Red curve: the polynomial fit, plotted in x-sorted order so the line is drawn left to right
lines(sort(data$wallmotionscore), fitted(polynomial_regression)[order(data$wallmotionscore)], col='red', type='l')
A matrix: 3 × 4 of type dbl
EstimateStd. Errort valuePr(>|t|)
(Intercept) 0.19179372930.0684112224 2.80354190.005903814
poly(wallmotionscore, 2, raw = TRUE)1 0.00604215910.0077255844 0.78209740.435710788
poly(wallmotionscore, 2, raw = TRUE)2-0.00027091610.0002031897-1.33331620.184974802

png


2. Mathematical expectation, dispersion, standard deviation

Load Data

library(repr)

# Enlarge the default notebook plot size
options(repr.plot.width = 16, repr.plot.height = 12)

data_frame <- read.csv('echocardiogram.csv', sep = ',', dec = '.')
# Keep columns 5 and 8 (fractionalshortening, wallmotionscore)
data_frame <- subset(data_frame, select = c(5, 8))
data_frame_without_na <- na.omit(data_frame)
# Drop NAs per column (the two vectors may end up with different lengths)
fractionalshortening <- na.omit(data_frame$fractionalshortening)
wallmotionscore <- na.omit(data_frame$wallmotionscore)

Create Histograms

par(mfrow = c(1, 2)) # place the two histograms side by side
hist(fractionalshortening, freq = TRUE, right = F, col = 'seagreen', main = 'Fractional shortening')
hist(wallmotionscore, freq = TRUE, right = F, col = 'slateblue1', main = 'Wallmotion score')

png

Calculate Mean Values, Dispersions and Standard Deviations

# Mean, variance (dispersion) and standard deviation of both variables,
# arranged as a 2 x 3 table (rows = variables, columns = statistics).
statistics_params <- matrix(c(mean(fractionalshortening), mean(wallmotionscore), # mean values
                        var(fractionalshortening), var(wallmotionscore),         # variance (dispersion)
                        sd(fractionalshortening), sd(wallmotionscore)), ncol = 3, nrow = 2) # standard deviation
colnames(statistics_params) <- c('Mean value', 'Dispersion', 'Standard deviation') # fixed label spelling
rownames(statistics_params) <- c('Fractional shortening', 'Wallmotion score')
statistics_params
A matrix: 2 × 3 of type dbl
Mean valueDespersionStandart deviation
Fractional shortening 0.2167339 0.011559010.1075128
Wallmotion score14.438125025.186009065.0185664

Confidence Intervals for Mean Values and Dispersions

library(DescTools) # provides MeanCI() and VarCI() for confidence intervals

# 95% confidence intervals for the means (MeanCI returns estimate, lower, upper)
ci_for_means <- matrix(c(MeanCI(fractionalshortening),
                         MeanCI(wallmotionscore)), ncol = 3, byrow = TRUE)
colnames(ci_for_means) <- c("Mean value", "Lower CI", "Upper CI")
rownames(ci_for_means) <- c("Fractional shortening", "Wallmotion score")

# 95% confidence intervals for the variances
ci_for_vars <- matrix(c(VarCI(fractionalshortening),
                        VarCI(wallmotionscore)), ncol = 3, byrow = TRUE)
colnames(ci_for_vars) <- c("Dispersion", "Lower CI", "Upper CI") # fixed label spelling
rownames(ci_for_vars) <- c("Fractional shortening", "Wallmotion score")

ci_for_means
ci_for_vars
A matrix: 2 × 3 of type dbl
Mean valueLower CIUpper CI
Fractional shortening 0.2167339 0.1976225 0.2358452
Wallmotion score14.438125013.560354715.3158953
A matrix: 2 × 3 of type dbl
DespersionLower CIUpper CI
Fractional shortening 0.01155901 0.009137898 0.01509378
Wallmotion score25.1860090619.98067850332.73974143

Student

  • Dispersions are known
t.testk <- function(x, y, v_x, v_y, m0 = 0, alpha = 0.05, alternative = 'two.sided') {
    # Two-sample test for a difference of means when the variances are KNOWN
    # (a z-test: the statistic is compared against normal quantiles).
    #
    # x, y        - numeric samples
    # v_x, v_y    - known variances of x and y
    # m0          - hypothesised difference of means (default 0)
    # alpha       - significance level used for the confidence limits
    # alternative - 'two.sided', 'less', or anything else for 'greater'
    #
    # Returns a 1 x 11 matrix; the 'alternative' string in the last column
    # coerces the whole matrix to character.
    n_x <- length(x)
    n_y <- length(y)
    mx <- mean(x)
    my <- mean(y)
    
    se <- sqrt(v_x / n_x + v_y / n_y)  # standard error of the mean difference
    z <- (mx - my - m0) / se
    
    p <- switch(alternative,
                two.sided = 2 * pnorm(abs(z), lower.tail = FALSE),
                less      = pnorm(z, lower.tail = TRUE),
                pnorm(z, lower.tail = FALSE))  # default case: 'greater'
    
    # Two-sided (1 - alpha) confidence limits for the mean difference
    half_width <- se * qnorm(1 - alpha / 2)
    LCL <- mx - my - half_width
    UCL <- mx - my + half_width
    
    out <- matrix(c(mx, my, m0, sqrt(v_x), sqrt(v_y), se, z, p, LCL, UCL, alternative),
                  ncol = 11, byrow = TRUE)
    colnames(out) <- c('mean_x', 'mean_y', 'm0', 'sd_x', 'sd_y', 
                       's', 'statistic', 'p.value', 'LCL', 'UCL', 'alternative')
    out
}

# Known-variance test with assumed variances 1 and 0.78
t.testk(fractionalshortening, wallmotionscore, 1, 0.78)
A matrix: 1 × 11 of type chr
mean_xmean_ym0sd_xsd_ysstatisticp.valueLCLUCLalternative
0.21673387096774214.438125010.8831760866327850.118988512592738-119.5190259895740-14.454604328288-13.9881779297765two.sided
  • Dispersions are unknown
# Unknown variances: Welch two-sample t-test (variances not assumed equal)
t.test(fractionalshortening, wallmotionscore)
	Welch Two Sample t-test

data:  fractionalshortening and wallmotionscore
t = -32.053, df = 127.12, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -15.09936 -13.34342
sample estimates:
 mean of x  mean of y 
 0.2167339 14.4381250 

3. Descriptive statistics for images

Load Images

library(imager)
library(repr)

options(repr.plot.width = 16, repr.plot.height = 12)

# NOTE(review): hard-coded absolute Windows paths — not portable; consider
# paths relative to the project root.
picture_1_path <- file.path('C:', 'Users', 'pinky', 'Desktop', '6sem', 'ml', 'lab3', 'picture-1.jpg') 
picture_2_path <- file.path('C:', 'Users', 'pinky', 'Desktop', '6sem', 'ml', 'lab3', 'picture-2.jpg')
picture_1 <- load.image(picture_1_path)
picture_2 <- load.image(picture_2_path)

Convert Images into Grayscale Pictures

par(mfrow = c(1,2)) # show original and converted versions side by side

plot(picture_1)
picture_1 <- grayscale(picture_1) # replace with single-channel grayscale version
plot(picture_1)

plot(picture_2)
picture_2 <- grayscale(picture_2)
plot(picture_2)

png

png

Pictures Histograms

par(mfrow = c(1,2))
# Histograms of the grayscale pixel (luminance) values of each picture
picture_1 %>% hist(main = 'Luminance values in picture_1')
picture_2 %>% hist(main = 'Luminance values in picture_2')

png

Mean Values, Standard Deviations, Modes and Medians for Histograms

mode <- function(v) {
   # Statistical mode: the most frequent value in v
   # (on ties, the value that appears first wins).
   # NOTE(review): this masks base::mode(), which reports an object's
   # storage mode — consider renaming to e.g. stat_mode().
   vals <- unique(v)
   counts <- tabulate(match(v, vals))
   vals[which.max(counts)]
}
# Summary table: mean, standard deviation, mode and median of the pixel
# luminance values of each grayscale picture (2 x 4, rows = pictures).
statistics_params_for_pictures <- matrix(c(mean(picture_1), mean(picture_2), 
                                           sd(picture_1), sd(picture_2), 
                                           mode(picture_1), mode(picture_2), 
                                           median(picture_1), median(picture_2)), ncol = 4, nrow = 2)
colnames(statistics_params_for_pictures) <- c('Mean value', 'Standard deviation', 'Mode', 'Median') # fixed label spelling
rownames(statistics_params_for_pictures) <- c('Picture-1', 'Picture-2')
statistics_params_for_pictures <- as.table(statistics_params_for_pictures)
statistics_params_for_pictures
          Mean value Standart deviation      Mode    Median
Picture-1  0.5100096          0.2356856 0.9007059 0.4631373
Picture-2  0.5879227          0.2512607 0.9976863 0.5405882

Pearson Test

Так как статистика Пирсона измеряет разницу между эмпирическим и теоретическим распределениями, то чем больше ее наблюдаемое значение Kнабл, тем сильнее довод против основной гипотезы.

Hypothesises:

  • (H0): The data are normally distributed
  • (HA): The data are not normally distributed
library(nortest) # pearson.test()
# Pearson chi-square normality test on each picture's pixel values;
# adjust = TRUE subtracts 2 degrees of freedom for the estimated mean and sd
pearson.test(picture_1, adjust = TRUE)
pearson.test(picture_2, adjust = TRUE)
	Pearson chi-square normality test

data:  picture_1
P = 88688, p-value < 2.2e-16





	Pearson chi-square normality test

data:  picture_2
P = 125823, p-value < 2.2e-16

The low p-value (< 0.05) indicates that we reject the null hypothesis, i.e. the distribution is not normal.

According to the df (degrees of freedom) calculation formula

# Excerpt from pearson.test's implementation showing how the degrees of
# freedom are derived (dfd = 2 when adjust = TRUE); not meant to be run here.
x <- x[complete.cases(x)]
n <- length(x)
if (adjust) {
    dfd <- 2
} else {
    dfd <- 0
}
df <- n.classes - dfd - 1 

we can get the df values for the two data frames. Because the pictures are the same size, we get df = 260. Using the Excel function ХИ2ОБР (CHIINV) we can calculate the critical value Pc; it equals 205.02.

Result:

P1 > Pc and P2 > Pc, hence picture_1 and picture_2 ARE NOT NORMALLY DISTRIBUTED.


4. Cluster Analysis

Load Libraries

library(scatterplot3d)
library(heplots)
library(cluster)
library(MVN)
library(plotly)
library(klaR)
library(Morpho)
library(caret)
library(mclust)
library(ggplot2)
library(GGally)
library(plyr)
library(psych)
library(factoextra)
library(repr)

options(repr.plot.width = 16, repr.plot.height = 12)

Load Data

# Load the full dataset and drop rows with missing values
dataframe <- read.table('data.csv', header = TRUE,  sep = ',')
dataframe <- na.omit(dataframe)
head(dataframe)
A data.frame: 6 × 12
ABCDEFGHIJKName
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
157.6023821.7132251.6518263.1549147.5025959.35368 8.75338320.603756 6.49143665.02040 8.078522Dum
268.8313424.3147457.2268762.4269143.1883951.6318214.51357573.93534820.22459574.94872 8.554909Dum
326.3912933.5438797.8155062.1877312.4691261.8045233.145005 9.06149926.64690346.1895011.804890Albukerke
467.0704378.4268649.0976348.8777922.9888147.4117185.11171741.58710018.94121664.5046113.156304Eugene
572.7610548.3648468.3673044.1690919.6851668.1071883.20604752.037685 5.54796174.6859012.098432Eugene
659.5459578.8865665.1574721.7116922.0390464.9301554.31572171.587874 7.97914673.8083812.670125Eugene

Prepare Data for Further Analysis

1st class 2nd class 3rd class 1st symptom 2nd symptom 3rd symptom
Dum Eugene Baxan E J D
# Keep the three chosen classes and the three chosen symptom columns (D, E, J)
dataframe <- subset(dataframe, Name %in% c('Dum', 'Eugene', 'Baxan'), select = c(D, E, J, Name))
dataframe$Name <- as.factor(dataframe$Name) # class label as factor
head(dataframe)
A data.frame: 6 × 4
DEJName
<dbl><dbl><dbl><fct>
163.1549147.50258965.02040Dum
262.4269143.18838574.94872Dum
448.8777922.98881464.50461Eugene
544.1690919.68516274.68590Eugene
621.7116922.03904273.80838Eugene
778.19632 8.67190849.24287Baxan

Calculate Standard Statistics Metrics

summary(dataframe)
       D                  E                  J             Name    
 Min.   : 0.03765   Min.   : 0.01439   Min.   :35.01   Baxan :749  
 1st Qu.:44.71768   1st Qu.:10.82736   1st Qu.:56.32   Dum   :749  
 Median :62.92686   Median :18.06381   Median :64.80   Eugene:749  
 Mean   :56.61332   Mean   :19.20471   Mean   :63.92               
 3rd Qu.:71.65353   3rd Qu.:24.29174   3rd Qu.:72.14               
 Max.   :99.98825   Max.   :49.99397   Max.   :86.99               

Cluster Analysis

Get Optimal Numbers of Clusters

# Elbow method: within-cluster sum of squares vs number of clusters;
# the vertical line marks the chosen elbow at k = 2
fviz_nbclust(dataframe[,1:3], kmeans, method = 'wss') + geom_vline(xintercept = 2, linetype = 1)

png

Оптимальное число кластеров соответствует точке перегиба графика. В данном случае лучше разделить записи на 2 кластера. Теперь выполним непосредственно кластерный анализ методом k-средних для 2 кластеров с помощью функции kmeans:

# k-means with k = 2 and 1000 random starts; cross-tabulate clusters vs true class
km <- kmeans(dataframe[,1:3], 2, nstart = 1000)
table(km$cluster, dataframe$Name)
    Baxan Dum Eugene
  1     6   0    599
  2   743 749    150

Rough Accuracy Estimate

Total number of correctly classified instances are: 743 + 749 + 599 = 2091. Total number of incorrectly classified instances are: 6 + 150 = 156. Accuracy = 2091/(2091 + 156) = 0.93 i.e our model has achieved 93% accuracy.

Чтобы убедиться в результатах анализа определим средние значениях всех анализируемых параметров в каждом из кластеров:

# Cluster centroids: per-cluster mean of each of the three features
centroids <- aggregate(dataframe[,1:3], by = list(km$cluster),FUN = mean)
centroids
A data.frame: 2 × 4
Group.1DEJ
<int><dbl><dbl><dbl>
123.9118919.8042973.94881
268.6622618.9838060.22417

Из полученной таблицы различие между записями в разных кластерах уже видно. Теперь присвоим номера кластеров каждой из записей исходного набора данных:

# Append the assigned cluster number (column km.cluster) to each record
dataframe <- data.frame(dataframe, km$cluster)
head(dataframe)
A data.frame: 6 × 5
DEJNamekm.cluster
<dbl><dbl><dbl><fct><int>
163.1549147.50258965.02040Dum 2
262.4269143.18838574.94872Dum 2
448.8777922.98881464.50461Eugene2
544.1690919.68516274.68590Eugene1
621.7116922.03904273.80838Eugene1
778.19632 8.67190849.24287Baxan 2

3D Plot Based on Cluster Group

# 3-D scatter plot of the records, coloured and shaped by cluster;
# the two centroids are overplotted as enlarged points (cex = 3)
colors <- c('rosybrown3', 'paleturquoise4')
point_shapes <- c(15, 16)
s3d <- scatterplot3d(dataframe[,1:3], pch = point_shapes[dataframe$km.cluster], color = colors[dataframe$km.cluster])
s3d$points3d(centroids$D[1], centroids$E[1], centroids$J[1], pch = point_shapes[1], cex = 3)
s3d$points3d(centroids$D[2], centroids$E[2], centroids$J[2], pch = point_shapes[2], cex = 3)
legend('right', legend = unique(dataframe$km.cluster), 
       col = colors[unique(dataframe$km.cluster)], pch = point_shapes[unique(dataframe$km.cluster)])

png

Validation of Clustering Results

We'll use the silhouette coefficient (silhouette width) to evaluate the goodness of our clustering.

The silhouette coefficient is calculated as follows:

For each observation i, we calculate the average dissimilarity between i and all the other points within the cluster to which i belongs; call this value Di.
We then calculate the average dissimilarity between i and each of the other clusters and take the lowest of those values — that is, the dissimilarity between i and the cluster closest to i after its own; call this value Ci.
The silhouette width (Si) is the difference between Ci and Di divided by the greater of the two values (max(Di, Ci)).
Si = (Ci — Di) / max(Di, Ci)

So, the interpretation of the silhouette width is the following:

  • Si > 0 means that the observation is well clustered. The closer it is to 1, the better it is clustered.
  • Si < 0 means that the observation was placed in the wrong cluster.
  • Si = 0 means that the observation is between two clusters.
# Silhouette widths from the cluster assignment and Euclidean distances;
# fviz_silhouette also prints the average width per cluster
silhouette_width <- silhouette(dataframe$km.cluster, dist(dataframe[,1:3]))
fviz_silhouette(silhouette_width)
  cluster size ave.sil.width
1       1  605          0.57
2       2 1642          0.44

png

The silhouette plot above gives us evidence that our clustering using two groups is good because there’s no negative silhouette width and most of the values are bigger than 0.5.


5. Kohonen SOM

Load Libraries

Kohonen SOM Tutorial

library(scatterplot3d)
library(repr)
library(kohonen)
library(heplots)
library(cluster)
library(MVN)
library(plotly)
library(klaR)
library(Morpho)
library(caret)
library(mclust)
library(ggplot2)
library(GGally)
library(plyr)
library(psych)
library(factoextra)
library(GPArotation)
library(RColorBrewer)
library(ggpubr)

options(repr.plot.width = 16, repr.plot.height = 12)

Load Data

# Reload the full dataset and drop rows with missing values
dataframe <- read.table('data.csv', header = TRUE,  sep = ',')
dataframe <- na.omit(dataframe)
head(dataframe)
A data.frame: 6 × 12
ABCDEFGHIJKName
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
157.6023821.7132251.6518263.1549147.5025959.35368 8.75338320.603756 6.49143665.02040 8.078522Dum
268.8313424.3147457.2268762.4269143.1883951.6318214.51357573.93534820.22459574.94872 8.554909Dum
326.3912933.5438797.8155062.1877312.4691261.8045233.145005 9.06149926.64690346.1895011.804890Albukerke
467.0704378.4268649.0976348.8777922.9888147.4117185.11171741.58710018.94121664.5046113.156304Eugene
572.7610548.3648468.3673044.1690919.6851668.1071883.20604752.037685 5.54796174.6859012.098432Eugene
659.5459578.8865665.1574721.7116922.0390464.9301554.31572171.587874 7.97914673.8083812.670125Eugene

Prepare Data for Further Analysis

1st class 2nd class 3rd class 1st symptom 2nd symptom 3rd symptom 4th symptom 5th symptom
Baxan Choluteco Eugene K J G B A
# Keep the three chosen classes and the five chosen symptom columns (K, J, G, B, A)
dataframe <- subset(dataframe, Name %in% c('Choluteco', 'Eugene', 'Baxan'), select = c(K, J, G, B, A, Name))
dataframe$Name <- as.factor(dataframe$Name) # class label as factor
head(dataframe)
A data.frame: 6 × 6
KJGBAName
<dbl><dbl><dbl><dbl><dbl><fct>
413.15630464.5046185.1117278.4268667.07043Eugene
512.09843274.6859083.2060548.3648472.76105Eugene
612.67012573.8083854.3157278.8865659.54595Eugene
7 9.06447949.2428755.8642032.1814443.06241Baxan
811.27695985.9316071.5999663.6894436.47152Eugene
9 9.24931241.7956848.3035939.1419813.87603Baxan

Calculate Standard Statistics Metrics

summary(dataframe)
       K                J               G               B        
 Min.   : 9.002   Min.   :35.01   Min.   :45.01   Min.   :30.03  
 1st Qu.:10.487   1st Qu.:44.90   1st Qu.:55.73   1st Qu.:40.23  
 Median :11.610   Median :55.78   Median :58.37   Median :43.18  
 Mean   :11.490   Mean   :57.96   Mean   :62.34   Mean   :47.45  
 3rd Qu.:12.504   3rd Qu.:70.02   3rd Qu.:64.07   3rd Qu.:53.21  
 Max.   :14.000   Max.   :86.99   Max.   :99.93   Max.   :79.98  
       A                Name    
 Min.   :10.06   Baxan    :749  
 1st Qu.:40.69   Choluteco:749  
 Median :57.08   Eugene   :749  
 Mean   :56.91                  
 3rd Qu.:77.44                  
 Max.   :84.99                  

Prepare Training Data

# Standardise the five features (zero mean, unit variance) for SOM training
dataframe_train_matrix <- as.matrix(scale(dataframe[,1:5]))

Train SOM

set.seed(100) # reproducible SOM training
# 6x6 hexagonal map; learning rate decays from 0.05 to 0.01 over 5000 iterations
som_grid = somgrid(xdim = 6, ydim = 6, topo = 'hexagonal')
som_model <- som(dataframe_train_matrix,
                 grid = som_grid,
                 rlen = 5000,
                 alpha = c(0.05, 0.01),
                 keep.data = TRUE)
plot(som_model, type = 'changes') # training progress curve

png

set.seed(100)
# Elbow plot for clustering the SOM codebook vectors
clusterdata <- getCodes(som_model) # codebook (unit weight) vectors
wss <- (nrow(clusterdata) - 1) * sum(apply(clusterdata, 2, var)) # WCSS for k = 1

for (i in 2:35) { # i must stay below the 6*6 = 36 grid units defined above
  wss[i] <- sum(kmeans(clusterdata, centers = i)$withinss)
}

par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(wss, type = 'l',
     xlab = 'Number of Clusters',
     ylab = 'Within groups sum of squares',
     main = 'Within cluster sum of squares (WCSS)')
abline(v = 3, col = 'red') # chosen number of clusters, k = 3

png

set.seed(100)
# Cluster the SOM codebook vectors with k-means (k = 3), then map every
# observation to the cluster of its best-matching SOM unit
fit_kmeans <- kmeans(clusterdata[,1:5], 3)
cl_assignment_k <- fit_kmeans$cluster[som_model$unit.classif] 
#The above is to assign units to clusters based on their class-id (code-id) in the SOM model.
dataframe$clustersKm <- cl_assignment_k #back to original data.
# Encode the true class as 1/2/3 so it can be compared against the cluster labels
dataframe$trueN <- ifelse(dataframe$Name == 'Baxan',
                          1,
                          ifelse(dataframe$Name == 'Choluteco',
                                 2,
                                 ifelse(dataframe$Name == 'Eugene',
                                        3,
                                        NA
                                       )
                                )
                         )
head(dataframe)
dataframe$testkm <- 1 #make everything "wrong" (i.e. 1).
#Cluster assignments can only be right in one way, but wrong in k-1 ways,
#so it is easier to mark the "rights" (0) than the "wrongs".
dataframe$testkm[dataframe$clustersKm == 2 & dataframe$trueN == 1] <- 0 #cluster 2 corresponds to true class 1
dataframe$testkm[dataframe$clustersKm == 3 & dataframe$trueN == 2] <- 0 #cluster 3 corresponds to true class 2
dataframe$testkm[dataframe$clustersKm == 1 & dataframe$trueN == 3] <- 0 #cluster 1 corresponds to true class 3
testKM <- sum(dataframe$testkm) # count of misclassified rows
# NOTE(review): the denominator 36 is the SOM grid size, but testKM counts
# misclassified ROWS of dataframe — presumably this should divide by
# nrow(dataframe); confirm the intended accuracy definition. Also note the
# misspelled output label "accurancy".
as.matrix(list(accurancy = 1 - (testKM / 36)))
A data.frame: 6 × 8
KJGBANameclustersKmtrueN
<dbl><dbl><dbl><dbl><dbl><fct><int><dbl>
413.15630464.5046185.1117278.4268667.07043Eugene13
512.09843274.6859083.2060548.3648472.76105Eugene13
612.67012573.8083854.3157278.8865659.54595Eugene13
7 9.06447949.2428755.8642032.1814443.06241Baxan 21
811.27695985.9316071.5999663.6894436.47152Eugene13
9 9.24931241.7956848.3035939.1419813.87603Baxan 21
A matrix: 1 × 1
accurancy0.9444444

Clustering

ACCORDING TO THE GRAPHICS PRINTED ABOVE, THE OPTIMAL VALUE OF k EQUALS 3

# Hierarchical clustering of the codebook vectors, cut into k = 3 clusters
som_cluster <- cutree(hclust(dist(clusterdata)), k = 3)

palette <- c('#2ca02c', '#d62728', '#9467bd')

par(mfrow = c(1,2)) # plain codes map next to the cluster-coloured map
plot(som_model, type = 'codes') 
plot(som_model, type = 'codes', bgcol = palette[som_cluster], main = 'Clusters') 
add.cluster.boundaries(som_model, som_cluster, col = 'darkblue') # outline cluster borders

png

About

SA using R

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors