Skip to content
Snippets Groups Projects
Commit 9c4756ed authored by Samuel's avatar Samuel
Browse files

ANOVA -testi ja Silhouette Score -testi

parent 7c222bbe
No related branches found
No related tags found
No related merge requests found
...@@ -3,6 +3,8 @@ import pandas as pd ...@@ -3,6 +3,8 @@ import pandas as pd
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler #StandardScaler datan normalisointi, eli skaalaus from sklearn.preprocessing import StandardScaler #StandardScaler datan normalisointi, eli skaalaus
import matplotlib.pyplot as plt #matplotlib visualisointiin import matplotlib.pyplot as plt #matplotlib visualisointiin
from sklearn.metrics import silhouette_score
from scipy.stats import f_oneway
# Ladataan raakadata # Ladataan raakadata
data = pd.read_csv("raakadata.csv") data = pd.read_csv("raakadata.csv")
...@@ -44,3 +46,19 @@ print(indicators) ...@@ -44,3 +46,19 @@ print(indicators)
summary = data[["Height", "Circumference", "BarkThickness", "PineNo", "NeedleNo"]].describe() summary = data[["Height", "Circumference", "BarkThickness", "PineNo", "NeedleNo"]].describe()
print(summary) print(summary)
# Silhouette Score for the raw-data clustering: scores the scaled feature
# matrix against the labels stored in the "Cluster_Raw" column.
raw_labels = data["Cluster_Raw"]
silhouette_raw = silhouette_score(X_scaled, labels=raw_labels)
print(f"Raakadatan klusteroinnin Silhouette Score: {silhouette_raw:.3f}")
# One-way ANOVA per feature across the raw-data clusters.
print("\nANOVA-testit raakadatan klusteroinnille:")
# The set of cluster labels does not change inside the loop, so compute it once.
raw_cluster_labels = sorted(data["Cluster_Raw"].unique())
for feature in features:
    # One sample of this feature's values per cluster, in ascending label order.
    samples = [
        data.loc[data["Cluster_Raw"] == label, feature]
        for label in raw_cluster_labels
    ]
    # Only the p-value is reported; the F statistic is discarded.
    _, p_value = f_oneway(*samples)
    print(f"{feature}: p-arvo = {p_value:.2e}")
\ No newline at end of file
...@@ -4,6 +4,8 @@ from sklearn.cluster import KMeans ...@@ -4,6 +4,8 @@ from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler # StandardScaler datan normalisointi, eli skaalaus from sklearn.preprocessing import StandardScaler # StandardScaler datan normalisointi, eli skaalaus
import matplotlib.pyplot as plt # matplotlib visualisointiin, lataa gitillä "pip install matplotlib" import matplotlib.pyplot as plt # matplotlib visualisointiin, lataa gitillä "pip install matplotlib"
import numpy as np # Liian kaukaisten pisteiden poistamiseksi kuvaajasta import numpy as np # Liian kaukaisten pisteiden poistamiseksi kuvaajasta
from sklearn.metrics import silhouette_score
from scipy.stats import f_oneway
# Ladataan raakadata # Ladataan raakadata
data = pd.read_csv("raakadata.csv") data = pd.read_csv("raakadata.csv")
...@@ -99,6 +101,10 @@ clusters = kmeans.fit_predict(X_scaled) ...@@ -99,6 +101,10 @@ clusters = kmeans.fit_predict(X_scaled)
# Lisätään klusterit dataan # Lisätään klusterit dataan
data['Cluster'] = clusters data['Cluster'] = clusters
# Silhouette Score for the clustering built on the new features: scores the
# scaled feature matrix against the labels returned by kmeans.fit_predict.
silhouette_new_features = silhouette_score(X_scaled, labels=clusters)
print(
    f"Uusilla ominaisuuksilla klusteroidun datan Silhouette Score: {silhouette_new_features:.3f}"
)
# PCA # PCA
pca = PCA(n_components=2) pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled) X_pca = pca.fit_transform(X_scaled)
...@@ -121,3 +127,11 @@ legend1 = plt.legend(*scatter.legend_elements(), title="Klusterit") ...@@ -121,3 +127,11 @@ legend1 = plt.legend(*scatter.legend_elements(), title="Klusterit")
plt.gca().add_artist(legend1) plt.gca().add_artist(legend1)
plt.show() plt.show()
# One-way ANOVA per feature across the clusters of the new-feature clustering.
anova_features = features  # same features as were used for clustering
print("\nANOVA-testit uusien ominaisuuksien klusteroinnille:")
# Take the labels that actually occur in the data instead of a hard-coded
# range(4): the test then follows the real number of clusters and f_oneway
# never receives an empty group.
cluster_labels = sorted(data['Cluster'].unique())
for feature in anova_features:
    # One sample of this feature's values per cluster, in ascending label order.
    groups = [
        data.loc[data['Cluster'] == label, feature]
        for label in cluster_labels
    ]
    f_stat, p_value = f_oneway(*groups)
    print(f"{feature}: p-arvo = {p_value:.2e}")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment