diff --git a/Raakadatan klusterointi.py b/Raakadatan klusterointi.py index 48810e2221d34a20dc189ba651b9b04f28301fd4..39185c21a813d81a7e15097cb6778a38654fd5ff 100644 --- a/Raakadatan klusterointi.py +++ b/Raakadatan klusterointi.py @@ -3,6 +3,8 @@ import pandas as pd from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler #StandardScaler datan normalisointi, eli skaalaus import matplotlib.pyplot as plt #matplotlib visualisointiin +from sklearn.metrics import silhouette_score +from scipy.stats import f_oneway # Ladataan raakadata data = pd.read_csv("raakadata.csv") @@ -43,4 +45,20 @@ print(indicators) #perustunnusluvut summary = data[["Height", "Circumference", "BarkThickness", "PineNo", "NeedleNo"]].describe() -print(summary) \ No newline at end of file +print(summary) + +# Lasketaan Silhouette Score raakadataklusteroinnille +silhouette_raw = silhouette_score(X_scaled, data["Cluster_Raw"]) +print(f"Raakadatan klusteroinnin Silhouette Score: {silhouette_raw:.3f}") + +# ANOVA-testit raakadatan klusteroinnille +print("\nANOVA-testit raakadatan klusteroinnille:") + +for feature in features: + groups = [] + for cluster_label in sorted(data["Cluster_Raw"].unique()): + group = data[data["Cluster_Raw"] == cluster_label][feature] + groups.append(group) + + stat, p_value = f_oneway(*groups) + print(f"{feature}: p-arvo = {p_value:.2e}") \ No newline at end of file diff --git a/Uusien ominaisuuksien klusterointi.py b/Uusien ominaisuuksien klusterointi.py index fe5d30a93a0a67a030d79ba7218bcf37e806cce9..7fa6ec606e91ebaacc731b5bf8a2a7e8326ed0f7 100644 --- a/Uusien ominaisuuksien klusterointi.py +++ b/Uusien ominaisuuksien klusterointi.py @@ -4,6 +4,8 @@ from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler # StandardScaler datan normalisointi, eli skaalaus import matplotlib.pyplot as plt # matplotlib visualisointiin, lataa gitillä "pip install matplotlib" import numpy as np # Liian kaukaisten pisteiden poistamiseksi kuvaajasta +from sklearn.metrics import silhouette_score +from scipy.stats import f_oneway # Ladataan raakadata data = pd.read_csv("raakadata.csv") @@ -99,6 +101,10 @@ clusters = kmeans.fit_predict(X_scaled) # Lisätään klusterit dataan data['Cluster'] = clusters +# Lasketaan Silhouette Score uusien ominaisuuksien klusteroinnille +silhouette_new_features = silhouette_score(X_scaled, clusters) +print(f"Uusilla ominaisuuksilla klusteroidun datan Silhouette Score: {silhouette_new_features:.3f}") + # PCA pca = PCA(n_components=2) X_pca = pca.fit_transform(X_scaled) @@ -120,4 +126,12 @@ plt.ylabel(yakseli) legend1 = plt.legend(*scatter.legend_elements(), title="Klusterit") plt.gca().add_artist(legend1) -plt.show() \ No newline at end of file +plt.show() + +anova_features = features # käytetään samoja kuin klusteroinnissa + +print("\nANOVA-testit uusien ominaisuuksien klusteroinnille:") +for feature in anova_features: + groups = [data[data['Cluster'] == cluster][feature] for cluster in range(4)] # Klusterit 0–3 + f_stat, p_value = f_oneway(*groups) + print(f"{feature}: p-arvo = {p_value:.2e}") \ No newline at end of file