kozo2 commited on
Commit
d8b0e7f
·
verified ·
1 Parent(s): 7aa8120

Upload src/streamlit_app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +200 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,202 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ from sklearn.decomposition import PCA
6
+ from sklearn.datasets import load_iris, load_wine, load_breast_cancer
7
+ from sklearn.preprocessing import StandardScaler
8
+ import plotly.graph_objects as go
9
+
10
# Set page config — must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="PCA Visualization App",
    page_icon="📊",  # FIX: restored 📊 emoji from mojibake ("๐Ÿ“Š" = UTF-8 bytes mis-decoded)
    layout="wide"
)

# Title and description shown at the top of the main page.
st.title("📊 PCA Visualization Dashboard")  # FIX: restored 📊 emoji from mojibake
st.markdown("""
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
""")
23
+
24
# Sidebar controls: dataset picker and 2D/3D component count.
st.sidebar.header("🎛️ Controls")  # FIX: restored 🎛️ emoji from mojibake ("๐ŸŽ›๏ธ")

# Dataset selection — names must match the keys handled by load_data below.
dataset_name = st.sidebar.selectbox(
    "Select Dataset",
    ("Iris", "Wine", "Breast Cancer")
)

# Number of components: capped at 3 because only 2D/3D scatter plots are rendered.
n_components = st.sidebar.slider(
    "Number of Components",
    min_value=2,
    max_value=3,
    value=2,
    help="Select 2D or 3D visualization"
)
41
+
42
# Load selected dataset (cached so reruns don't refetch/rebuild the frame).
@st.cache_data
def load_data(dataset_name):
    """Return (df, target_names) for one of the bundled sklearn datasets.

    df holds the feature columns plus two extras: 'target' (integer class
    labels) and 'target_names' (the readable class label per row).
    target_names is the dataset's canonical class-name array.
    """
    loaders = {"Iris": load_iris, "Wine": load_wine}
    # Any name other than Iris/Wine falls through to breast cancer,
    # matching the selectbox's third option.
    bunch = loaders.get(dataset_name, load_breast_cancer)()

    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    frame['target_names'] = [bunch.target_names[i] for i in bunch.target]
    return frame, bunch.target_names
56
+
57
# Load data for the dataset chosen in the sidebar.
df, target_names = load_data(dataset_name)

# Display dataset info.
st.subheader(f"Dataset: {dataset_name}")
# shape[1]-2 excludes the two label columns ('target', 'target_names') added by load_data.
st.write(f"Shape: {df.shape[0]} rows, {df.shape[1]-2} features")
st.write(f"Target classes: {', '.join(target_names)}")

# Show raw data toggle — preview of the first rows only.
if st.checkbox("Show raw data"):
    st.write(df.head())
68
+
69
# Prepare data for PCA: feature matrix X and integer labels y.
X = df.drop(['target', 'target_names'], axis=1)
y = df['target']
# NOTE(fix): removed the dead reassignment
#   target_names = df['target_names'].unique()
# The variable is never read after this point, and unique()'s
# order-of-appearance could silently disagree with the canonical
# data.target_names ordering already returned by load_data.

# Standardize the data — PCA is scale-sensitive, so each feature is
# centered to zero mean and scaled to unit variance first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA with the component count chosen in the sidebar (2 or 3).
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA results; columns are PC1..PCn.
pca_columns = [f"PC{i+1}" for i in range(n_components)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca['target'] = y
df_pca['target_names'] = df['target_names']
87
+
88
# Display PCA info: explained-variance diagnostics in a two-column layout
# (per-component + total on the left, cumulative on the right).
st.subheader("PCA Analysis")
col1, col2 = st.columns(2)
with col1:
    st.write(f"Explained Variance Ratio: {[f'{val:.2%}' for val in pca.explained_variance_ratio_]}")
    st.write(f"Total Variance Explained: {pca.explained_variance_ratio_.sum():.2%}")

with col2:
    st.write(f"Cumulative Variance Explained: {[f'{val:.2%}' for val in pca.explained_variance_ratio_.cumsum()]}")
97
+
98
# Create visualization: 2D or 3D scatter of the PCA projection,
# colored by class. Axis labels annotate each PC with its share of
# explained variance.
axis_labels = {
    col: f'{col} ({pca.explained_variance_ratio_[i]:.1%} variance)'
    for i, col in enumerate(pca_columns)
}

if n_components == 2:
    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (2D)",
        labels=axis_labels,
        hover_data=['target_names'],
    )
    fig.update_traces(marker=dict(size=8, opacity=0.8))
else:  # 3D visualization
    fig = px.scatter_3d(
        df_pca,
        x='PC1',
        y='PC2',
        z='PC3',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (3D)",
        labels=axis_labels,
        hover_data=['target_names'],
    )
    # Slightly smaller markers in 3D to reduce occlusion.
    fig.update_traces(marker=dict(size=5, opacity=0.8))

# Layout settings shared by both branches.
fig.update_layout(
    width=800,
    height=600,
    legend_title_text='Classes',
)
142
+
143
# Display the scatter plot built above.
st.plotly_chart(fig, use_container_width=True)

# Feature contribution to principal components.
st.subheader("Feature Contributions to Principal Components")
# Rows = original features, columns = PCs; entries are the PCA loadings
# (pca.components_ is (n_components, n_features), so transpose it).
feature_importance = pd.DataFrame(
    pca.components_.T,
    columns=pca_columns,
    index=X.columns,
)

# Render the loadings matrix as a diverging heatmap
# (PCs on the y-axis, features on the x-axis).
fig_importance = px.imshow(
    feature_importance.T,
    labels=dict(x="Features", y="Principal Components", color="Contribution"),
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title="Feature Contributions to Principal Components",
)
fig_importance.update_layout(width=800, height=400)

st.plotly_chart(fig_importance, use_container_width=True)
169
+
170
# Show top contributing features: for each PC, list the five features
# with the largest absolute loadings.
st.subheader("Top Contributing Features")
for i in range(n_components):
    st.write(f"**PC{i+1}**:")
    # Rank by magnitude — sign only encodes direction, not importance.
    ranked = feature_importance[f'PC{i+1}'].abs().sort_values(ascending=False)
    for feature, value in ranked.head(5).items():
        st.write(f"- {feature}: {value:.3f}")
    st.write("")
179
 
180
# Information about PCA — collapsible background notes for users.
with st.expander("ℹ️ About PCA"):  # FIX: restored ℹ️ emoji from mojibake ("โ„น๏ธ")
    st.markdown("""
    **Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms
    high-dimensional data into a lower-dimensional space while preserving as much variance as possible.

    **Key Concepts:**
    - **Principal Components**: New axes that capture maximum variance in the data
    - **Explained Variance Ratio**: Proportion of total variance explained by each component
    - **Standardization**: Important preprocessing step to ensure all features contribute equally

    **Benefits:**
    - Reduces computational complexity
    - Removes multicollinearity
    - Helps with data visualization
    - Can improve model performance by reducing noise

    **Applications:**
    - Data visualization
    - Feature extraction
    - Noise reduction
    - Data compression
    """)