""" SAP Finance Dashboard with RPT-1-OSS Model - Gradio Version Main Gradio application with tabs: - Dashboard: Overview with metrics and charts - Data Explorer: Browse datasets - Upload: Upload custom datasets - Predictions: AI-powered predictions using SAP-RPT-1-OSS - OData: Connect to SAP OData services """ import importlib import os def _ensure_hf_folder_compat(): """Reintroduce gradio's expected huggingface_hub.HfFolder symbol.""" try: from huggingface_hub import HfFolder # noqa: F401 return except ImportError: pass try: hub_module = importlib.import_module("huggingface_hub") except ModuleNotFoundError: return if hasattr(hub_module, "HfFolder"): return class _CompatHfFolder: @staticmethod def get_token(token_path=None): return os.getenv("HUGGINGFACE_TOKEN") @staticmethod def save_token(token, token_path=None): if token: os.environ["HUGGINGFACE_TOKEN"] = token @staticmethod def delete_token(token_path=None): os.environ.pop("HUGGINGFACE_TOKEN", None) hub_module.HfFolder = _CompatHfFolder def _patch_gradio_client_schema_bug(): """Patch gradio_client's JSON schema parser to handle boolean schemas.""" try: from gradio_client import utils as client_utils except (ImportError, AttributeError): return # Patch json_schema_to_python_type to catch and handle the error original_json_to_type = getattr( client_utils, 'json_schema_to_python_type', None ) if not original_json_to_type: return def patched_json_to_type(schema, defs=None): """Safely handle JSON schema parsing for boolean schemas.""" try: return original_json_to_type(schema, defs) except Exception: # If schema parsing fails (e.g., boolean schema), return str return str client_utils.json_schema_to_python_type = patched_json_to_type _ensure_hf_folder_compat() _patch_gradio_client_schema_bug() # Setup HuggingFace authentication for gated model access def _setup_hf_auth(): """Authenticate with HuggingFace Hub using token from environment.""" try: from huggingface_hub import login hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN") if hf_token: login(token=hf_token, add_to_git_credential=False) print("โœ“ HuggingFace authentication configured") else: print("โš  HF_TOKEN not found. Gated model access will fail if not already cached.") except Exception as e: print(f"โš  HuggingFace auth setup failed: {e}") _setup_hf_auth() import gradio as gr print(f"Gradio version: {gr.__version__}") import pandas as pd import numpy as np from pathlib import Path import plotly.graph_objects as go from dotenv import load_dotenv # Import utilities from utils.data_generator import generate_all_datasets from utils.visualizations import ( create_revenue_expense_chart, create_balance_sheet_chart, create_gl_summary_chart, create_sales_analytics_chart, create_sales_trend_chart, get_summary_metrics, create_prediction_distribution_chart, create_prediction_bar_chart, create_confidence_gauge ) from utils.odata_connector import SAPFinanceConnector from models.rpt_model import create_model from utils.playground import ( load_dataset, detect_task_type, detect_task_type_from_column, get_dataset_info, auto_select_target_column, prepare_train_test_split, preprocess_data, export_results, check_embedding_server, start_embedding_server, ensure_embedding_server_running, is_sap_rpt_oss_installed ) # Load environment variables load_dotenv() # Global variables gl_data = pd.DataFrame() financial_data = pd.DataFrame() sales_data = pd.DataFrame() uploaded_data = pd.DataFrame() odata_data = pd.DataFrame() odata_connector = None model_wrapper = None # Playground variables playground_data = pd.DataFrame() playground_model = None playground_results = None def load_datasets(): """Load synthetic datasets if they exist.""" global gl_data, financial_data, sales_data data_dir = Path("data") if not data_dir.exists(): generate_all_datasets() if (data_dir / "synthetic_gl_accounts.csv").exists(): gl_data = pd.read_csv(data_dir / "synthetic_gl_accounts.csv") if (data_dir / "synthetic_financial_statements.csv").exists(): financial_data = pd.read_csv(data_dir / "synthetic_financial_statements.csv") if (data_dir / "synthetic_sales_orders.csv").exists(): sales_data = pd.read_csv(data_dir / "synthetic_sales_orders.csv") def create_dashboard(): """Create dashboard with metrics and charts.""" if gl_data.empty and financial_data.empty and sales_data.empty: load_datasets() # Calculate metrics with vibrant styling metrics_html = "
" if not gl_data.empty: gl_metrics = get_summary_metrics(gl_data, "gl") metrics_html += f"""

๐Ÿ’ฐ GL Transactions

{gl_metrics.get('Total Transactions', 0):,}

""" if not financial_data.empty: fin_metrics = get_summary_metrics(financial_data, "financial") metrics_html += f"""

๐Ÿ“ˆ Latest Revenue

${fin_metrics.get('Latest Revenue', 0):,.0f}

""" if not sales_data.empty: sales_metrics = get_summary_metrics(sales_data, "sales") metrics_html += f"""

๐Ÿ›’ Total Sales

${sales_metrics.get('Total Sales', 0):,.0f}

""" datasets_count = sum([not df.empty for df in [gl_data, financial_data, sales_data, uploaded_data]]) metrics_html += f"""

๐Ÿ“Š Datasets

{datasets_count} loaded

""" # Create charts charts = [] if not financial_data.empty: fig_dict = create_revenue_expense_chart(financial_data) if fig_dict: charts.append(go.Figure(fig_dict)) fig_dict = create_balance_sheet_chart(financial_data) if fig_dict: charts.append(go.Figure(fig_dict)) if not sales_data.empty: fig_dict = create_sales_analytics_chart(sales_data) if fig_dict: charts.append(go.Figure(fig_dict)) return metrics_html, charts[0] if len(charts) > 0 else None, charts[1] if len(charts) > 1 else None, charts[2] if len(charts) > 2 else None def explore_dataset(dataset_type): """Explore selected dataset.""" global gl_data, financial_data, sales_data, uploaded_data if dataset_type == "GL Accounts": if gl_data.empty: return "No GL data available", None, None fig_dict = create_gl_summary_chart(gl_data) fig = go.Figure(fig_dict) if fig_dict else None return f"GL Accounts ({len(gl_data)} records)", fig, gl_data.head(100) elif dataset_type == "Financial Statements": if financial_data.empty: return "No financial data available", None, None fig_dict = create_revenue_expense_chart(financial_data) fig = go.Figure(fig_dict) if fig_dict else None return f"Financial Statements ({len(financial_data)} records)", fig, financial_data elif dataset_type == "Sales Orders": if sales_data.empty: return "No sales data available", None, None fig_dict = create_sales_trend_chart(sales_data) fig = go.Figure(fig_dict) if fig_dict else None return f"Sales Orders ({len(sales_data)} records)", fig, sales_data.head(100) elif dataset_type == "Uploaded Data": if uploaded_data.empty: return "No uploaded data available", None, None return f"Uploaded Data ({len(uploaded_data)} records)", None, uploaded_data.head(100) return "Select a dataset", None, None def upload_file(file): """Handle file upload.""" global uploaded_data if file is not None: try: uploaded_data = pd.read_csv(file.name) return f"Successfully uploaded {len(uploaded_data)} records!", uploaded_data.head(50) except Exception as e: return f"Error uploading file: {str(e)}", None return "No file uploaded", None def init_model(model_type, use_gpu): """Initialize the SAP-RPT-1-OSS model.""" global model_wrapper try: model_wrapper = create_model(model_type=model_type.lower(), use_gpu=use_gpu) context_size = 8192 if use_gpu else 2048 bagging = 8 if use_gpu else 1 return f"""โœ… SAP-RPT-1-OSS Model Initialized Successfully! ๐ŸŽฏ Model Type: {model_type} ๐Ÿ”ง Context Size: {context_size} ๐Ÿ“ฆ Bagging Factor: {bagging} ๐Ÿ’ป Mode: {'GPU (80GB)' if use_gpu else 'CPU (Lightweight)'} ๐Ÿ“ Status: Ready for training โš ๏ธ Requirements: โ€ข Hugging Face authentication โ€ข Embedding service (may be required for predictions) โ€ข Sufficient memory""" except ImportError as e: return f"""โŒ SAP-RPT-1-OSS Model Not Available Error: {str(e)} ๐Ÿ“‹ Installation Required: pip install git+https://github.com/SAP-samples/sap-rpt-1-oss ๐Ÿ”‘ Authentication Required: 1. Create Hugging Face account 2. Accept model license at: https://huggingface.co/SAP/sap-rpt-1-oss 3. Run: huggingface-cli login 4. Set HUGGINGFACE_TOKEN in .env file""" except Exception as e: import traceback error_detail = traceback.format_exc() # Check for common errors if "HUGGINGFACE_TOKEN" in str(e) or "login" in str(e).lower(): return f"""โŒ Hugging Face Authentication Failed Error: {str(e)} ๐Ÿ”‘ Required Steps: 1. Login to Hugging Face: huggingface-cli login 2. OR set HUGGINGFACE_TOKEN in .env file 3. Accept model terms: https://huggingface.co/SAP/sap-rpt-1-oss""" elif "memory" in str(e).lower() or "cuda" in str(e).lower(): return f"""โŒ Insufficient Resources Error: {str(e)} ๐Ÿ’ป Requirements: โ€ข GPU with 80GB memory (recommended) โ€ข OR use CPU mode (uncheck GPU option) โ€ข Context size will be reduced for CPU mode""" else: return f"""โŒ SAP-RPT-1-OSS Initialization Failed Error: {str(e)} ๐Ÿ“‹ Details: {error_detail[:500]} ๐Ÿ”ง Common Solutions: 1. Ensure model is installed 2. Check Hugging Face authentication 3. Verify system resources 4. Try CPU mode if GPU unavailable""" def train_model(dataset_type): """Train the model on selected dataset.""" global model_wrapper, gl_data, financial_data, sales_data, uploaded_data if model_wrapper is None: return "Please initialize the model first" # Select dataset if dataset_type == "GL Accounts": df = gl_data elif dataset_type == "Financial Statements": df = financial_data elif dataset_type == "Sales Orders": df = sales_data elif dataset_type == "Uploaded Data": df = uploaded_data else: return "Please select a dataset" if df.empty: return "Selected dataset is empty" try: # Get numeric columns and clean data X = df.select_dtypes(include=[np.number]) # Remove columns with all NaN values X = X.dropna(axis=1, how='all') # Fill remaining NaN values with 0 X = X.fillna(0) if len(X) > 0 and len(X.columns) > 0: # Create a simple target for classification based on first column y = (X.iloc[:, 0] > X.iloc[:, 0].median()).astype(int) # Keep as DataFrame - SAP-RPT-OSS expects DataFrame or compatible format X_train = pd.DataFrame(X, columns=X.columns) X_train = X_train.astype(float) # Fit the model with DataFrame model_wrapper.fit(X_train, y) return f"โœ… Model trained successfully on {len(X)} samples with {len(X.columns)} features!" else: return "No numeric data available for training" except Exception as e: return f"Error training model: {str(e)}" def get_scenario_labels(dataset_type, scenario): """Get contextual labels for predictions based on dataset and scenario.""" labels_map = { "Sales Orders": { "High Value Order Classification": { 0: "Standard Order (Low Value)", 1: "High Value Order (Premium)", "description": "Identifies orders with high revenue potential" }, "Order Priority Classification": { 0: "Normal Priority", 1: "High Priority / Urgent", "description": "Flags orders requiring immediate attention" }, "Customer Segment Classification": { 0: "Regular Customer", 1: "VIP / Enterprise Customer", "description": "Identifies high-value customer segments" } }, "Products": { "Product Performance Classification": { 0: "Low Performer", 1: "Top Performer / Best Seller", "description": "Identifies products with high sales performance" }, "Stock Risk Classification": { 0: "Normal Stock Level", 1: "Low Stock / Reorder Needed", "description": "Flags products at risk of stockout" } }, "GL Accounts": { "Transaction Risk Classification": { 0: "Normal Transaction", 1: "Flagged / Review Needed", "description": "Identifies potentially risky or unusual transactions" }, "Account Balance Classification": { 0: "Below Average Balance", 1: "Above Average Balance", "description": "Classifies accounts by balance magnitude" }, "Expense Category Classification": { 0: "Operating Expense", 1: "Capital Expenditure", "description": "Categorizes transactions by type" } }, "Financial Statements": { "Financial Health Classification": { 0: "Below Average Performance", 1: "Strong Performance", "description": "Assesses overall financial health" }, "Profitability Classification": { 0: "Low Margin Period", 1: "High Margin Period", "description": "Identifies periods with strong profitability" }, "Growth Trend Classification": { 0: "Declining Revenue", 1: "Revenue Growth", "description": "Classifies periods by revenue trajectory" } } } default_labels = { 0: "Class 0 (Negative/Low)", 1: "Class 1 (Positive/High)", "description": "Binary classification" } return labels_map.get(dataset_type, {}).get(scenario, default_labels) def make_predictions(dataset_type, prediction_scenario): """Make predictions on selected dataset with scenario context.""" global model_wrapper, gl_data, financial_data, sales_data, uploaded_data if model_wrapper is None: return "โŒ Please initialize the model first", None if not hasattr(model_wrapper, 'is_fitted') or not model_wrapper.is_fitted: return "โŒ Please train the model first", None # Select dataset and get original data for context if dataset_type == "Sales Orders": df = sales_data.copy() original_cols = ['Order_Number', 'Customer_Name', 'Total_Amount', 'Status'] elif dataset_type == "Products": df = sales_data.copy() original_cols = ['Product_Name', 'Total_Amount', 'Quantity'] elif dataset_type == "GL Accounts": df = gl_data.copy() original_cols = ['Transaction_ID', 'Account_Description', 'Debit', 'Credit'] elif dataset_type == "Financial Statements": df = financial_data.copy() original_cols = ['Period', 'Revenue', 'Net_Income'] elif dataset_type == "Uploaded Data": df = uploaded_data.copy() original_cols = df.columns[:3].tolist() if len(df.columns) >= 3 else df.columns.tolist() else: return "Please select a dataset", None if df.empty: return f"โŒ Selected dataset ({dataset_type}) is empty", None try: # Get labels for this scenario label_config = get_scenario_labels(dataset_type, prediction_scenario) # Get numeric columns X = df.select_dtypes(include=[np.number]) X = X.dropna(axis=1, how='all') X = X.fillna(X.mean()) if len(X) > 0 and len(X.columns) > 0: # Limit to first 15 rows X_sample = X.head(15) # Keep as DataFrame with proper column names - SAP-RPT-OSS expects DataFrame X_pred = pd.DataFrame(X_sample, columns=X.columns) # Ensure all values are numeric and no NaN X_pred = X_pred.astype(float) X_pred = X_pred.fillna(0) # Make predictions - pass DataFrame directly predictions = model_wrapper.predict(X_pred) # Convert to numpy array and flatten if needed predictions = np.array(predictions) if hasattr(predictions, 'flatten') and len(predictions.shape) > 1: predictions = predictions.flatten() # Get original data columns for context context_df = df.head(15)[original_cols] if all(col in df.columns for col in original_cols) else df.head(15).iloc[:, :3] # Create result with meaningful labels model_type = model_wrapper.model_type.capitalize() if model_type == "Classifier": pred_labels = [label_config.get(int(p), f"Class {int(p)}") for p in predictions] result_df = pd.DataFrame({ 'Row': range(1, len(predictions) + 1), 'Prediction': pred_labels, 'Confidence': predictions }) # Add context columns for col in context_df.columns: result_df[col] = context_df[col].values # Count predictions class_0_count = sum(predictions == 0) class_1_count = sum(predictions == 1) # Create visualizations pie_chart = go.Figure(create_prediction_distribution_chart( predictions, label_config, f"{prediction_scenario} - Distribution" )) bar_chart = go.Figure(create_prediction_bar_chart( predictions, label_config, f"{prediction_scenario} - Summary" )) # Calculate confidence score confidence = max(class_0_count, class_1_count) / len(predictions) * 100 gauge_chart = go.Figure(create_confidence_gauge( confidence, "Prediction Confidence" )) status = f"""โœ… {model_type} Results - {prediction_scenario} ๐Ÿ“Š {label_config.get('description', 'Classification complete')} Analyzed {len(predictions)} records: โ€ข {label_config.get(1, 'Class 1')}: {class_1_count} records ({class_1_count/len(predictions)*100:.1f}%) โ€ข {label_config.get(0, 'Class 0')}: {class_0_count} records ({class_0_count/len(predictions)*100:.1f}%) Dataset: {dataset_type} Model Type: {model_type} Confidence: {confidence:.1f}%""" else: result_df = pd.DataFrame({ 'Row': range(1, len(predictions) + 1), 'Predicted Value': predictions.round(2) }) # Add context columns for col in context_df.columns: result_df[col] = context_df[col].values # Create visualizations for regression fig = go.Figure() fig.add_trace(go.Scatter( x=list(range(1, len(predictions) + 1)), y=predictions, mode='lines+markers', marker=dict(size=10, color='#3498db'), line=dict(width=3, color='#3498db') )) fig.update_layout( title=f"{prediction_scenario} - Predicted Values", xaxis_title="Sample", yaxis_title="Predicted Value", template='plotly_white', height=400 ) pie_chart = fig bar_chart = None gauge_chart = None status = f"""โœ… {model_type} Results - {prediction_scenario} Predicted {len(predictions)} values Mean: {predictions.mean():.2f} Range: {predictions.min():.2f} to {predictions.max():.2f} Std Dev: {predictions.std():.2f} Dataset: {dataset_type}""" return status, result_df, pie_chart, bar_chart, gauge_chart else: return f"โŒ No valid numeric data available in {dataset_type}", None, None, None, None except Exception as e: import traceback error_detail = traceback.format_exc() # Check for specific SAP-RPT-1-OSS errors if "zmq" in str(e).lower() or "socket" in str(e).lower() or "Resource temporarily unavailable" in str(e): return f"""โŒ SAP-RPT-1-OSS Embedding Service Not Available Error: {str(e)} ๐Ÿ”ง SAP-RPT-1-OSS requires an embedding service to be running: **Required Setup:** 1. The model uses a text embedding service via ZMQ socket 2. This service needs to be started separately 3. Service handles semantic understanding of column names and values **To Use SAP-RPT-1-OSS:** โ€ข Start the embedding service (see SAP-RPT-1-OSS documentation) โ€ข Ensure ZMQ socket is accessible โ€ข Verify service is running before making predictions **Current Status:** Model initialized but embedding service unavailable ๐Ÿ“– Documentation: https://github.com/SAP-samples/sap-rpt-1-oss ๐Ÿ”— Model Info: https://huggingface.co/SAP/sap-rpt-1-oss Dataset: {dataset_type} Scenario: {prediction_scenario}""", None, None, None, None else: return f"""โŒ Error making predictions on {dataset_type} Error: {str(e)} ๐Ÿ“‹ Details: {error_detail[:400]} Dataset: {dataset_type} Scenario: {prediction_scenario}""", None, None, None, None def update_scenarios(dataset_type): """Update scenario dropdown based on selected dataset.""" scenarios_map = { "Sales Orders": [ "High Value Order Classification", "Order Priority Classification", "Customer Segment Classification" ], "Products": [ "Product Performance Classification", "Stock Risk Classification" ], "GL Accounts": [ "Transaction Risk Classification", "Account Balance Classification", "Expense Category Classification" ], "Financial Statements": [ "Financial Health Classification", "Profitability Classification", "Growth Trend Classification" ], "Uploaded Data": [ "Custom Classification" ] } scenarios = scenarios_map.get(dataset_type, ["Custom Classification"]) return gr.Dropdown(choices=scenarios, value=scenarios[0]) def test_odata_connection(): """Test OData connection.""" global odata_connector try: odata_connector = SAPFinanceConnector() connected, message = odata_connector.test_connection() if connected: return f"โœ“ {message}" else: return f"โœ— {message}" except Exception as e: return f"Error: {str(e)}" def fetch_odata_data(entity_type, num_records): """Fetch data from OData service.""" global odata_connector, odata_data if odata_connector is None: return "Please test connection first", None try: if entity_type == "Sales Orders": odata_data = odata_connector.fetch_orders_df(num_records) elif entity_type == "Products": odata_data = odata_connector.fetch_products_df(num_records) elif entity_type == "Line Items": odata_data = odata_connector.fetch_line_items_df(num_records) elif entity_type == "Business Partners": odata_data = odata_connector.fetch_partners_df(num_records) else: return "Please select an entity type", None return f"Fetched {len(odata_data)} records", odata_data.head(100) if not odata_data.empty else None except Exception as e: return f"Error fetching data: {str(e)}", None # Playground functions def handle_playground_upload(file): """Handle dataset upload in playground.""" global playground_data if file is None: return "No file uploaded", None, [], None, "classification", [], None try: df, error = load_dataset(file.name) if error: return f"Error: {error}", None, [], None, "classification", [], None playground_data = df # Get dataset info info = get_dataset_info(df) # Auto-select target column (default to last) target_col = auto_select_target_column(df, "classification") # Detect task type from filename first filename_task_type = detect_task_type(Path(file.name).name) # Then detect from target column data type column_task_type = detect_task_type_from_column(df, target_col) # Use column-based detection if filename detection is default if filename_task_type == "classification" and column_task_type == "regression": task_type = column_task_type # Prefer column-based detection else: task_type = filename_task_type # Create info text target_info = "" if target_col: target_series = df[target_col] if pd.api.types.is_numeric_dtype(target_series): unique_count = target_series.dropna().nunique() target_info = f"\nTarget '{target_col}': {unique_count} unique values" if unique_count > 20: target_info += " (suggests regression)" else: target_info += " (suggests classification)" info_text = f"""Dataset loaded successfully! Rows: {info['num_rows']:,} Columns: {info['num_columns']} Numeric columns: {len(info['numeric_columns'])} Categorical columns: {len(info['categorical_columns'])} Detected task type: {task_type} (from filename: {filename_task_type}, from column: {column_task_type}) Suggested target column: {target_col}{target_info}""" # Preview first 10 rows preview = df.head(10) # Column list for dropdown columns = list(df.columns) return ( info_text, preview, columns, # Choices for dropdown target_col, # Value for dropdown task_type, columns, # Choices for second dropdown target_col # Value for second dropdown ) except Exception as e: return f"Error: {str(e)}", None, [], None, "classification", [], None def train_playground_model( task_type, target_column, test_split, max_context_size, bagging, use_gpu, handle_missing, normalize, progress=gr.Progress() ): """Train model in playground with progress tracking.""" global playground_data, playground_model if playground_data.empty: return "Please upload a dataset first", None, None, None try: progress(0.1, desc="Preparing data...") # Preprocess data df_processed = preprocess_data(playground_data, handle_missing, normalize) progress(0.2, desc="Validating target column...") # Validate target column exists if target_column not in df_processed.columns: return f"Error: Target column '{target_column}' not found in dataset", None, None, None # Check target column data type target_series = df_processed[target_column] target_dtype = target_series.dtype # Auto-detect task type if mismatch is_numeric = pd.api.types.is_numeric_dtype(target_series) is_integer_like = False if is_numeric: # Check if it's integer-like (can be converted to int without loss) try: int_values = target_series.dropna().astype(int) float_values = target_series.dropna().astype(float) is_integer_like = (int_values == float_values).all() except: is_integer_like = False # Validate task type matches target column if task_type == "classification": if not is_integer_like: # Check if it's numeric with many unique values if is_numeric: unique_values = target_series.dropna().nunique() if unique_values > 20: # Too many unique values for classification return f"""Error: Target column '{target_column}' contains continuous numeric values ({unique_values} unique values). This looks like a regression problem, not classification. Solution: Change Task Type to 'regression' or convert your target to integer classes.""", None, None, None else: # Convert numeric to integer classes (will be handled later with LabelEncoder) pass else: # String/categorical - will be encoded with LabelEncoder later # No need to convert here, just validate unique_values = target_series.dropna().nunique() if unique_values > 100: return f"""Error: Target column '{target_column}' has too many unique categories ({unique_values}). Classification works best with fewer categories (< 100). Solution: Consider grouping categories or using regression if this is a continuous value.""", None, None, None else: # regression if not is_numeric: return f"""Error: Target column '{target_column}' is not numeric (type: {target_dtype}). Regression requires numeric target values. Solution: Change Task Type to 'classification' or convert your target to numeric.""", None, None, None progress(0.3, desc="Splitting train/test...") # Prepare train/test split X_train, y_train, X_test, y_test = prepare_train_test_split( df_processed, target_column, test_split ) # Ensure classification targets are integers if task_type == "classification": # Handle string/categorical targets by encoding them from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y_train = pd.Series(le.fit_transform(y_train.astype(str)), index=y_train.index) y_test = pd.Series(le.transform(y_test.astype(str)), index=y_test.index) progress(0.4, desc="Preparing model...") # Note: SAP-RPT-OSS typically starts the embedding server automatically when needed # We check status but don't require it to be running beforehand server_running, server_msg = ensure_embedding_server_running() server_warning = "" if not server_running: # This is normal - the model will start the server automatically when making predictions server_warning = f"\n๐Ÿ’ก Note: Embedding server will start automatically when model makes predictions." progress(0.5, desc="Initializing model...") # Initialize model with custom parameters model_type = "classifier" if task_type == "classification" else "regressor" from models.rpt_model import RPTModelWrapper playground_model = RPTModelWrapper( model_type=model_type, max_context_size=max_context_size, bagging=bagging ) progress(0.6, desc="Training model...") # Train model playground_model.fit(X_train, y_train) progress(0.8, desc="Making predictions...") # Make predictions predictions = playground_model.predict(X_test) progress(0.9, desc="Exporting results...") # Export results results_path = export_results( X_test, y_test, predictions, task_type, filename_prefix="playground" ) progress(1.0, desc="Complete!") # Calculate metrics if task_type == "classification": accuracy = (predictions == y_test.values).mean() * 100 metrics = f"Accuracy: {accuracy:.2f}%" else: from sklearn.metrics import mean_squared_error, r2_score mse = mean_squared_error(y_test, predictions) r2 = r2_score(y_test, predictions) metrics = f"MSE: {mse:.4f}, Rยฒ: {r2:.4f}" # Create results DataFrame for display results_df = X_test.copy() results_df['true_value'] = y_test.values if task_type == "classification": results_df['predicted_class'] = predictions else: results_df['predicted_value'] = predictions status = f"""โœ… Training Complete! Training samples: {len(X_train):,} Test samples: {len(X_test):,} {metrics} {server_warning} Results exported to: {results_path}""" return status, results_df.head(100), results_path, gr.File(value=results_path) except Exception as e: import traceback error_detail = traceback.format_exc() return f"Error: {str(e)}\n\nDetails:\n{error_detail[:500]}", None, None, None def check_playground_embedding_server(): """Check embedding server status.""" # First check if package is installed if not is_sap_rpt_oss_installed(): return f"โŒ sap-rpt-oss package not found\n\n๐Ÿ“ฆ Installation Required:\n1. Install sap-rpt-oss: pip install git+https://github.com/SAP-samples/sap-rpt-1-oss\n2. Install pyzmq: pip install pyzmq\n\n๐Ÿ’ก After installation, the server will auto-start when you train a model." # Check if server is running is_running, message = check_embedding_server() if is_running: return f"โœ… {message}\n\nThe embedding server is ready to use." else: return f"โ„น๏ธ {message}\n\nโœ… This is normal! The embedding server will start automatically when you train a model or make predictions. No manual start needed." # Create Gradio interface with vibrant theme with gr.Blocks(title="SAP Finance Dashboard") as app: gr.HTML("""

๐Ÿ“ŠSAP Finance playground for RPT-1-OSS Model

AI-Powered Financial Analysis & Predictions with RPT-1-OSS Model by Amit Lal

""") with gr.Tabs(): # Dashboard Tab with gr.TabItem("๐Ÿ“Š Dashboard"): gr.Markdown("## ๐Ÿ“ˆ Financial Overview") gr.Markdown("*Real-time metrics and key financial indicators*") metrics_display = gr.HTML() with gr.Row(): chart1 = gr.Plot() chart2 = gr.Plot() chart3 = gr.Plot() refresh_btn = gr.Button("Refresh Dashboard") refresh_btn.click( create_dashboard, outputs=[metrics_display, chart1, chart2, chart3] ) # Load dashboard on startup app.load(create_dashboard, outputs=[metrics_display, chart1, chart2, chart3]) # Data Explorer Tab with gr.TabItem("๐Ÿ” Data Explorer"): gr.Markdown("## ๐Ÿ—‚๏ธ Explore Datasets") gr.Markdown("*Browse and analyze your financial data*") dataset_selector = gr.Dropdown( choices=["GL Accounts", "Financial Statements", "Sales Orders", "Uploaded Data"], label="Select Dataset", value="GL Accounts" ) info_text = gr.Textbox(label="Dataset Info", interactive=False) data_chart = gr.Plot() data_table = gr.Dataframe() dataset_selector.change( explore_dataset, inputs=[dataset_selector], outputs=[info_text, data_chart, data_table] ) # Upload Tab with gr.TabItem("๐Ÿ“ค Upload"): gr.Markdown("## ๐Ÿ“ Upload Dataset") gr.Markdown("*Upload your own CSV files for analysis*") file_upload = gr.File(label="Upload CSV File", file_types=[".csv"]) upload_status = gr.Textbox(label="Status", interactive=False) uploaded_preview = gr.Dataframe() file_upload.upload( upload_file, inputs=[file_upload], outputs=[upload_status, uploaded_preview] ) # Predictions Tab with gr.TabItem("๐Ÿค– AI Predictions"): gr.Markdown("## ๐ŸŽฏ AI Predictions with SAP-RPT-1-OSS") gr.Markdown("*Train AI models on financial data and make intelligent predictions powered by deep learning*") with gr.Row(): model_type_select = gr.Dropdown( choices=["Classifier", "Regressor"], label="Model Type", value="Classifier", info="Classifier: Categorize data | Regressor: Predict numeric values" ) use_gpu_check = gr.Checkbox(label="Use GPU (requires 80GB memory)", value=False) init_btn = gr.Button("Initialize Model", variant="primary") init_status = gr.Textbox(label="Initialization Status", interactive=False) gr.Markdown("### Step 1: Train the Model") with gr.Row(): train_dataset_select = gr.Dropdown( choices=["Sales Orders", "GL Accounts", "Financial Statements", "Uploaded Data"], label="Select Training Dataset", value="Sales Orders" ) train_btn = gr.Button("Train Model", variant="primary") train_status = gr.Textbox(label="Training Status", interactive=False, lines=3) gr.Markdown("### Step 2: Make Predictions") with gr.Row(): pred_dataset_select = gr.Dropdown( choices=["Sales Orders", "Products", "GL Accounts", "Financial Statements", "Uploaded Data"], label="Select Prediction Dataset", value="Sales Orders", info="Choose which dataset to analyze" ) prediction_scenario = gr.Dropdown( choices=[ "High Value Order Classification", "Order Priority Classification", "Customer Segment Classification" ], label="Prediction Scenario", value="High Value Order Classification", info="Scenario updates based on selected dataset" ) predict_btn = gr.Button("๐ŸŽฏ Make Predictions", variant="primary", size="lg") pred_status = gr.Textbox(label="Prediction Results", interactive=False, lines=6) gr.Markdown("### Prediction Visualizations") with gr.Row(): pred_pie_chart = gr.Plot(label="Distribution") pred_bar_chart = gr.Plot(label="Summary") with gr.Row(): pred_gauge_chart = gr.Plot(label="Confidence Score") gr.Markdown("### Detailed Predictions") predictions_table = gr.Dataframe(label="Data with Predictions") gr.Markdown(""" **Dataset-Specific Scenarios:** ๐Ÿ“ฆ **Sales Orders:** - High Value Order: Premium vs standard orders - Order Priority: Urgent vs normal handling - Customer Segment: VIP vs regular customers ๐Ÿ›๏ธ **Products:** - Product Performance: Best sellers vs low performers - Stock Risk: Items needing reorder ๐Ÿ’ฐ **GL Accounts:** - Transaction Risk: Flagged vs normal transactions - Account Balance: Above vs below average - Expense Category: OpEx vs CapEx ๐Ÿ“Š **Financial Statements:** - Financial Health: Strong vs weak performance - Profitability: High vs low margin periods - Growth Trend: Revenue growth vs decline """) init_btn.click( init_model, inputs=[model_type_select, use_gpu_check], outputs=[init_status] ) train_btn.click( train_model, inputs=[train_dataset_select], outputs=[train_status] ) # Update scenarios when dataset changes pred_dataset_select.change( update_scenarios, inputs=[pred_dataset_select], outputs=[prediction_scenario] ) predict_btn.click( make_predictions, inputs=[pred_dataset_select, prediction_scenario], outputs=[pred_status, predictions_table, pred_pie_chart, pred_bar_chart, pred_gauge_chart] ) # OData Tab with gr.TabItem("๐Ÿ”— OData"): gr.Markdown("## ๐ŸŒ SAP OData Connection") gr.Markdown("*Connect to live SAP systems and fetch real-time data*") test_conn_btn = gr.Button("Test Connection") conn_status = gr.Textbox(label="Connection Status", interactive=False) with gr.Row(): entity_select = gr.Dropdown( choices=["Sales Orders", "Products", "Line Items", "Business Partners"], label="Select Entity", value="Sales Orders" ) num_records = gr.Number(label="Number of Records", value=100, minimum=1, maximum=1000) fetch_btn = gr.Button("Fetch Data") fetch_status = gr.Textbox(label="Fetch Status", interactive=False) odata_table = gr.Dataframe() test_conn_btn.click( test_odata_connection, outputs=[conn_status] ) fetch_btn.click( fetch_odata_data, inputs=[entity_select, num_records], outputs=[fetch_status, odata_table] ) # Playground Tab with gr.TabItem("๐ŸŽฎ Playground"): gr.Markdown("## ๐Ÿงช SAP-RPT-1-OSS Playground") gr.Markdown("*Upload datasets, configure models, and train with real-time progress tracking*") # Embedding Server Status gr.Markdown("**๐Ÿ’ก Note:** The SAP-RPT-OSS embedding server starts automatically when the model makes predictions. Manual start is optional and may not be available in all installations.") with gr.Row(): embedding_status_btn = gr.Button("Check Embedding Server", size="sm") embedding_status = gr.Textbox(label="Embedding Server Status", interactive=False, lines=4) start_server_btn = gr.Button("Start Embedding Server (Optional)", size="sm", variant="secondary") embedding_status_btn.click( check_playground_embedding_server, outputs=[embedding_status] ) def start_playground_embedding_server(): """Start embedding server and return formatted message.""" # Check if package is installed first if not is_sap_rpt_oss_installed(): return f"โŒ sap-rpt-oss package not found\n\n๐Ÿ“ฆ Installation Required:\npip install git+https://github.com/SAP-samples/sap-rpt-1-oss" success, message = start_embedding_server(None) if success: return f"โœ… {message}\n\nThe server is now running and will be used automatically during training." else: # This is normal - SAP-RPT-OSS starts the server automatically when needed return f"โ„น๏ธ {message}\n\nโœ… This is expected! The embedding server will start automatically when you train the model or make predictions. No action needed." start_server_btn.click( start_playground_embedding_server, outputs=[embedding_status] ) gr.Markdown("### Step 1: Upload Dataset") playground_upload = gr.File( label="Upload Dataset (CSV, Parquet, or JSON)", file_types=[".csv", ".parquet", ".json", ".jsonl"] ) playground_info = gr.Textbox(label="Dataset Info", interactive=False, lines=8) playground_preview = gr.Dataframe(label="Preview (First 10 Rows)") gr.Markdown("### Step 2: Configure Model") # Documentation section with gr.Accordion("๐Ÿ“š Parameter Guide - Click to expand", open=False): gr.Markdown(""" **Understanding Model Parameters:** **๐ŸŽฏ Task Type:** - **Classification**: Predicts categories/classes (e.g., "High Risk" vs "Low Risk", "Approved" vs "Rejected") - Target column should have discrete values (integers or categories) - Examples: Will invoice be paid late? (Yes/No), Product category (A/B/C) - **Regression**: Predicts continuous numeric values (e.g., price, days, amount) - Target column should have numeric values - Examples: Days until payment, Revenue amount, Risk score (0-100) **๐Ÿ“Š Test Split Ratio:** - Proportion of your dataset reserved for testing model performance - **0.1 (10%)**: Use more data for training, less for validation. Good for small datasets. - **0.2 (20%)**: Balanced approach. Recommended default for most cases. - **0.3-0.5 (30-50%)**: More data for testing. Use when you have large datasets and want thorough validation. - Higher test split = more reliable performance estimate, but less training data **๐Ÿง  Max Context Size:** - Number of examples the model can consider simultaneously when making predictions - **512**: Fast, memory-efficient. Good for quick experiments or CPU-only setups. - **1024**: Balanced performance. Recommended for most use cases. - **2048**: Better accuracy, moderate memory. Good default for production. - **4096**: High accuracy, requires significant memory (16GB+ RAM). - **8192**: Best accuracy, requires 80GB GPU memory. Use only with powerful hardware. - Larger context = better understanding of patterns, but slower and more memory-intensive **๐ŸŽฒ Bagging Factor:** - Number of independent models trained and combined (ensemble learning) - **1**: Single model. Fastest, baseline performance. - **2**: Two models averaged. Good balance of speed and accuracy. Recommended default. - **4**: Four models. Better accuracy, 2x slower than bagging=2. - **8**: Eight models. Best accuracy, 4x slower. Use for final production models. - Higher bagging = more robust predictions (reduces overfitting), but slower training **๐Ÿ’ป Use GPU:** - Enable GPU acceleration (requires NVIDIA GPU with 80GB VRAM) - GPU mode: Context size 8192, Bagging 8 (maximum performance) - CPU mode: Context size 2048, Bagging 1 (lightweight, works on any machine) - Leave unchecked unless you have enterprise-grade GPU hardware **๐Ÿ”ง Handle Missing Values:** - How to treat empty/null values in your data - **mean**: Replace with column average (good for normally distributed data) - **median**: Replace with column median (better for skewed data, robust to outliers) - **zero**: Replace with 0 (simple, but may introduce bias) - **drop**: Remove rows with missing values (loses data, but preserves original distribution) **๐Ÿ“ Normalize Features:** - Scale all numeric features to have mean=0 and std=1 - **Enabled**: Recommended when features have very different scales (e.g., age 0-100 vs income 0-1000000) - **Disabled**: Use original feature scales (faster, works when scales are similar) - Normalization helps models converge faster and perform better with mixed-scale features """) with gr.Row(): playground_task_type = gr.Dropdown( choices=["classification", "regression"], label="Task Type", value="classification", info="Classification: Predict categories (Yes/No, A/B/C). Regression: Predict numbers (price, days, score)" ) playground_target_col = gr.Dropdown( choices=[], label="Target Column", value=None, info="The column you want to predict. Auto-selected: last column in dataset" ) with gr.Row(): playground_test_split = gr.Slider( minimum=0.1, maximum=0.5, value=0.2, step=0.05, label="Test Split Ratio", info="Proportion of data for testing (0.2 = 20% test, 80% train). Higher = more validation data, less training data" ) playground_max_context = gr.Dropdown( choices=[512, 1024, 2048, 4096, 8192], value=2048, label="Max Context Size", info="How many examples model considers (512=fast/light, 2048=balanced, 8192=best/needs GPU). Larger = better accuracy, more memory" ) with gr.Row(): playground_bagging = gr.Dropdown( choices=[1, 2, 4, 8], value=2, label="Bagging Factor", info="Number of models to combine (1=fast, 2=balanced, 8=best). Higher = more accurate but slower. Reduces overfitting" ) playground_use_gpu = gr.Checkbox( label="Use GPU (requires 80GB VRAM)", value=False, info="Enable GPU acceleration. Only check if you have NVIDIA GPU with 80GB memory. Unchecked = CPU mode (works on any machine)" ) with gr.Row(): playground_handle_missing = gr.Dropdown( choices=["mean", "median", "zero", "drop"], value="mean", label="Handle Missing Values", info="How to treat empty cells: mean/median (fill with average), zero (fill with 0), drop (remove rows)" ) playground_normalize = gr.Checkbox( label="Normalize Features", value=False, info="Scale all numeric features to same range (mean=0, std=1). Recommended when features have very different scales" ) gr.Markdown("### Step 3: Train Model") train_playground_btn = gr.Button("๐Ÿš€ Train Model", variant="primary", size="lg") playground_train_status = gr.Textbox(label="Training Status", interactive=False, lines=6) gr.Markdown("### Step 4: Results") playground_results_table = gr.Dataframe(label="Test Predictions (First 100 Rows)") playground_download = gr.File(label="Download Full Results CSV") # Connect upload handler def update_playground_components(file): """Update all playground components after upload.""" result = handle_playground_upload(file) if len(result) == 7: info, preview, choices, value, task_type, choices2, value2 = result return ( info, preview, gr.Dropdown(choices=choices, value=value), task_type, gr.Dropdown(choices=choices2, value=value2) ) elif len(result) == 7 and result[2] == []: # Error case return result[0], result[1], gr.Dropdown(choices=[], value=None), result[4], gr.Dropdown(choices=[], value=None) return result playground_upload.upload( update_playground_components, inputs=[playground_upload], outputs=[ playground_info, playground_preview, playground_target_col, playground_task_type, playground_target_col ] ) # Connect training handler train_playground_btn.click( train_playground_model, inputs=[ playground_task_type, playground_target_col, playground_test_split, playground_max_context, playground_bagging, playground_use_gpu, playground_handle_missing, playground_normalize ], outputs=[ playground_train_status, playground_results_table, playground_download, playground_download ] ) with gr.Accordion("๐Ÿ’ก Quick Start Guide", open=False): gr.Markdown(""" **Recommended Settings by Use Case:** **๐Ÿš€ Quick Experiment (Fast, Low Memory):** - Task Type: Auto-detect - Test Split: 0.2 (20%) - Max Context: 512 - Bagging: 1 - GPU: Unchecked - Missing Values: mean - Normalize: Unchecked - *Best for: Trying out the model, small datasets, CPU-only machines* **โš–๏ธ Balanced (Recommended Default):** - Task Type: Auto-detect - Test Split: 0.2 (20%) - Max Context: 2048 - Bagging: 2 - GPU: Unchecked - Missing Values: mean - Normalize: Check if features have very different scales - *Best for: Most production use cases, good accuracy/speed balance* **๐Ÿ† Maximum Accuracy (Slow, High Memory):** - Task Type: Auto-detect - Test Split: 0.3 (30%) - Max Context: 8192 - Bagging: 8 - GPU: Checked (requires 80GB GPU) - Missing Values: median (more robust) - Normalize: Checked - *Best for: Final production models, large datasets, when accuracy is critical* **๐Ÿ“‹ Step-by-Step Workflow:** 1. **Upload Dataset**: CSV, Parquet, or JSON file 2. **Review Auto-Detection**: Check if task type and target column are correct 3. **Adjust Parameters**: Use recommended settings above or customize 4. **Train Model**: Click "Train Model" and wait for progress 5. **Review Results**: Check accuracy/metrics and download predictions **โš ๏ธ Common Issues:** - **"Unknown label type"**: Target column has wrong data type. Change Task Type or convert target column. - **Out of Memory**: Reduce Max Context Size or Bagging Factor - **Slow Training**: Reduce Bagging Factor or Max Context Size - **Poor Accuracy**: Increase Max Context Size, Bagging Factor, or check data quality """) gr.Markdown(""" **Playground Features:** - Upload CSV, Parquet, or JSON datasets - Auto-detect task type from filename and target column - Auto-select target column (defaults to last column) - Configure model parameters with detailed guidance - Real-time progress tracking during training - Download results as CSV with predictions **Example Use Cases:** - Predictive business outcomes (invoice late payment, days to payment) - Recommendations & auto-defaulting (form of address) - Normalization & coding (country ISO codes) - Data quality & anomaly flags (bank details review) - Derived scores & segments (employee risk of leave) - Matching & linking (material entity matching) - Information extraction (ticket topic classification) """) if __name__ == "__main__": import os # Load datasets on startup load_datasets() # Get server configuration from environment variables (for container deployment) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") server_port = int(os.environ.get("GRADIO_SERVER_PORT", 7862)) # Launch the app app.launch( server_name=server_name, server_port=server_port, share=False, show_error=True, show_api=False )