tostido committed
Commit 77bcbf1 · 0 Parent(s)

Initial commit - cascade-lattice 0.5.4

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/workflows/publish.yml +31 -0
  2. .gitignore +35 -0
  3. LICENSE +21 -0
  4. README.md +70 -0
  5. cascade/__init__.py +290 -0
  6. cascade/analysis/__init__.py +37 -0
  7. cascade/analysis/metrics.py +1168 -0
  8. cascade/analysis/tracer.py +487 -0
  9. cascade/bridge.py +265 -0
  10. cascade/cli_main.py +851 -0
  11. cascade/core/__init__.py +13 -0
  12. cascade/core/adapter.py +470 -0
  13. cascade/core/event.py +177 -0
  14. cascade/core/graph.py +292 -0
  15. cascade/core/provenance.py +601 -0
  16. cascade/core/web3_bridge.py +846 -0
  17. cascade/data/__init__.py +112 -0
  18. cascade/data/croissant.py +289 -0
  19. cascade/data/entities.py +349 -0
  20. cascade/data/hub.py +533 -0
  21. cascade/data/license.py +635 -0
  22. cascade/data/live.py +844 -0
  23. cascade/data/observer.py +666 -0
  24. cascade/data/pii.py +748 -0
  25. cascade/data/provenance.py +503 -0
  26. cascade/data/schema.py +417 -0
  27. cascade/demo.py +174 -0
  28. cascade/demo_sdk.py +114 -0
  29. cascade/export/__init__.py +23 -0
  30. cascade/export/tableau_export.py +598 -0
  31. cascade/forensics/__init__.py +53 -0
  32. cascade/forensics/analyzer.py +464 -0
  33. cascade/forensics/artifacts.py +1063 -0
  34. cascade/forensics/fingerprints.py +328 -0
  35. cascade/genesis.py +200 -0
  36. cascade/hold/__init__.py +82 -0
  37. cascade/hold/primitives.py +673 -0
  38. cascade/hold/session.py +707 -0
  39. cascade/identity.py +715 -0
  40. cascade/ipld.py +379 -0
  41. cascade/listen.py +154 -0
  42. cascade/logging/__init__.py +86 -0
  43. cascade/logging/color_example.py +107 -0
  44. cascade/logging/integrate.py +275 -0
  45. cascade/logging/interpretive_logger.py +276 -0
  46. cascade/logging/kleene_logger.py +219 -0
  47. cascade/logging/log_manager.py +266 -0
  48. cascade/observation.py +397 -0
  49. cascade/observe.py +231 -0
  50. cascade/patches/__init__.py +19 -0
.github/workflows/publish.yml ADDED
@@ -0,0 +1,31 @@
+ name: Publish to PyPI
+
+ on:
+   push:
+     tags:
+       - 'v*'
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+     permissions:
+       id-token: write  # For trusted publishing (optional)
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.10'
+
+       - name: Install build tools
+         run: pip install build
+
+       - name: Build package
+         run: python -m build
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,35 @@
+ # Byte-compiled
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ dist/
+ build/
+ *.egg-info/
+ *.egg
+ *.whl
+
+ # Virtual environments
+ venv/
+ .venv/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Testing
+ .pytest_cache/
+ .coverage
+ htmlcov/
+
+ # Logs
+ *.log
+ logs/
+
+ # OS
+ .DS_Store
+ Thumbs.db
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024-2026 Jeff Towers
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,70 @@
+ # Cascade Lattice
+
+ **Universal AI provenance layer — cryptographic receipts for every call, with HOLD inference halt protocol**
+
+ [![PyPI version](https://badge.fury.io/py/cascade-lattice.svg)](https://pypi.org/project/cascade-lattice/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## Installation
+
+ ```bash
+ pip install cascade-lattice
+ ```
+
+ With optional dependencies:
+ ```bash
+ pip install cascade-lattice[torch]  # PyTorch integration
+ pip install cascade-lattice[all]    # All integrations
+ ```
+
+ ## Quick Start
+
+ ```python
+ from cascade import Monitor
+
+ # Create a monitor for your component
+ monitor = Monitor("training_loop")
+
+ # Observe events (parses logs, extracts metrics)
+ event = monitor.observe("Epoch 5: loss=0.0234, accuracy=0.9812")
+ print(event.data)  # {'loss': 0.0234, 'accuracy': 0.9812, ...}
+
+ # Get metrics summary
+ print(monitor.metrics.summary())
+ ```
+
+ ## Features
+
+ - **Universal Observation** — Monitor training, inference, system logs, API calls
+ - **Cryptographic Receipts** — Every observation gets a verifiable hash chain
+ - **HOLD Protocol** — Inference halt capability for safety-critical applications
+ - **Tape Storage** — JSONL event streams for replay and analysis
+ - **Provider Patches** — Drop-in monitoring for OpenAI, Anthropic, LiteLLM, Ollama
+
+ ## CLI Usage
+
+ ```bash
+ cascade --help              # Show all commands
+ cascade stats               # Lattice statistics
+ cascade list -n 20          # Recent observations
+ cascade watch               # Live observation feed
+ cascade fingerprint model/  # Fingerprint a model
+ cascade pii scan.log        # Scan for PII
+ ```
+
+ ## Tape Utilities
+
+ ```python
+ from cascade.viz import load_tape_file, find_latest_tape, list_tape_files
+
+ # Find and load tape files
+ latest = find_latest_tape("./logs")
+ events = load_tape_file(latest)
+
+ for event in events:
+     print(event['event']['event_type'], event['event']['data'])
+ ```
+
+ ## License
+
+ MIT
cascade/__init__.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
3
+ ║ ║
4
+ ║ ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗ ║
5
+ ║ ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝ ║
6
+ ║ ██║ ███████║███████╗██║ ███████║██║ ██║█████╗ ║
7
+ ║ ██║ ██╔══██║╚════██║██║ ██╔══██║██║ ██║██╔══╝ ║
8
+ ║ ╚██████╗██║ ██║███████║╚██████╗██║ ██║██████╔╝███████╗ ║
9
+ ║ ╚═════╝╚═╝ ╚═╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚══════╝ ║
10
+ ║ ║
11
+ ║ Symbiotic Causation Monitoring for Neural Networks ║
12
+ ║ ║
13
+ ║ "even still, i grow, and yet, I grow still" ║
14
+ ║ ║
15
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
16
+
17
+ Cascade is a self-interpreting causation monitor that symbiotically adapts to
18
+ any system architecture through Kleene fixed-point convergence.
19
+
20
+ Feed it ANY signal format. It learns your system's patterns. It traces cause
21
+ and effect bidirectionally through time. It predicts cascading failures before
22
+ they complete.
23
+
24
+ Quick Start:
25
+ >>> import cascade
26
+ >>> monitor = cascade.Monitor()
27
+ >>> monitor.observe({"loss": 0.5, "epoch": 10})
28
+ >>> monitor.observe("ERROR: gradient exploded at layer 5")
29
+ >>>
30
+ >>> # What caused this?
31
+ >>> monitor.trace_backwards("gradient_explosion")
32
+ >>>
33
+ >>> # What will this cause?
34
+ >>> monitor.trace_forwards("learning_rate_spike")
35
+ """
36
+
37
+ __version__ = "0.5.4"
38
+ __author__ = "Cascade Team"
39
+ __license__ = "MIT"
40
+
41
+ from cascade.core.event import Event, CausationLink
42
+ from cascade.core.graph import CausationGraph
43
+ from cascade.core.adapter import SymbioticAdapter
44
+ from cascade.analysis.tracer import Tracer
45
+ from cascade.analysis.metrics import MetricsEngine
46
+
47
+ # Primary API
48
+ class Monitor:
49
+ """
50
+ The main entry point for Cascade monitoring.
51
+
52
+ A symbiotic observer that acclimates to any system architecture.
53
+ Feed it signals in any format — it adapts and builds a causation graph.
54
+
55
+ Example:
56
+ >>> monitor = cascade.Monitor()
57
+ >>>
58
+ >>> # Feed it anything - dicts, strings, tensors, whatever
59
+ >>> monitor.observe({"loss": 0.5, "epoch": 10})
60
+ >>> monitor.observe("2024-01-01 12:00:00 INFO training started")
61
+ >>> monitor.observe(torch.tensor([0.1, 0.2, 0.3]))
62
+ >>>
63
+ >>> # Trace causation backwards (what caused this?)
64
+ >>> causes = monitor.trace_backwards(event_id)
65
+ >>>
66
+ >>> # Trace causation forwards (what will this cause?)
67
+ >>> effects = monitor.trace_forwards(event_id)
68
+ >>>
69
+ >>> # Get the full causation graph
70
+ >>> graph = monitor.graph
71
+ """
72
+
73
+ def __init__(self, name: str = "default"):
74
+ """
75
+ Initialize a new Cascade monitor.
76
+
77
+ Args:
78
+ name: Optional name for this monitor instance
79
+ """
80
+ self.name = name
81
+ self.adapter = SymbioticAdapter()
82
+ self.graph = CausationGraph()
83
+ self.tracer = Tracer(self.graph)
84
+ self.metrics = MetricsEngine(self.graph)
85
+ self._event_count = 0
86
+
87
+ def observe(self, signal) -> Event:
88
+ """
89
+ Observe a signal from the host system.
90
+
91
+ The signal can be in ANY format:
92
+ - dict: {"loss": 0.5, "epoch": 10}
93
+ - str: "ERROR: gradient exploded"
94
+ - tensor: torch.tensor([...])
95
+ - protobuf, JSON, log line, etc.
96
+
97
+ Cascade will automatically adapt to your signal format.
98
+
99
+ Args:
100
+ signal: Any signal from the host system
101
+
102
+ Returns:
103
+ Event: The interpreted event added to the causation graph
104
+ """
105
+ event = self.adapter.interpret(signal)
106
+ self.graph.add_event(event)
107
+ self.metrics.ingest(event)
108
+ self._event_count += 1
109
+ return event
110
+
111
+ def trace_backwards(self, event_id: str, max_depth: int = 10):
112
+ """
113
+ Trace causation backwards: what caused this event?
114
+
115
+ Args:
116
+ event_id: ID of the event to trace from
117
+ max_depth: Maximum depth to trace (default: 10)
118
+
119
+ Returns:
120
+ List of CausationChain objects showing the causal history
121
+ """
122
+ return self.tracer.trace_backwards(event_id, max_depth)
123
+
124
+ def trace_forwards(self, event_id: str, max_depth: int = 10):
125
+ """
126
+ Trace causation forwards: what did this event cause?
127
+
128
+ Args:
129
+ event_id: ID of the event to trace from
130
+ max_depth: Maximum depth to trace (default: 10)
131
+
132
+ Returns:
133
+ List of CausationChain objects showing the effects
134
+ """
135
+ return self.tracer.trace_forwards(event_id, max_depth)
136
+
137
+ def find_root_causes(self, event_id: str):
138
+ """
139
+ Find the ultimate root causes of an event.
140
+
141
+ Goes all the way back to find the origin points.
142
+
143
+ Args:
144
+ event_id: ID of the event to analyze
145
+
146
+ Returns:
147
+ List of root cause events with their causal chains
148
+ """
149
+ return self.tracer.find_root_causes(event_id)
150
+
151
+ def analyze_impact(self, event_id: str, max_depth: int = 20):
152
+ """
153
+ Analyze the downstream impact of an event.
154
+
155
+ Traces forward to find everything this event set in motion.
156
+
157
+ Args:
158
+ event_id: ID of the event to analyze
159
+ max_depth: Maximum depth to search
160
+
161
+ Returns:
162
+ ImpactAnalysis with effects and severity score
163
+ """
164
+ return self.tracer.analyze_impact(event_id, max_depth)
165
+
166
+ def predict_cascade(self, event_id: str):
167
+ """
168
+ Predict the likely future cascade from this event.
169
+
170
+ Uses learned patterns to forecast effects before they happen.
171
+
172
+ Args:
173
+ event_id: ID of the event to predict from
174
+
175
+ Returns:
176
+ CascadePrediction with risk scores and intervention points
177
+ """
178
+ return self.tracer.predict_cascade(event_id)
179
+
180
+ def __repr__(self):
181
+ return f"<Cascade Monitor '{self.name}' | {self._event_count} events>"
182
+
183
+
184
+ # Convenience function for quick setup
185
+ def observe() -> Monitor:
186
+ """
187
+ Create a new Cascade monitor ready for observation.
188
+
189
+ This is the simplest way to get started:
190
+
191
+ >>> import cascade
192
+ >>> monitor = cascade.observe()
193
+ >>> monitor.observe({"loss": 0.5})
194
+
195
+ Returns:
196
+ Monitor: A new monitor instance
197
+ """
198
+ return Monitor()
199
+
200
+
201
+ # Tape utilities for event storage
202
+ from cascade.viz.tape import (
203
+ load_tape_file,
204
+ find_latest_tape,
205
+ list_tape_files,
206
+ PlaybackBuffer,
207
+ )
208
+
209
+ # SDK - Universal AI Observation Layer
210
+ from cascade.sdk import init, observe as sdk_observe, shutdown
211
+
212
+ # Store - Simple observe/query with HuggingFace sync
213
+ from cascade.store import (
214
+ observe as store_observe,
215
+ query as store_query,
216
+ get as store_get,
217
+ stats as store_stats,
218
+ sync_all,
219
+ pull_from_hf,
220
+ Receipt,
221
+ # Discovery - find other users' lattices
222
+ discover_models,
223
+ discover_datasets,
224
+ discover_live,
225
+ dataset_info,
226
+ )
227
+
228
+ # Convenience aliases
229
+ auto_observe = init # cascade.auto_observe() is clearer for some users
230
+
231
+ # HOLD - Inference-Level Halt Protocol
232
+ from cascade import hold as hold_module
233
+ from cascade.hold import (
234
+ Hold,
235
+ HoldPoint,
236
+ HoldResolution,
237
+ HoldState,
238
+ HoldAwareMixin,
239
+ CausationHold,
240
+ InferenceStep,
241
+ HoldSession,
242
+ ArcadeFeedback,
243
+ )
244
+
245
+
246
+ __all__ = [
247
+ # SDK - Primary Interface
248
+ "init",
249
+ "auto_observe",
250
+ "shutdown",
251
+ # Store - HuggingFace-backed storage
252
+ "store_observe",
253
+ "store_query",
254
+ "store_get",
255
+ "store_stats",
256
+ "sync_all",
257
+ "pull_from_hf",
258
+ "Receipt",
259
+ # Discovery
260
+ "discover_models",
261
+ "discover_datasets",
262
+ "discover_live",
263
+ "dataset_info",
264
+ # Monitor (causation tracking)
265
+ "Monitor",
266
+ "observe",
267
+ "Event",
268
+ "CausationLink",
269
+ "CausationGraph",
270
+ "SymbioticAdapter",
271
+ "Tracer",
272
+ "MetricsEngine",
273
+ # Tape playback
274
+ "load_tape_file",
275
+ "find_latest_tape",
276
+ "list_tape_files",
277
+ "PlaybackBuffer",
278
+ # HOLD - Inference Halt Protocol
279
+ "Hold",
280
+ "HoldPoint",
281
+ "HoldResolution",
282
+ "HoldState",
283
+ "HoldAwareMixin",
284
+ "CausationHold",
285
+ "InferenceStep",
286
+ "HoldSession",
287
+ "ArcadeFeedback",
288
+ "hold_module",
289
+ "__version__",
290
+ ]
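For orientation, here is a minimal usage sketch of the `Monitor` API defined above (an editorial example, not part of the committed file). It assumes `observe()` returns an `Event` exposing an `event_id` attribute, as the docstrings and the `cascade/analysis/metrics.py` listing further down suggest:

```python
import cascade

monitor = cascade.Monitor("training_loop")
first = monitor.observe({"loss": 0.9, "epoch": 1})
last = monitor.observe("ERROR: gradient exploded at layer 5")

causes = monitor.trace_backwards(last.event_id)    # what led to this event?
impact = monitor.analyze_impact(first.event_id)    # what did the first event set in motion?

print(monitor)                    # <Cascade Monitor 'training_loop' | 2 events>
print(monitor.metrics.summary())  # quantified view from the attached MetricsEngine
```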
cascade/analysis/__init__.py ADDED
@@ -0,0 +1,37 @@
+ """Cascade Analysis module - tracing, prediction, and intervention."""
+
+ from cascade.analysis.tracer import (
+     Tracer,
+     RootCauseAnalysis,
+     ImpactAnalysis,
+     CascadePrediction,
+ )
+ from cascade.analysis.metrics import (
+     MetricsEngine,
+     MetricSeries,
+     MetricCategory,
+     MetricHealthSpec,
+     Anomaly,
+     Correlation,
+     ThresholdCrossing,
+     classify_metric,
+     METRIC_TAXONOMY,
+     HEALTH_SPECS,
+ )
+
+ __all__ = [
+     "Tracer",
+     "RootCauseAnalysis",
+     "ImpactAnalysis",
+     "CascadePrediction",
+     "MetricsEngine",
+     "MetricSeries",
+     "MetricCategory",
+     "MetricHealthSpec",
+     "Anomaly",
+     "Correlation",
+     "ThresholdCrossing",
+     "classify_metric",
+     "METRIC_TAXONOMY",
+     "HEALTH_SPECS",
+ ]
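The `MetricsEngine` re-exported here is most easily reached through `Monitor.metrics` (see `cascade/__init__.py` above). A hedged sketch of querying it, using only attributes defined in the `metrics.py` listing below; exact output depends on adapter and tracer code not shown in this view:

```python
from cascade import Monitor

monitor = Monitor("demo")
for step, loss in enumerate([0.90, 0.72, 0.60, 0.55, 0.53]):
    monitor.observe({"step": step, "loss": loss, "grad_norm": 1.2})

engine = monitor.metrics                 # the MetricsEngine fed by observe()
loss_series = engine.get_metric("loss")  # a MetricSeries, or None if never seen
if loss_series is not None:
    print(loss_series.current, loss_series.delta, loss_series.trend())
    print(loss_series.health_status())   # 'healthy' / 'warning' / 'critical' / 'unknown'

for anomaly in engine.anomalies:         # empty until enough history accumulates
    print(anomaly.metric_name, anomaly.value, anomaly.severity)

for corr in engine.get_correlations():   # needs ~10 samples per metric to report anything
    print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")

print(engine.summary()["health_status"]["overall"])
```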
cascade/analysis/metrics.py ADDED
@@ -0,0 +1,1168 @@
1
+ """
2
+ Cascade Analysis - Metrics Engine.
3
+
4
+ The quantification layer. Extracts, tracks, and correlates numeric data
5
+ from the event stream. Provides the WHAT with enough depth that the WHY
6
+ becomes self-evident to the observer.
7
+
8
+ This module does NOT interpret or explain. It quantifies.
9
+
10
+ Industry-Standard Neural Network Observability Taxonomy:
11
+ =========================================================
12
+
13
+ CATEGORY 1: TRAINING_DYNAMICS
14
+ Core training loop metrics - loss, accuracy, learning rate, throughput
15
+
16
+ CATEGORY 2: GRADIENT_HEALTH
17
+ Gradient flow diagnostics - norms, clipping, vanishing/exploding
18
+
19
+ CATEGORY 3: WEIGHT_DYNAMICS
20
+ Parameter evolution - norms, update ratios, dead neurons
21
+
22
+ CATEGORY 4: ACTIVATION_FLOW
23
+ Forward pass health - magnitudes, saturation, dead ReLUs
24
+
25
+ CATEGORY 5: ATTENTION_MECHANICS
26
+ Transformer-specific - entropy, sparsity, head importance
27
+
28
+ CATEGORY 6: MEMORY_COMPUTE
29
+ Resource utilization - GPU/CPU memory, MFU, throughput
30
+
31
+ CATEGORY 7: OPTIMIZATION_STATE
32
+ Optimizer internals - Adam moments, momentum, weight decay
33
+
34
+ CATEGORY 8: CONVERGENCE_SIGNALS
35
+ Training health indicators - plateau, overfitting, noise scale
36
+
37
+ CATEGORY 9: DATA_PIPELINE
38
+ Data loading metrics - batch time, queue depth, prefetch
39
+
40
+ CATEGORY 10: REGULARIZATION
41
+ Regularization effects - dropout, batch norm, layer norm stats
42
+ """
43
+
44
+ from typing import Dict, List, Any, Optional, Tuple, Set
45
+ from dataclasses import dataclass, field
46
+ from collections import defaultdict
47
+ from enum import Enum, auto
48
+ import math
49
+ import re
50
+
51
+ from cascade.core.event import Event
52
+ from cascade.core.graph import CausationGraph
53
+
54
+
55
+ # =============================================================================
56
+ # METRIC CATEGORY TAXONOMY
57
+ # =============================================================================
58
+
59
+ class MetricCategory(Enum):
60
+ """Industry-standard neural network metric categories."""
61
+ TRAINING_DYNAMICS = auto() # Loss, accuracy, LR, throughput
62
+ GRADIENT_HEALTH = auto() # Grad norms, clipping, flow
63
+ WEIGHT_DYNAMICS = auto() # Weight norms, updates, dead neurons
64
+ ACTIVATION_FLOW = auto() # Activation stats, saturation
65
+ ATTENTION_MECHANICS = auto() # Attention entropy, sparsity, heads
66
+ MEMORY_COMPUTE = auto() # GPU/CPU mem, MFU, FLOPS
67
+ OPTIMIZATION_STATE = auto() # Adam moments, momentum, decay
68
+ CONVERGENCE_SIGNALS = auto() # Plateau, overfit, noise scale
69
+ DATA_PIPELINE = auto() # Batch time, queue, prefetch
70
+ REGULARIZATION = auto() # Dropout, norm layer stats
71
+ SYSTEM = auto() # Iteration, epoch, timestamps
72
+ UNKNOWN = auto() # Uncategorized metrics
73
+
74
+
75
+ # Comprehensive metric-to-category mapping
76
+ # This is the "knowledge base" of neural network metric taxonomy
77
+ METRIC_TAXONOMY: Dict[str, MetricCategory] = {
78
+ # TRAINING_DYNAMICS
79
+ "loss": MetricCategory.TRAINING_DYNAMICS,
80
+ "train_loss": MetricCategory.TRAINING_DYNAMICS,
81
+ "val_loss": MetricCategory.TRAINING_DYNAMICS,
82
+ "test_loss": MetricCategory.TRAINING_DYNAMICS,
83
+ "eval_loss": MetricCategory.TRAINING_DYNAMICS,
84
+ "nll_loss": MetricCategory.TRAINING_DYNAMICS,
85
+ "ce_loss": MetricCategory.TRAINING_DYNAMICS,
86
+ "cross_entropy": MetricCategory.TRAINING_DYNAMICS,
87
+ "mse_loss": MetricCategory.TRAINING_DYNAMICS,
88
+ "mae_loss": MetricCategory.TRAINING_DYNAMICS,
89
+ "perplexity": MetricCategory.TRAINING_DYNAMICS,
90
+ "ppl": MetricCategory.TRAINING_DYNAMICS,
91
+ "accuracy": MetricCategory.TRAINING_DYNAMICS,
92
+ "acc": MetricCategory.TRAINING_DYNAMICS,
93
+ "top1_acc": MetricCategory.TRAINING_DYNAMICS,
94
+ "top5_acc": MetricCategory.TRAINING_DYNAMICS,
95
+ "precision": MetricCategory.TRAINING_DYNAMICS,
96
+ "recall": MetricCategory.TRAINING_DYNAMICS,
97
+ "f1": MetricCategory.TRAINING_DYNAMICS,
98
+ "f1_score": MetricCategory.TRAINING_DYNAMICS,
99
+ "auc": MetricCategory.TRAINING_DYNAMICS,
100
+ "auroc": MetricCategory.TRAINING_DYNAMICS,
101
+ "bleu": MetricCategory.TRAINING_DYNAMICS,
102
+ "rouge": MetricCategory.TRAINING_DYNAMICS,
103
+ "lr": MetricCategory.TRAINING_DYNAMICS,
104
+ "learning_rate": MetricCategory.TRAINING_DYNAMICS,
105
+ "samples_per_sec": MetricCategory.TRAINING_DYNAMICS,
106
+ "tokens_per_sec": MetricCategory.TRAINING_DYNAMICS,
107
+ "throughput": MetricCategory.TRAINING_DYNAMICS,
108
+ "steps_per_sec": MetricCategory.TRAINING_DYNAMICS,
109
+
110
+ # GRADIENT_HEALTH
111
+ "grad_norm": MetricCategory.GRADIENT_HEALTH,
112
+ "gradient_norm": MetricCategory.GRADIENT_HEALTH,
113
+ "global_grad_norm": MetricCategory.GRADIENT_HEALTH,
114
+ "grad_norm_clipped": MetricCategory.GRADIENT_HEALTH,
115
+ "grad_clip_rate": MetricCategory.GRADIENT_HEALTH,
116
+ "grad_scale": MetricCategory.GRADIENT_HEALTH,
117
+ "grad_mean": MetricCategory.GRADIENT_HEALTH,
118
+ "grad_std": MetricCategory.GRADIENT_HEALTH,
119
+ "grad_max": MetricCategory.GRADIENT_HEALTH,
120
+ "grad_min": MetricCategory.GRADIENT_HEALTH,
121
+ "grad_sparsity": MetricCategory.GRADIENT_HEALTH,
122
+ "vanishing_grad": MetricCategory.GRADIENT_HEALTH,
123
+ "exploding_grad": MetricCategory.GRADIENT_HEALTH,
124
+
125
+ # WEIGHT_DYNAMICS
126
+ "weight_norm": MetricCategory.WEIGHT_DYNAMICS,
127
+ "param_norm": MetricCategory.WEIGHT_DYNAMICS,
128
+ "weight_mean": MetricCategory.WEIGHT_DYNAMICS,
129
+ "weight_std": MetricCategory.WEIGHT_DYNAMICS,
130
+ "update_ratio": MetricCategory.WEIGHT_DYNAMICS,
131
+ "weight_update": MetricCategory.WEIGHT_DYNAMICS,
132
+ "dead_neurons": MetricCategory.WEIGHT_DYNAMICS,
133
+ "dead_neuron_pct": MetricCategory.WEIGHT_DYNAMICS,
134
+ "param_count": MetricCategory.WEIGHT_DYNAMICS,
135
+ "num_params": MetricCategory.WEIGHT_DYNAMICS,
136
+ "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
137
+
138
+ # ACTIVATION_FLOW
139
+ "activation_mean": MetricCategory.ACTIVATION_FLOW,
140
+ "activation_std": MetricCategory.ACTIVATION_FLOW,
141
+ "activation_norm": MetricCategory.ACTIVATION_FLOW,
142
+ "activation_max": MetricCategory.ACTIVATION_FLOW,
143
+ "saturation": MetricCategory.ACTIVATION_FLOW,
144
+ "saturation_pct": MetricCategory.ACTIVATION_FLOW,
145
+ "dead_relu": MetricCategory.ACTIVATION_FLOW,
146
+ "dead_relu_pct": MetricCategory.ACTIVATION_FLOW,
147
+ "activation_sparsity": MetricCategory.ACTIVATION_FLOW,
148
+ # Generic activation stats from layer hooks
149
+ "mean": MetricCategory.ACTIVATION_FLOW,
150
+ "std": MetricCategory.ACTIVATION_FLOW,
151
+ "min": MetricCategory.ACTIVATION_FLOW,
152
+ "max": MetricCategory.ACTIVATION_FLOW,
153
+ "sparsity": MetricCategory.ACTIVATION_FLOW,
154
+ "layer_idx": MetricCategory.SYSTEM,
155
+
156
+ # ATTENTION_MECHANICS
157
+ "attention_entropy": MetricCategory.ATTENTION_MECHANICS,
158
+ "attn_entropy": MetricCategory.ATTENTION_MECHANICS,
159
+ "attention_sparsity": MetricCategory.ATTENTION_MECHANICS,
160
+ "head_importance": MetricCategory.ATTENTION_MECHANICS,
161
+ "attention_weight_norm": MetricCategory.ATTENTION_MECHANICS,
162
+ "position_bias": MetricCategory.ATTENTION_MECHANICS,
163
+ "attention_score_mean": MetricCategory.ATTENTION_MECHANICS,
164
+ "attention_score_std": MetricCategory.ATTENTION_MECHANICS,
165
+
166
+ # MEMORY_COMPUTE
167
+ "gpu_memory": MetricCategory.MEMORY_COMPUTE,
168
+ "gpu_mem": MetricCategory.MEMORY_COMPUTE,
169
+ "gpu_memory_allocated": MetricCategory.MEMORY_COMPUTE,
170
+ "gpu_memory_cached": MetricCategory.MEMORY_COMPUTE,
171
+ "gpu_memory_peak": MetricCategory.MEMORY_COMPUTE,
172
+ "cpu_memory": MetricCategory.MEMORY_COMPUTE,
173
+ "memory_usage": MetricCategory.MEMORY_COMPUTE,
174
+ "mfu": MetricCategory.MEMORY_COMPUTE,
175
+ "model_flops_utilization": MetricCategory.MEMORY_COMPUTE,
176
+ "flops": MetricCategory.MEMORY_COMPUTE,
177
+ "tflops": MetricCategory.MEMORY_COMPUTE,
178
+ "gpu_utilization": MetricCategory.MEMORY_COMPUTE,
179
+ "gpu_util": MetricCategory.MEMORY_COMPUTE,
180
+
181
+ # OPTIMIZATION_STATE
182
+ "adam_m_norm": MetricCategory.OPTIMIZATION_STATE,
183
+ "adam_v_norm": MetricCategory.OPTIMIZATION_STATE,
184
+ "momentum": MetricCategory.OPTIMIZATION_STATE,
185
+ "beta1": MetricCategory.OPTIMIZATION_STATE,
186
+ "beta2": MetricCategory.OPTIMIZATION_STATE,
187
+ "weight_decay": MetricCategory.OPTIMIZATION_STATE,
188
+ "effective_weight_decay": MetricCategory.OPTIMIZATION_STATE,
189
+ "warmup_progress": MetricCategory.OPTIMIZATION_STATE,
190
+ "lr_schedule_progress": MetricCategory.OPTIMIZATION_STATE,
191
+
192
+ # CONVERGENCE_SIGNALS
193
+ "train_val_gap": MetricCategory.CONVERGENCE_SIGNALS,
194
+ "overfit_ratio": MetricCategory.CONVERGENCE_SIGNALS,
195
+ "loss_plateau": MetricCategory.CONVERGENCE_SIGNALS,
196
+ "gradient_noise_scale": MetricCategory.CONVERGENCE_SIGNALS,
197
+ "critical_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
198
+ "effective_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
199
+ "early_stop_score": MetricCategory.CONVERGENCE_SIGNALS,
200
+ "best_val_loss": MetricCategory.CONVERGENCE_SIGNALS,
201
+ "improvement_rate": MetricCategory.CONVERGENCE_SIGNALS,
202
+
203
+ # DATA_PIPELINE
204
+ "data_time": MetricCategory.DATA_PIPELINE,
205
+ "batch_time": MetricCategory.DATA_PIPELINE,
206
+ "load_time": MetricCategory.DATA_PIPELINE,
207
+ "preprocessing_time": MetricCategory.DATA_PIPELINE,
208
+ "augmentation_time": MetricCategory.DATA_PIPELINE,
209
+ "queue_depth": MetricCategory.DATA_PIPELINE,
210
+ "prefetch_factor": MetricCategory.DATA_PIPELINE,
211
+ "num_workers": MetricCategory.DATA_PIPELINE,
212
+
213
+ # REGULARIZATION
214
+ "dropout_rate": MetricCategory.REGULARIZATION,
215
+ "dropout": MetricCategory.REGULARIZATION,
216
+ "bn_mean": MetricCategory.REGULARIZATION,
217
+ "bn_var": MetricCategory.REGULARIZATION,
218
+ "bn_running_mean": MetricCategory.REGULARIZATION,
219
+ "bn_running_var": MetricCategory.REGULARIZATION,
220
+ "ln_mean": MetricCategory.REGULARIZATION,
221
+ "ln_var": MetricCategory.REGULARIZATION,
222
+ "l1_penalty": MetricCategory.REGULARIZATION,
223
+ "l2_penalty": MetricCategory.REGULARIZATION,
224
+
225
+ # SYSTEM
226
+ "iter": MetricCategory.SYSTEM,
227
+ "iteration": MetricCategory.SYSTEM,
228
+ "step": MetricCategory.SYSTEM,
229
+ "total": MetricCategory.SYSTEM,
230
+ "epoch": MetricCategory.SYSTEM,
231
+ "batch": MetricCategory.SYSTEM,
232
+ "batch_idx": MetricCategory.SYSTEM,
233
+ "global_step": MetricCategory.SYSTEM,
234
+ "time": MetricCategory.SYSTEM,
235
+ "dt": MetricCategory.SYSTEM,
236
+ "elapsed": MetricCategory.SYSTEM,
237
+ "wall_time": MetricCategory.SYSTEM,
238
+ "timestamp": MetricCategory.SYSTEM,
239
+ "hooked_layers": MetricCategory.SYSTEM,
240
+ "input_tokens": MetricCategory.SYSTEM,
241
+ "predicted_class": MetricCategory.TRAINING_DYNAMICS,
242
+
243
+ # MODEL INFO
244
+ "params": MetricCategory.WEIGHT_DYNAMICS,
245
+ "num_params": MetricCategory.WEIGHT_DYNAMICS,
246
+ "total_params": MetricCategory.WEIGHT_DYNAMICS,
247
+ "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
248
+ "parameters": MetricCategory.WEIGHT_DYNAMICS,
249
+ "model_size": MetricCategory.WEIGHT_DYNAMICS,
250
+
251
+ # INFERENCE METRICS
252
+ "confidence": MetricCategory.TRAINING_DYNAMICS,
253
+ "similarity": MetricCategory.TRAINING_DYNAMICS,
254
+ "score": MetricCategory.TRAINING_DYNAMICS,
255
+ "prob": MetricCategory.TRAINING_DYNAMICS,
256
+ "probability": MetricCategory.TRAINING_DYNAMICS,
257
+ "entropy": MetricCategory.ATTENTION_MECHANICS,
258
+ "latency": MetricCategory.MEMORY_COMPUTE,
259
+ "inference_time": MetricCategory.MEMORY_COMPUTE,
260
+ "input_len": MetricCategory.DATA_PIPELINE,
261
+ "output_len": MetricCategory.DATA_PIPELINE,
262
+
263
+ # OBSERVATION SYSTEM METRICS
264
+ "hooked_modules": MetricCategory.SYSTEM,
265
+ "total_layers": MetricCategory.SYSTEM,
266
+ "sample_rate": MetricCategory.SYSTEM,
267
+ "layer_num": MetricCategory.SYSTEM,
268
+ "max_depth": MetricCategory.SYSTEM,
269
+ "return_code": MetricCategory.SYSTEM,
270
+ "pid": MetricCategory.SYSTEM,
271
+ "max_iterations": MetricCategory.SYSTEM,
272
+ "total_iterations": MetricCategory.SYSTEM,
273
+ "iterations": MetricCategory.SYSTEM,
274
+
275
+ # GPU/VRAM
276
+ "vram_gb": MetricCategory.MEMORY_COMPUTE,
277
+ "gpu_count": MetricCategory.MEMORY_COMPUTE,
278
+ "gpu_memory_gb": MetricCategory.MEMORY_COMPUTE,
279
+ }
280
+
281
+ # Patterns for dynamic metric name matching
282
+ METRIC_PATTERNS: List[Tuple[str, MetricCategory]] = [
283
+ (r".*loss.*", MetricCategory.TRAINING_DYNAMICS),
284
+ (r".*acc.*", MetricCategory.TRAINING_DYNAMICS),
285
+ (r".*accuracy.*", MetricCategory.TRAINING_DYNAMICS),
286
+ (r".*perplexity.*", MetricCategory.TRAINING_DYNAMICS),
287
+ (r".*lr.*", MetricCategory.TRAINING_DYNAMICS),
288
+ (r".*learning_rate.*", MetricCategory.TRAINING_DYNAMICS),
289
+ (r".*grad.*norm.*", MetricCategory.GRADIENT_HEALTH),
290
+ (r".*gradient.*", MetricCategory.GRADIENT_HEALTH),
291
+ (r".*weight.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
292
+ (r".*param.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
293
+ (r".*activation.*", MetricCategory.ACTIVATION_FLOW),
294
+ (r".*attention.*", MetricCategory.ATTENTION_MECHANICS),
295
+ (r".*attn.*", MetricCategory.ATTENTION_MECHANICS),
296
+ (r".*memory.*", MetricCategory.MEMORY_COMPUTE),
297
+ (r".*gpu.*", MetricCategory.MEMORY_COMPUTE),
298
+ (r".*mfu.*", MetricCategory.MEMORY_COMPUTE),
299
+ (r".*adam.*", MetricCategory.OPTIMIZATION_STATE),
300
+ (r".*momentum.*", MetricCategory.OPTIMIZATION_STATE),
301
+ (r".*overfit.*", MetricCategory.CONVERGENCE_SIGNALS),
302
+ (r".*plateau.*", MetricCategory.CONVERGENCE_SIGNALS),
303
+ (r".*data.*time.*", MetricCategory.DATA_PIPELINE),
304
+ (r".*batch.*time.*", MetricCategory.DATA_PIPELINE),
305
+ (r".*dropout.*", MetricCategory.REGULARIZATION),
306
+ (r".*bn_.*", MetricCategory.REGULARIZATION),
307
+ (r".*ln_.*", MetricCategory.REGULARIZATION),
308
+ (r".*iter.*", MetricCategory.SYSTEM),
309
+ (r".*epoch.*", MetricCategory.SYSTEM),
310
+ (r".*step.*", MetricCategory.SYSTEM),
311
+ (r".*time.*", MetricCategory.SYSTEM),
312
+ (r".*_ms$", MetricCategory.SYSTEM),
313
+ (r".*duration.*", MetricCategory.SYSTEM),
314
+ ]
315
+
316
+
317
+ def classify_metric(name: str) -> MetricCategory:
318
+ """Classify a metric name into its category."""
319
+ name_lower = name.lower()
320
+
321
+ # Direct lookup
322
+ if name_lower in METRIC_TAXONOMY:
323
+ return METRIC_TAXONOMY[name_lower]
324
+
325
+ # Pattern matching
326
+ for pattern, category in METRIC_PATTERNS:
327
+ if re.match(pattern, name_lower):
328
+ return category
329
+
330
+ return MetricCategory.UNKNOWN
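A brief illustration of the direct-lookup-then-pattern fallback above (editorial sketch, not part of the committed file):

```python
from cascade.analysis.metrics import MetricCategory, classify_metric

# Exact taxonomy keys resolve first; unmatched names fall through to the regex
# patterns, and anything still unmatched is UNKNOWN.
assert classify_metric("loss") is MetricCategory.TRAINING_DYNAMICS               # direct key
assert classify_metric("gpu_memory_allocated") is MetricCategory.MEMORY_COMPUTE  # direct key
assert classify_metric("my_policy_loss") is MetricCategory.TRAINING_DYNAMICS     # pattern ".*loss.*"
assert classify_metric("data_load_time") is MetricCategory.DATA_PIPELINE         # pattern ".*data.*time.*"
assert classify_metric("foo") is MetricCategory.UNKNOWN                          # no match anywhere
```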
331
+
332
+
333
+ # =============================================================================
334
+ # METRIC HEALTH THRESHOLDS (Industry Standards)
335
+ # =============================================================================
336
+
337
+ @dataclass
338
+ class MetricHealthSpec:
339
+ """Specification for healthy metric ranges."""
340
+ name: str
341
+ category: MetricCategory
342
+ healthy_min: Optional[float] = None
343
+ healthy_max: Optional[float] = None
344
+ critical_min: Optional[float] = None
345
+ critical_max: Optional[float] = None
346
+ expected_trend: Optional[str] = None # 'falling', 'rising', 'stable'
347
+
348
+ def is_healthy(self, value: float) -> bool:
349
+ if self.healthy_min is not None and value < self.healthy_min:
350
+ return False
351
+ if self.healthy_max is not None and value > self.healthy_max:
352
+ return False
353
+ return True
354
+
355
+ def is_critical(self, value: float) -> bool:
356
+ if self.critical_min is not None and value < self.critical_min:
357
+ return True
358
+ if self.critical_max is not None and value > self.critical_max:
359
+ return True
360
+ return False
361
+
362
+
363
+ # Industry-standard health thresholds
364
+ HEALTH_SPECS: Dict[str, MetricHealthSpec] = {
365
+ "loss": MetricHealthSpec(
366
+ name="loss",
367
+ category=MetricCategory.TRAINING_DYNAMICS,
368
+ healthy_max=10.0,
369
+ critical_max=100.0,
370
+ expected_trend="falling",
371
+ ),
372
+ "grad_norm": MetricHealthSpec(
373
+ name="grad_norm",
374
+ category=MetricCategory.GRADIENT_HEALTH,
375
+ healthy_min=1e-7,
376
+ healthy_max=10.0,
377
+ critical_min=1e-10, # Vanishing
378
+ critical_max=1000.0, # Exploding
379
+ ),
380
+ "lr": MetricHealthSpec(
381
+ name="lr",
382
+ category=MetricCategory.TRAINING_DYNAMICS,
383
+ healthy_min=1e-8,
384
+ healthy_max=1.0,
385
+ critical_max=10.0,
386
+ ),
387
+ "mfu": MetricHealthSpec(
388
+ name="mfu",
389
+ category=MetricCategory.MEMORY_COMPUTE,
390
+ healthy_min=0.1, # 10% utilization minimum
391
+ healthy_max=1.0,
392
+ ),
393
+ "dead_relu_pct": MetricHealthSpec(
394
+ name="dead_relu_pct",
395
+ category=MetricCategory.ACTIVATION_FLOW,
396
+ healthy_max=0.3, # 30% dead is concerning
397
+ critical_max=0.7, # 70% dead is critical
398
+ ),
399
+ "train_val_gap": MetricHealthSpec(
400
+ name="train_val_gap",
401
+ category=MetricCategory.CONVERGENCE_SIGNALS,
402
+ healthy_max=0.5, # Gap shouldn't exceed 50% of train loss
403
+ critical_max=2.0, # Severe overfitting
404
+ ),
405
+ }
406
+
407
+
408
+ @dataclass
409
+ class MetricSeries:
410
+ """A time series of a single metric with category awareness."""
411
+ name: str
412
+ category: MetricCategory = field(default=MetricCategory.UNKNOWN)
413
+ values: List[float] = field(default_factory=list)
414
+ timestamps: List[float] = field(default_factory=list)
415
+ event_ids: List[str] = field(default_factory=list)
416
+
417
+ def __post_init__(self):
418
+ if self.category == MetricCategory.UNKNOWN:
419
+ self.category = classify_metric(self.name)
420
+
421
+ @property
422
+ def count(self) -> int:
423
+ return len(self.values)
424
+
425
+ @property
426
+ def current(self) -> Optional[float]:
427
+ return self.values[-1] if self.values else None
428
+
429
+ @property
430
+ def previous(self) -> Optional[float]:
431
+ return self.values[-2] if len(self.values) >= 2 else None
432
+
433
+ @property
434
+ def delta(self) -> Optional[float]:
435
+ """Change from previous to current."""
436
+ if len(self.values) >= 2:
437
+ return self.values[-1] - self.values[-2]
438
+ return None
439
+
440
+ @property
441
+ def delta_pct(self) -> Optional[float]:
442
+ """Percentage change from previous to current."""
443
+ if len(self.values) >= 2 and self.values[-2] != 0:
444
+ return (self.values[-1] - self.values[-2]) / abs(self.values[-2])
445
+ return None
446
+
447
+ @property
448
+ def mean(self) -> Optional[float]:
449
+ return sum(self.values) / len(self.values) if self.values else None
450
+
451
+ @property
452
+ def std(self) -> Optional[float]:
453
+ if len(self.values) < 2:
454
+ return None
455
+ mean = self.mean
456
+ variance = sum((x - mean) ** 2 for x in self.values) / len(self.values)
457
+ return math.sqrt(variance)
458
+
459
+ @property
460
+ def min(self) -> Optional[float]:
461
+ return min(self.values) if self.values else None
462
+
463
+ @property
464
+ def max(self) -> Optional[float]:
465
+ return max(self.values) if self.values else None
466
+
467
+ @property
468
+ def range(self) -> Optional[float]:
469
+ if self.values:
470
+ return self.max - self.min
471
+ return None
472
+
473
+ def moving_average(self, window: int = 5) -> Optional[float]:
474
+ """Compute moving average over last N values."""
475
+ if len(self.values) < window:
476
+ return self.mean
477
+ return sum(self.values[-window:]) / window
478
+
479
+ def rate_of_change(self, window: int = 5) -> Optional[float]:
480
+ """Average rate of change over last N values."""
481
+ if len(self.values) < 2:
482
+ return None
483
+ window = min(window, len(self.values))
484
+ recent = self.values[-window:]
485
+ deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
486
+ return sum(deltas) / len(deltas) if deltas else None
487
+
488
+ def is_anomaly(self, threshold_std: float = 2.0) -> bool:
489
+ """Is current value anomalous (outside N standard deviations)?"""
490
+ if len(self.values) < 5 or self.std is None or self.std == 0:
491
+ return False
492
+ return abs(self.values[-1] - self.mean) > threshold_std * self.std
493
+
494
+ def trend(self, window: int = 10) -> str:
495
+ """Determine trend: 'rising', 'falling', 'stable', 'volatile'."""
496
+ if len(self.values) < 3:
497
+ return "unknown"
498
+
499
+ window = min(window, len(self.values))
500
+ recent = self.values[-window:]
501
+ deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
502
+
503
+ positive = sum(1 for d in deltas if d > 0)
504
+ negative = sum(1 for d in deltas if d < 0)
505
+
506
+ if positive > 0.7 * len(deltas):
507
+ return "rising"
508
+ elif negative > 0.7 * len(deltas):
509
+ return "falling"
510
+ elif self.std and self.mean and self.std > 0.1 * abs(self.mean):
511
+ return "volatile"
512
+ else:
513
+ return "stable"
514
+
515
+ def health_status(self) -> str:
516
+ """Check health against industry standards. Returns 'healthy', 'warning', 'critical', 'unknown'."""
517
+ if self.current is None:
518
+ return "unknown"
519
+
520
+ name_lower = self.name.lower()
521
+ if name_lower in HEALTH_SPECS:
522
+ spec = HEALTH_SPECS[name_lower]
523
+ if spec.is_critical(self.current):
524
+ return "critical"
525
+ if not spec.is_healthy(self.current):
526
+ return "warning"
527
+ return "healthy"
528
+
529
+ # Default heuristics for unknown metrics
530
+ if self.is_anomaly(threshold_std=3.0):
531
+ return "critical"
532
+ if self.is_anomaly(threshold_std=2.0):
533
+ return "warning"
534
+ return "healthy"
535
+
536
+ def to_dict(self) -> Dict[str, Any]:
537
+ return {
538
+ "name": self.name,
539
+ "category": self.category.name,
540
+ "count": self.count,
541
+ "current": self.current,
542
+ "delta": self.delta,
543
+ "delta_pct": self.delta_pct,
544
+ "mean": self.mean,
545
+ "std": self.std,
546
+ "min": self.min,
547
+ "max": self.max,
548
+ "trend": self.trend(),
549
+ "health": self.health_status(),
550
+ "is_anomaly": self.is_anomaly(),
551
+ "rate_of_change": self.rate_of_change(),
552
+ }
553
+
554
+
555
+ @dataclass
556
+ class Anomaly:
557
+ """A detected anomaly in the metric stream."""
558
+ metric_name: str
559
+ category: MetricCategory
560
+ event_id: str
561
+ timestamp: float
562
+ value: float
563
+ expected_range: Tuple[float, float] # (low, high)
564
+ deviation_std: float
565
+ severity: str # 'minor', 'major', 'critical'
566
+
567
+
568
+ @dataclass
569
+ class Correlation:
570
+ """A detected correlation between two metrics."""
571
+ metric_a: str
572
+ metric_b: str
573
+ category_a: MetricCategory
574
+ category_b: MetricCategory
575
+ coefficient: float # -1 to 1
576
+ strength: str # 'weak', 'moderate', 'strong'
577
+ direction: str # 'positive', 'negative'
578
+
579
+
580
+ @dataclass
581
+ class ThresholdCrossing:
582
+ """A metric crossing a significant threshold."""
583
+ metric_name: str
584
+ category: MetricCategory
585
+ event_id: str
586
+ timestamp: float
587
+ old_value: float
588
+ new_value: float
589
+ threshold: float
590
+ direction: str # 'above', 'below'
591
+
592
+
593
+ class MetricsEngine:
594
+ """
595
+ Quantification engine for the event stream.
596
+
597
+ Extracts numeric metrics from events, tracks them over time,
598
+ detects anomalies, correlations, and threshold crossings.
599
+
600
+ Does NOT interpret or explain. Provides raw quantified data
601
+ for human or AI observers to divine meaning from.
602
+
603
+ Example:
604
+ >>> engine = MetricsEngine(graph)
605
+ >>> engine.ingest(event)
606
+ >>>
607
+ >>> # Get metric statistics
608
+ >>> loss = engine.get_metric("loss")
609
+ >>> print(f"Loss: {loss.current} (delta: {loss.delta}, trend: {loss.trend()})")
610
+ >>>
611
+ >>> # Get anomalies
612
+ >>> for anomaly in engine.anomalies:
613
+ ... print(f"ANOMALY: {anomaly.metric_name} = {anomaly.value}")
614
+ >>>
615
+ >>> # Get correlations
616
+ >>> for corr in engine.get_correlations():
617
+ ... print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")
618
+ """
619
+
620
+ def __init__(self, graph: Optional[CausationGraph] = None):
621
+ self.graph = graph
622
+ self._metrics: Dict[str, MetricSeries] = {}
623
+ self._anomalies: List[Anomaly] = []
624
+ self._threshold_crossings: List[ThresholdCrossing] = []
625
+ self._event_count = 0
626
+
627
+ # Configurable thresholds
628
+ self.anomaly_std_threshold = 2.5
629
+ self.correlation_min_samples = 10
630
+
631
+ # Known significant thresholds for ML metrics
632
+ self._known_thresholds = {
633
+ "loss": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
634
+ "accuracy": [0.5, 0.8, 0.9, 0.95, 0.99],
635
+ "lr": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
636
+ "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
637
+ "grad_norm": [0.1, 1.0, 10.0, 100.0],
638
+ "gradient_norm": [0.1, 1.0, 10.0, 100.0],
639
+ }
640
+
641
+ def ingest(self, event: Event) -> Dict[str, MetricSeries]:
642
+ """
643
+ Ingest an event and extract/track all numeric metrics.
644
+
645
+ Returns dict of updated metric series.
646
+ """
647
+ self._event_count += 1
648
+ updated = {}
649
+
650
+ for key, value in event.data.items():
651
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
652
+ category = classify_metric(key)
653
+
654
+ if math.isnan(value) or math.isinf(value):
655
+ # Track NaN/Inf as anomalies but don't add to series
656
+ self._anomalies.append(Anomaly(
657
+ metric_name=key,
658
+ category=category,
659
+ event_id=event.event_id,
660
+ timestamp=event.timestamp,
661
+ value=value,
662
+ expected_range=(0, 0),
663
+ deviation_std=float('inf'),
664
+ severity='critical',
665
+ ))
666
+ continue
667
+
668
+ # Get or create metric series with proper category
669
+ if key not in self._metrics:
670
+ self._metrics[key] = MetricSeries(name=key, category=category)
671
+
672
+ series = self._metrics[key]
673
+ old_value = series.current
674
+
675
+ # Add new value
676
+ series.values.append(float(value))
677
+ series.timestamps.append(event.timestamp)
678
+ series.event_ids.append(event.event_id)
679
+
680
+ # Check for anomaly
681
+ if series.is_anomaly(self.anomaly_std_threshold):
682
+ deviation = abs(value - series.mean) / series.std if series.std else 0
683
+ severity = 'critical' if deviation > 4 else 'major' if deviation > 3 else 'minor'
684
+ self._anomalies.append(Anomaly(
685
+ metric_name=key,
686
+ category=category,
687
+ event_id=event.event_id,
688
+ timestamp=event.timestamp,
689
+ value=value,
690
+ expected_range=(
691
+ series.mean - 2*series.std,
692
+ series.mean + 2*series.std
693
+ ),
694
+ deviation_std=deviation,
695
+ severity=severity,
696
+ ))
697
+
698
+ # Check for threshold crossing
699
+ if old_value is not None:
700
+ self._check_threshold_crossing(
701
+ key, event.event_id, event.timestamp, old_value, value
702
+ )
703
+
704
+ updated[key] = series
705
+
706
+ return updated
707
+
708
+ def _check_threshold_crossing(
709
+ self,
710
+ metric: str,
711
+ event_id: str,
712
+ timestamp: float,
713
+ old_value: float,
714
+ new_value: float
715
+ ):
716
+ """Check if a metric crossed a known threshold."""
717
+ thresholds = self._known_thresholds.get(metric, [])
718
+ category = classify_metric(metric)
719
+
720
+ for threshold in thresholds:
721
+ # Crossed upward
722
+ if old_value < threshold <= new_value:
723
+ self._threshold_crossings.append(ThresholdCrossing(
724
+ metric_name=metric,
725
+ category=category,
726
+ event_id=event_id,
727
+ timestamp=timestamp,
728
+ old_value=old_value,
729
+ new_value=new_value,
730
+ threshold=threshold,
731
+ direction='above',
732
+ ))
733
+ # Crossed downward
734
+ elif old_value > threshold >= new_value:
735
+ self._threshold_crossings.append(ThresholdCrossing(
736
+ metric_name=metric,
737
+ category=category,
738
+ event_id=event_id,
739
+ timestamp=timestamp,
740
+ old_value=old_value,
741
+ new_value=new_value,
742
+ threshold=threshold,
743
+ direction='below',
744
+ ))
745
+
746
+ def get_metric(self, name: str) -> Optional[MetricSeries]:
747
+ """Get a metric series by name."""
748
+ return self._metrics.get(name)
749
+
750
+ @property
751
+ def metrics(self) -> Dict[str, MetricSeries]:
752
+ """All tracked metrics."""
753
+ return self._metrics
754
+
755
+ @property
756
+ def metric_names(self) -> List[str]:
757
+ """Names of all tracked metrics."""
758
+ return list(self._metrics.keys())
759
+
760
+ @property
761
+ def anomalies(self) -> List[Anomaly]:
762
+ """All detected anomalies."""
763
+ return self._anomalies
764
+
765
+ @property
766
+ def recent_anomalies(self) -> List[Anomaly]:
767
+ """Anomalies from last 10 events."""
768
+ if not self._anomalies:
769
+ return []
770
+ recent_ids = set()
771
+ for series in self._metrics.values():
772
+ recent_ids.update(series.event_ids[-10:])
773
+ return [a for a in self._anomalies if a.event_id in recent_ids]
774
+
775
+ @property
776
+ def threshold_crossings(self) -> List[ThresholdCrossing]:
777
+ """All threshold crossings."""
778
+ return self._threshold_crossings
779
+
780
+ def get_correlations(self, min_coefficient: float = 0.5) -> List[Correlation]:
781
+ """
782
+ Compute correlations between all metric pairs.
783
+
784
+ Returns correlations with |coefficient| >= min_coefficient.
785
+ """
786
+ correlations = []
787
+ metric_names = list(self._metrics.keys())
788
+
789
+ for i, name_a in enumerate(metric_names):
790
+ series_a = self._metrics[name_a]
791
+ for name_b in metric_names[i+1:]:
792
+ series_b = self._metrics[name_b]
793
+ coef = self._pearson_correlation(name_a, name_b)
794
+ if coef is not None and abs(coef) >= min_coefficient:
795
+ strength = 'strong' if abs(coef) > 0.8 else 'moderate' if abs(coef) > 0.5 else 'weak'
796
+ direction = 'positive' if coef > 0 else 'negative'
797
+ correlations.append(Correlation(
798
+ metric_a=name_a,
799
+ metric_b=name_b,
800
+ category_a=series_a.category,
801
+ category_b=series_b.category,
802
+ coefficient=coef,
803
+ strength=strength,
804
+ direction=direction,
805
+ ))
806
+
807
+ return sorted(correlations, key=lambda c: abs(c.coefficient), reverse=True)
808
+
809
+ def _pearson_correlation(self, name_a: str, name_b: str) -> Optional[float]:
810
+ """Compute Pearson correlation between two metrics."""
811
+ series_a = self._metrics.get(name_a)
812
+ series_b = self._metrics.get(name_b)
813
+
814
+ if not series_a or not series_b:
815
+ return None
816
+
817
+ # Need enough samples
818
+ if series_a.count < self.correlation_min_samples or series_b.count < self.correlation_min_samples:
819
+ return None
820
+
821
+ # Align by taking min length
822
+ n = min(series_a.count, series_b.count)
823
+ a = series_a.values[-n:]
824
+ b = series_b.values[-n:]
825
+
826
+ # Compute correlation
827
+ mean_a = sum(a) / n
828
+ mean_b = sum(b) / n
829
+
830
+ numerator = sum((a[i] - mean_a) * (b[i] - mean_b) for i in range(n))
831
+
832
+ var_a = sum((x - mean_a) ** 2 for x in a)
833
+ var_b = sum((x - mean_b) ** 2 for x in b)
834
+
835
+ denominator = math.sqrt(var_a * var_b)
836
+
837
+ if denominator == 0:
838
+ return None
839
+
840
+ return numerator / denominator
841
+
842
+ def summary(self) -> Dict[str, Any]:
843
+ """Get a summary of all metrics and detections."""
844
+ return {
845
+ "event_count": self._event_count,
846
+ "metric_count": len(self._metrics),
847
+ "metrics": {name: series.to_dict() for name, series in self._metrics.items()},
848
+ "metrics_by_category": self.metrics_by_category_summary(),
849
+ "anomaly_count": len(self._anomalies),
850
+ "recent_anomalies": [
851
+ {"metric": a.metric_name, "category": a.category.name, "value": a.value, "severity": a.severity}
852
+ for a in self.recent_anomalies
853
+ ],
854
+ "threshold_crossings": len(self._threshold_crossings),
855
+ "correlations": [
856
+ {"a": c.metric_a, "b": c.metric_b, "r": c.coefficient,
857
+ "cat_a": c.category_a.name, "cat_b": c.category_b.name}
858
+ for c in self.get_correlations()[:5] # Top 5
859
+ ],
860
+ "health_status": self.health_summary(),
861
+ }
862
+
863
+ # =========================================================================
864
+ # CATEGORY-AWARE QUERIES
865
+ # =========================================================================
866
+
867
+ def get_metrics_by_category(self, category: MetricCategory) -> Dict[str, MetricSeries]:
868
+ """Get all metrics in a specific category."""
869
+ return {
870
+ name: series for name, series in self._metrics.items()
871
+ if series.category == category
872
+ }
873
+
874
+ def metrics_by_category_summary(self) -> Dict[str, Dict[str, Any]]:
875
+ """Get metric count and names grouped by category."""
876
+ by_cat: Dict[str, Dict[str, Any]] = {}
877
+ for name, series in self._metrics.items():
878
+ cat_name = series.category.name
879
+ if cat_name not in by_cat:
880
+ by_cat[cat_name] = {"count": 0, "metrics": [], "health": []}
881
+ by_cat[cat_name]["count"] += 1
882
+ by_cat[cat_name]["metrics"].append(name)
883
+ by_cat[cat_name]["health"].append(series.health_status())
884
+ return by_cat
885
+
886
+ def get_training_metrics(self) -> Dict[str, MetricSeries]:
887
+ """Convenience: get all TRAINING_DYNAMICS metrics."""
888
+ return self.get_metrics_by_category(MetricCategory.TRAINING_DYNAMICS)
889
+
890
+ def get_gradient_metrics(self) -> Dict[str, MetricSeries]:
891
+ """Convenience: get all GRADIENT_HEALTH metrics."""
892
+ return self.get_metrics_by_category(MetricCategory.GRADIENT_HEALTH)
893
+
894
+ def get_memory_metrics(self) -> Dict[str, MetricSeries]:
895
+ """Convenience: get all MEMORY_COMPUTE metrics."""
896
+ return self.get_metrics_by_category(MetricCategory.MEMORY_COMPUTE)
897
+
898
+ def get_convergence_metrics(self) -> Dict[str, MetricSeries]:
899
+ """Convenience: get all CONVERGENCE_SIGNALS metrics."""
900
+ return self.get_metrics_by_category(MetricCategory.CONVERGENCE_SIGNALS)
901
+
902
+ def health_summary(self) -> Dict[str, Any]:
903
+ """Get overall health status of all metrics."""
904
+ statuses = {"healthy": 0, "warning": 0, "critical": 0, "unknown": 0}
905
+ issues = []
906
+
907
+ for name, series in self._metrics.items():
908
+ status = series.health_status()
909
+ statuses[status] += 1
910
+ if status in ("warning", "critical"):
911
+ issues.append({
912
+ "metric": name,
913
+ "category": series.category.name,
914
+ "status": status,
915
+ "value": series.current,
916
+ "trend": series.trend(),
917
+ })
918
+
919
+ overall = "critical" if statuses["critical"] > 0 else \
920
+ "warning" if statuses["warning"] > 0 else "healthy"
921
+
922
+ return {
923
+ "overall": overall,
924
+ "counts": statuses,
925
+ "issues": issues,
926
+ }
927
+
928
+ def get_cross_category_correlations(self) -> List[Correlation]:
929
+ """Get correlations between metrics in different categories."""
930
+ all_corr = self.get_correlations(min_coefficient=0.3)
931
+ return [c for c in all_corr if c.category_a != c.category_b]
932
+
933
+ def get_category_coverage(self) -> Dict[str, bool]:
934
+ """Check which metric categories are being tracked."""
935
+ tracked = {series.category for series in self._metrics.values()}
936
+ return {cat.name: cat in tracked for cat in MetricCategory}
937
+
938
+ # =========================================================================
939
+ # TRIAGE SYSTEM - Common Sense Diagnostics (Occam's Razor)
940
+ # =========================================================================
941
+ #
942
+ # Five questions that matter:
943
+ # 1. Is training working? (loss trend)
944
+ # 2. Is it about to explode? (gradient health)
945
+ # 3. Am I wasting compute? (efficiency)
946
+ # 4. Am I overfitting? (generalization gap)
947
+ # 5. What broke and why? (anomaly + correlation)
948
+ #
949
+
950
+ def triage(self) -> Dict[str, Any]:
951
+ """
952
+ Quick diagnostic: Is training healthy? What's wrong?
953
+
954
+ Returns a simple, actionable assessment.
955
+ Occam's Razor: simplest useful answer.
956
+ """
957
+ diagnosis = {
958
+ "status": "LISTENING", # Not UNKNOWN - we're actively waiting
959
+ "confidence": 0.0,
960
+ "checks": {},
961
+ "action": "Collecting initial metrics...",
962
+ "details": [],
963
+ }
964
+
965
+ checks_passed = 0
966
+ checks_total = 0
967
+
968
+ # CHECK 1: Is loss going down?
969
+ loss_check = self._check_loss_progress()
970
+ diagnosis["checks"]["loss_progress"] = loss_check
971
+ checks_total += 1
972
+ if loss_check["ok"]:
973
+ checks_passed += 1
974
+
975
+ # CHECK 2: Are gradients healthy?
976
+ grad_check = self._check_gradient_health()
977
+ diagnosis["checks"]["gradient_health"] = grad_check
978
+ checks_total += 1
979
+ if grad_check["ok"]:
980
+ checks_passed += 1
981
+
982
+ # CHECK 3: Am I using compute efficiently?
983
+ efficiency_check = self._check_efficiency()
984
+ diagnosis["checks"]["efficiency"] = efficiency_check
985
+ checks_total += 1
986
+ if efficiency_check["ok"]:
987
+ checks_passed += 1
988
+
989
+ # CHECK 4: Am I overfitting?
990
+ overfit_check = self._check_overfitting()
991
+ diagnosis["checks"]["overfitting"] = overfit_check
992
+ checks_total += 1
993
+ if overfit_check["ok"]:
994
+ checks_passed += 1
995
+
996
+ # CHECK 5: Any anomalies pointing to root cause?
997
+ anomaly_check = self._check_anomalies()
998
+ diagnosis["checks"]["anomalies"] = anomaly_check
999
+ checks_total += 1
1000
+ if anomaly_check["ok"]:
1001
+ checks_passed += 1
1002
+
1003
+ # Overall status
1004
+ diagnosis["confidence"] = checks_passed / checks_total if checks_total > 0 else 0
1005
+
1006
+ if checks_passed == checks_total:
1007
+ diagnosis["status"] = "HEALTHY"
1008
+ diagnosis["action"] = "Training looks good. Continue monitoring."
1009
+ elif checks_passed >= checks_total * 0.6:
1010
+ diagnosis["status"] = "WARNING"
1011
+ # Find what's wrong
1012
+ issues = [k for k, v in diagnosis["checks"].items() if not v["ok"]]
1013
+ diagnosis["action"] = f"Review: {', '.join(issues)}"
1014
+ else:
1015
+ diagnosis["status"] = "CRITICAL"
1016
+ diagnosis["action"] = "Stop and investigate. Multiple issues detected."
1017
+
1018
+ # Collect all details
1019
+ for check_name, check_result in diagnosis["checks"].items():
1020
+ if check_result.get("detail"):
1021
+ diagnosis["details"].append(f"{check_name}: {check_result['detail']}")
1022
+
1023
+ return diagnosis
1024
+
1025
+ def _check_loss_progress(self) -> Dict[str, Any]:
1026
+ """Is loss decreasing as expected?"""
1027
+ # Find loss metric (try common names)
1028
+ loss_series = None
1029
+ for name in ["loss", "train_loss", "nll_loss", "ce_loss"]:
1030
+ if name in self._metrics:
1031
+ loss_series = self._metrics[name]
1032
+ break
1033
+
1034
+ if loss_series is None or loss_series.count < 3:
1035
+ return {"ok": True, "detail": "Waiting for loss metrics (need 3+)", "status": "waiting"}
1036
+
1037
+ trend = loss_series.trend()
1038
+ roc = loss_series.rate_of_change()
1039
+
1040
+ if trend == "falling":
1041
+ return {"ok": True, "detail": f"Loss falling (Δ={roc:.4f}/step)", "status": "good"}
1042
+ elif trend == "stable" and loss_series.current < 1.0:
1043
+ return {"ok": True, "detail": f"Loss stable at {loss_series.current:.4f}", "status": "converged"}
1044
+ elif trend == "rising":
1045
+ return {"ok": False, "detail": f"Loss RISING! Current: {loss_series.current:.4f}", "status": "diverging"}
1046
+ elif trend == "volatile":
1047
+ return {"ok": False, "detail": f"Loss unstable (std={loss_series.std:.4f})", "status": "unstable"}
1048
+ else:
1049
+ return {"ok": True, "detail": f"Loss: {loss_series.current:.4f} (trend unclear)", "status": "stable"}
1050
+
1051
+ def _check_gradient_health(self) -> Dict[str, Any]:
1052
+ """Are gradients in a healthy range?"""
1053
+ grad_series = None
1054
+ for name in ["grad_norm", "gradient_norm", "global_grad_norm"]:
1055
+ if name in self._metrics:
1056
+ grad_series = self._metrics[name]
1057
+ break
1058
+
1059
+ if grad_series is None or grad_series.count < 2:
1060
+ return {"ok": True, "detail": "Waiting for grad_norm metrics", "status": "waiting"}
1061
+
1062
+ current = grad_series.current
1063
+
1064
+ # Vanishing gradients
1065
+ if current < 1e-7:
1066
+ return {"ok": False, "detail": f"VANISHING gradients: {current:.2e}", "status": "vanishing"}
1067
+
1068
+ # Exploding gradients
1069
+ if current > 100:
1070
+ return {"ok": False, "detail": f"EXPLODING gradients: {current:.2f}", "status": "exploding"}
1071
+
1072
+ # Healthy range
1073
+ if 1e-5 < current < 10:
1074
+ return {"ok": True, "detail": f"Gradients healthy: {current:.4f}", "status": "healthy"}
1075
+
1076
+ # Warning zone
1077
+ return {"ok": True, "detail": f"Gradients marginal: {current:.4f}", "status": "marginal"}
1078
+
1079
+ def _check_efficiency(self) -> Dict[str, Any]:
1080
+ """Am I using compute efficiently?"""
1081
+ # Check MFU (Model FLOP Utilization)
1082
+ mfu_series = self._metrics.get("mfu")
1083
+ if mfu_series and mfu_series.count > 0:
1084
+ mfu = mfu_series.current
1085
+ if mfu < 0.1:
1086
+ return {"ok": False, "detail": f"Low GPU utilization: {mfu*100:.1f}%", "status": "inefficient"}
1087
+ elif mfu < 0.3:
1088
+ return {"ok": True, "detail": f"Moderate efficiency: {mfu*100:.1f}%", "status": "moderate"}
1089
+ else:
1090
+ return {"ok": True, "detail": f"Good efficiency: {mfu*100:.1f}%", "status": "efficient"}
1091
+
1092
+ # Fallback: check timing
1093
+ time_series = self._metrics.get("dt") or self._metrics.get("time") or self._metrics.get("batch_time")
1094
+ if time_series and time_series.count > 2:
1095
+ trend = time_series.trend()
1096
+ if trend == "rising":
1097
+ return {"ok": False, "detail": "Step time increasing (slowdown)", "status": "degrading"}
1098
+ return {"ok": True, "detail": f"Step time: {time_series.current:.3f}s", "status": "stable"}
1099
+
1100
+ return {"ok": True, "detail": "Need mfu or dt/time metrics", "status": "waiting"}
1101
+
1102
+ def _check_overfitting(self) -> Dict[str, Any]:
1103
+ """Is model overfitting?"""
1104
+ train_loss = None
1105
+ val_loss = None
1106
+
1107
+ # Find train and val loss
1108
+ for name in ["loss", "train_loss"]:
1109
+ if name in self._metrics:
1110
+ train_loss = self._metrics[name]
1111
+ break
1112
+
1113
+ for name in ["val_loss", "eval_loss", "test_loss"]:
1114
+ if name in self._metrics:
1115
+ val_loss = self._metrics[name]
1116
+ break
1117
+
1118
+ if train_loss is None or val_loss is None:
1119
+ return {"ok": True, "detail": "Need train_loss + val_loss to check", "status": "waiting"}
1120
+
1121
+ if train_loss.count < 3 or val_loss.count < 3:
1122
+ return {"ok": True, "detail": f"Collecting ({train_loss.count}/3 train, {val_loss.count}/3 val)", "status": "waiting"}
1123
+
1124
+ gap = val_loss.current - train_loss.current
1125
+ gap_pct = gap / train_loss.current if train_loss.current > 0 else 0
1126
+
1127
+ # Check if gap is widening
1128
+ train_trend = train_loss.trend()
1129
+ val_trend = val_loss.trend()
1130
+
1131
+ if train_trend == "falling" and val_trend == "rising":
1132
+ return {"ok": False, "detail": f"OVERFITTING: train↓ val↑ (gap={gap:.4f})", "status": "overfitting"}
1133
+
1134
+ if gap_pct > 0.5: # Val loss 50% higher than train
1135
+ return {"ok": False, "detail": f"Large generalization gap: {gap_pct*100:.1f}%", "status": "high_gap"}
1136
+
1137
+ if gap_pct > 0.2:
1138
+ return {"ok": True, "detail": f"Moderate gap: {gap_pct*100:.1f}%", "status": "moderate_gap"}
1139
+
1140
+ return {"ok": True, "detail": f"Good generalization (gap={gap:.4f})", "status": "healthy"}
1141
+
1142
+ def _check_anomalies(self) -> Dict[str, Any]:
1143
+ """Any recent anomalies that need attention?"""
1144
+ recent = self.recent_anomalies
1145
+
1146
+ if not recent:
1147
+ return {"ok": True, "detail": "No anomalies", "status": "clean"}
1148
+
1149
+ critical = [a for a in recent if a.severity == "critical"]
1150
+ major = [a for a in recent if a.severity == "major"]
1151
+
1152
+ if critical:
1153
+ names = list(set(a.metric_name for a in critical))
1154
+ return {"ok": False, "detail": f"CRITICAL anomalies in: {', '.join(names)}", "status": "critical"}
1155
+
1156
+ if major:
1157
+ names = list(set(a.metric_name for a in major))
1158
+ return {"ok": False, "detail": f"Major anomalies in: {', '.join(names)}", "status": "major"}
1159
+
1160
+ return {"ok": True, "detail": f"{len(recent)} minor anomalies", "status": "minor"}
1161
+
1162
+ def quick_status(self) -> str:
1163
+ """One-line status for dashboards."""
1164
+ t = self.triage()
1165
+ return f"[{t['status']}] {t['action']} (confidence: {t['confidence']*100:.0f}%)"
1166
+
1167
+ def __repr__(self) -> str:
1168
+ return f"<MetricsEngine | {len(self._metrics)} metrics, {len(self._anomalies)} anomalies>"
cascade/analysis/tracer.py ADDED
@@ -0,0 +1,487 @@
1
+ """
2
+ Cascade Analysis - Bidirectional Causation Tracer.
3
+
4
+ Trace cause-effect chains forwards and backwards through time.
5
+ Find root causes. Predict cascading effects.
6
+ """
7
+
8
+ from typing import List, Dict, Any, Optional, Set
9
+ from collections import deque
10
+ from dataclasses import dataclass, field
11
+
12
+ from cascade.core.event import Event, CausationLink, CausationChain
13
+ from cascade.core.graph import CausationGraph
14
+
15
+
16
+ @dataclass
17
+ class RootCauseAnalysis:
18
+ """Results of a root cause analysis."""
19
+ target_event: Event
20
+ root_causes: List[Event]
21
+ chains: List[CausationChain]
22
+ deepest_depth: int = 0
23
+ narrative: str = ""
24
+
25
+
26
+ @dataclass
27
+ class ImpactAnalysis:
28
+ """Results of an impact/forward analysis."""
29
+ source_event: Event
30
+ effects: List[Event]
31
+ chains: List[CausationChain]
32
+ total_impact_count: int = 0
33
+ severity_score: float = 0.0
34
+ narrative: str = ""
35
+
36
+
37
+ @dataclass
38
+ class CascadePrediction:
39
+ """Prediction of likely cascade from an event."""
40
+ source_event: Event
41
+ predicted_effects: List[Dict[str, Any]] # [{event_type, probability, time_estimate}, ...]
42
+ risk_score: float = 0.0
43
+ intervention_points: List[str] = field(default_factory=list)
44
+ narrative: str = ""
45
+
46
+
47
+ class Tracer:
48
+ """
49
+ Bidirectional causation tracer.
50
+
51
+ Traces cause-effect chains through the causation graph:
52
+ - Backwards: "What caused this?" → find root causes
53
+ - Forwards: "What will this cause?" → predict cascades
54
+
55
+ Example:
56
+ >>> tracer = Tracer(graph)
57
+ >>>
58
+ >>> # What caused this gradient explosion?
59
+ >>> causes = tracer.trace_backwards("evt_123")
60
+ >>>
61
+ >>> # What will this learning rate change cause?
62
+ >>> effects = tracer.trace_forwards("evt_456")
63
+ >>>
64
+ >>> # Deep root cause analysis
65
+ >>> roots = tracer.find_root_causes("evt_789")
66
+ """
67
+
68
+ def __init__(self, graph: CausationGraph):
69
+ """
70
+ Initialize tracer with a causation graph.
71
+
72
+ Args:
73
+ graph: The causation graph to trace through
74
+ """
75
+ self.graph = graph
76
+ self._prediction_model = None # Future: ML model for predictions
77
+
78
+ def trace_backwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
79
+ """
80
+ Trace causation backwards: what caused this event?
81
+
82
+ Args:
83
+ event_id: ID of the event to trace from
84
+ max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)
85
+
86
+ Returns:
87
+ List of CausationChain objects, one per causal path found
88
+ """
89
+ target = self.graph.get_event(event_id)
90
+ if not target:
91
+ return []
92
+
93
+ chains = []
94
+ self._trace_backwards_recursive(event_id, [], [], max_depth, chains)
95
+
96
+ # Sort by depth (longest chain first for root cause analysis)
97
+ chains.sort(key=lambda c: c.depth, reverse=True)
98
+ return chains
99
+
100
+ def _trace_backwards_recursive(
101
+ self,
102
+ current_id: str,
103
+ current_events: List[Event],
104
+ current_links: List[CausationLink],
105
+ depth_remaining: int,
106
+ results: List[CausationChain],
107
+ visited: Optional[Set[str]] = None
108
+ ) -> None:
109
+ """Recursive helper for backwards tracing."""
110
+ if visited is None:
111
+ visited = set()
112
+
113
+ if current_id in visited:
114
+ return # Avoid cycles
115
+ visited.add(current_id)
116
+
117
+ current_event = self.graph.get_event(current_id)
118
+ if not current_event:
119
+ return
120
+
121
+ current_events = [current_event] + current_events
122
+
123
+ if depth_remaining <= 0:
124
+ # Max depth reached, record this chain
125
+ if len(current_events) > 1:
126
+ results.append(self._build_chain(current_events, current_links))
127
+ return
128
+
129
+ causes = self.graph.get_causes(current_id)
130
+
131
+ if not causes:
132
+ # This is a root - record the chain
133
+ if len(current_events) >= 1:
134
+ results.append(self._build_chain(current_events, current_links))
135
+ return
136
+
137
+ for cause in causes:
138
+ link = self.graph.get_link(cause.event_id, current_id)
139
+ new_links = [link] + current_links if link else current_links
140
+
141
+ self._trace_backwards_recursive(
142
+ cause.event_id,
143
+ current_events,
144
+ new_links,
145
+ depth_remaining - 1,
146
+ results,
147
+ visited.copy()
148
+ )
149
+
150
+ def trace_forwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
151
+ """
152
+ Trace causation forwards: what will this event cause?
153
+
154
+ Args:
155
+ event_id: ID of the event to trace from
156
+ max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)
157
+
158
+ Returns:
159
+ List of CausationChain objects, one per effect path found
160
+ """
161
+ source = self.graph.get_event(event_id)
162
+ if not source:
163
+ return []
164
+
165
+ chains = []
166
+ self._trace_forwards_recursive(event_id, [], [], max_depth, chains)
167
+
168
+ # Sort by depth
169
+ chains.sort(key=lambda c: c.depth, reverse=True)
170
+ return chains
171
+
172
+ def _trace_forwards_recursive(
173
+ self,
174
+ current_id: str,
175
+ current_events: List[Event],
176
+ current_links: List[CausationLink],
177
+ depth_remaining: int,
178
+ results: List[CausationChain],
179
+ visited: Optional[Set[str]] = None
180
+ ) -> None:
181
+ """Recursive helper for forwards tracing."""
182
+ if visited is None:
183
+ visited = set()
184
+
185
+ if current_id in visited:
186
+ return
187
+ visited.add(current_id)
188
+
189
+ current_event = self.graph.get_event(current_id)
190
+ if not current_event:
191
+ return
192
+
193
+ current_events = current_events + [current_event]
194
+
195
+ if depth_remaining <= 0:
196
+ if len(current_events) > 1:
197
+ results.append(self._build_chain(current_events, current_links))
198
+ return
199
+
200
+ effects = self.graph.get_effects(current_id)
201
+
202
+ if not effects:
203
+ # This is a leaf - record the chain
204
+ if len(current_events) >= 1:
205
+ results.append(self._build_chain(current_events, current_links))
206
+ return
207
+
208
+ for effect in effects:
209
+ link = self.graph.get_link(current_id, effect.event_id)
210
+ new_links = current_links + [link] if link else current_links
211
+
212
+ self._trace_forwards_recursive(
213
+ effect.event_id,
214
+ current_events,
215
+ new_links,
216
+ depth_remaining - 1,
217
+ results,
218
+ visited.copy()
219
+ )
220
+
221
+ def find_root_causes(self, event_id: str, max_depth: int = 1000) -> RootCauseAnalysis:
222
+ """
223
+ Deep root cause analysis: find the ultimate origins.
224
+
225
+ Traces all the way back to find events with no causes.
226
+
227
+ Args:
228
+ event_id: ID of the event to analyze
229
+ max_depth: Maximum depth to search (default: 1000 - effectively unlimited)
230
+
231
+ Returns:
232
+ RootCauseAnalysis with root causes and narrative
233
+ """
234
+ target = self.graph.get_event(event_id)
235
+ if not target:
236
+ return RootCauseAnalysis(
237
+ target_event=None,
238
+ root_causes=[],
239
+ chains=[],
240
+ )
241
+
242
+ chains = self.trace_backwards(event_id, max_depth)
243
+
244
+ # Extract root causes (events at the start of chains)
245
+ root_causes = []
246
+ seen = set()
247
+ for chain in chains:
248
+ if chain.events:
249
+ root = chain.events[0]
250
+ if root.event_id not in seen:
251
+ root_causes.append(root)
252
+ seen.add(root.event_id)
253
+
254
+ # Build narrative
255
+ narrative = self._build_root_cause_narrative(target, root_causes, chains)
256
+
257
+ return RootCauseAnalysis(
258
+ target_event=target,
259
+ root_causes=root_causes,
260
+ chains=chains,
261
+ deepest_depth=max(c.depth for c in chains) if chains else 0,
262
+ narrative=narrative,
263
+ )
264
+
265
+ def analyze_impact(self, event_id: str, max_depth: int = 1000) -> ImpactAnalysis:
266
+ """
267
+ Impact analysis: what were ALL downstream effects?
268
+
269
+ Traces forward to find everything this event set in motion.
270
+
271
+ Args:
272
+ event_id: ID of the event to analyze
273
+ max_depth: Maximum depth to search (default: 1000 - effectively unlimited)
274
+
275
+ Returns:
276
+ ImpactAnalysis with effects and severity score
277
+ """
278
+ source = self.graph.get_event(event_id)
279
+ if not source:
280
+ return ImpactAnalysis(
281
+ source_event=None,
282
+ effects=[],
283
+ chains=[],
284
+ )
285
+
286
+ chains = self.trace_forwards(event_id, max_depth)
287
+
288
+ # Extract all effects
289
+ effects = []
290
+ seen = set()
291
+ for chain in chains:
292
+ for event in chain.events[1:]: # Skip source
293
+ if event.event_id not in seen:
294
+ effects.append(event)
295
+ seen.add(event.event_id)
296
+
297
+ # Calculate severity
298
+ severity = self._calculate_impact_severity(source, effects)
299
+
300
+ # Build narrative
301
+ narrative = self._build_impact_narrative(source, effects, chains)
302
+
303
+ return ImpactAnalysis(
304
+ source_event=source,
305
+ effects=effects,
306
+ chains=chains,
307
+ total_impact_count=len(effects),
308
+ severity_score=severity,
309
+ narrative=narrative,
310
+ )
311
+
312
+ def predict_cascade(self, event_id: str) -> CascadePrediction:
313
+ """
314
+ Predict likely cascade from this event.
315
+
316
+ Uses learned patterns to forecast effects BEFORE they happen.
317
+ This is the "Minority Report" capability.
318
+
319
+ Args:
320
+ event_id: ID of the event to predict from
321
+
322
+ Returns:
323
+ CascadePrediction with risk scores and intervention points
324
+ """
325
+ source = self.graph.get_event(event_id)
326
+ if not source:
327
+ return CascadePrediction(
328
+ source_event=None,
329
+ predicted_effects=[],
330
+ )
331
+
332
+ # Get historical patterns for this event type
333
+ similar_events = self.graph.get_events_by_type(source.event_type)
334
+
335
+ # Count what typically follows - use all available history for better predictions
336
+ # No artificial cap - system learns from full history
337
+ effect_counts: Dict[str, int] = {}
338
+ analysis_window = similar_events # Full history, no slice
339
+ for similar in analysis_window:
340
+ effects = self.graph.get_effects(similar.event_id)
341
+ for effect in effects:
342
+ key = effect.event_type
343
+ effect_counts[key] = effect_counts.get(key, 0) + 1
344
+
345
+ # Convert to predictions
346
+ total = len(analysis_window)
347
+ predictions = []
348
+ for event_type, count in sorted(effect_counts.items(), key=lambda x: -x[1]):
349
+ predictions.append({
350
+ "event_type": event_type,
351
+ "probability": count / total if total > 0 else 0,
352
+ "historical_count": count,
353
+ })
354
+
355
+ # Calculate risk score
356
+ risk_score = self._calculate_risk_score(source, predictions)
357
+
358
+ # Identify intervention points
359
+ intervention_points = self._find_intervention_points(source, predictions)
360
+
361
+ return CascadePrediction(
362
+ source_event=source,
363
+ predicted_effects=predictions[:10], # Top 10
364
+ risk_score=risk_score,
365
+ intervention_points=intervention_points,
366
+ narrative=f"Based on {total} similar events, predicting {len(predictions)} likely effects.",
367
+ )
368
+
369
+ def _build_chain(self, events: List[Event], links: List[CausationLink]) -> CausationChain:
370
+ """Build a CausationChain from events and links."""
371
+ total_strength = 1.0
372
+ for link in links:
373
+ total_strength *= link.strength
374
+
375
+ return CausationChain(
376
+ events=events,
377
+ links=links,
378
+ total_strength=total_strength,
379
+ depth=len(links),
380
+ )
381
+
382
+ def _build_root_cause_narrative(
383
+ self,
384
+ target: Event,
385
+ roots: List[Event],
386
+ chains: List[CausationChain]
387
+ ) -> str:
388
+ """Build human-readable narrative for root cause analysis."""
389
+ if not roots:
390
+ return f"No root causes found for {target.event_type}"
391
+
392
+ lines = [f"Root cause analysis for {target.event_type}:"]
393
+ lines.append(f"Found {len(roots)} root cause(s) across {len(chains)} causal chain(s).")
394
+ lines.append("")
395
+
396
+ for i, root in enumerate(roots[:5], 1): # Top 5
397
+ lines.append(f"{i}. {root.component}/{root.event_type}")
398
+ if root.data:
399
+ key_data = list(root.data.items())[:3]
400
+ lines.append(f" Data: {dict(key_data)}")
401
+
402
+ return "\n".join(lines)
403
+
404
+ def _build_impact_narrative(
405
+ self,
406
+ source: Event,
407
+ effects: List[Event],
408
+ chains: List[CausationChain]
409
+ ) -> str:
410
+ """Build human-readable narrative for impact analysis."""
411
+ if not effects:
412
+ return f"No downstream effects found for {source.event_type}"
413
+
414
+ lines = [f"Impact analysis for {source.event_type}:"]
415
+ lines.append(f"Found {len(effects)} downstream effect(s).")
416
+ lines.append("")
417
+
418
+ # Group by event type
419
+ by_type: Dict[str, int] = {}
420
+ for effect in effects:
421
+ by_type[effect.event_type] = by_type.get(effect.event_type, 0) + 1
422
+
423
+ for event_type, count in sorted(by_type.items(), key=lambda x: -x[1]):
424
+ lines.append(f" • {event_type}: {count} occurrence(s)")
425
+
426
+ return "\n".join(lines)
427
+
428
+ def _calculate_impact_severity(self, source: Event, effects: List[Event]) -> float:
429
+ """Calculate severity score for an impact (0.0 to 1.0)."""
430
+ if not effects:
431
+ return 0.0
432
+
433
+ # Factors: number of effects, types of effects
434
+ count_score = min(1.0, len(effects) / 20) # 20+ effects = max
435
+
436
+ # High-severity event types
437
+ severe_types = {'error', 'anomaly', 'crash', 'failure', 'explosion'}
438
+ severe_count = sum(1 for e in effects if e.event_type in severe_types)
439
+ severity_score = min(1.0, severe_count / 5)
440
+
441
+ return (count_score + severity_score) / 2
442
+
443
+ def _calculate_risk_score(
444
+ self,
445
+ source: Event,
446
+ predictions: List[Dict[str, Any]]
447
+ ) -> float:
448
+ """Calculate risk score for a cascade prediction."""
449
+ if not predictions:
450
+ return 0.0
451
+
452
+ # High-risk event types
453
+ risky_types = {'error', 'anomaly', 'crash', 'failure', 'explosion', 'nan', 'overflow'}
454
+
455
+ risk = 0.0
456
+ for pred in predictions:
457
+ if pred["event_type"] in risky_types:
458
+ risk += pred["probability"] * 2 # Double weight for risky
459
+ else:
460
+ risk += pred["probability"] * 0.5
461
+
462
+ return min(1.0, risk)
463
+
464
+ def _find_intervention_points(
465
+ self,
466
+ source: Event,
467
+ predictions: List[Dict[str, Any]]
468
+ ) -> List[str]:
469
+ """Identify points where intervention could prevent bad cascades."""
470
+ points = []
471
+
472
+ # Look at source event data for intervention hints
473
+ if 'learning_rate' in source.data:
474
+ points.append("Reduce learning rate")
475
+ if 'gradient' in source.event_type.lower():
476
+ points.append("Apply gradient clipping")
477
+ if source.data.get('loss', 0) > 10:
478
+ points.append("Check loss function / data")
479
+
480
+ # Check predictions for severe outcomes
481
+ for pred in predictions:
482
+ if pred["event_type"] == "nan" and pred["probability"] > 0.3:
483
+ points.append("Enable NaN detection early stopping")
484
+ if pred["event_type"] == "overflow" and pred["probability"] > 0.3:
485
+ points.append("Apply gradient scaling")
486
+
487
+ return points
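Before moving on to the next module, here is a short sketch of how the three analysis entry points above fit together. It assumes `graph` is a populated `CausationGraph` and that `"evt_789"` is an event id present in it; both are placeholders, not values from this repository.

```python
# Sketch: root-cause, impact, and cascade-prediction analyses from above.
# `graph` and "evt_789" are placeholder assumptions.
from cascade.analysis.tracer import Tracer

tracer = Tracer(graph)

roots = tracer.find_root_causes("evt_789")
print(roots.narrative)              # human-readable root-cause summary
print(roots.deepest_depth)          # depth of the longest causal chain found

impact = tracer.analyze_impact("evt_789")
print(impact.total_impact_count, impact.severity_score)

forecast = tracer.predict_cascade("evt_789")
for pred in forecast.predicted_effects:      # at most the top 10
    print(pred["event_type"], pred["probability"], pred["historical_count"])
print(forecast.intervention_points)          # e.g. ["Apply gradient clipping"]
```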
cascade/bridge.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ HuggingFace → IPFS Bridge
3
+
4
+ Makes every CASCADE instance a node in the IPFS network.
5
+ Serves lattice content to DHT without running a full daemon.
6
+
7
+ Uses js-ipfs HTTP API compatible endpoints via ipfs-http-client.
8
+ For HF Spaces, we use Helia (browser/Node IPFS) style serving.
9
+ """
10
+
11
+ import json
12
+ import hashlib
13
+ from pathlib import Path
14
+ from typing import Optional, Dict, Any
15
+ import threading
16
+ import time
17
+
18
+ # Optional: for full IPFS integration
19
+ try:
20
+ import ipfshttpclient
21
+ HAS_IPFS_CLIENT = True
22
+ except ImportError:
23
+ HAS_IPFS_CLIENT = False
24
+
25
+ from cascade.ipld import chain_to_ipld, chain_to_cid, encode_to_dag_cbor
26
+
27
+
28
+ class LatticeServer:
29
+ """
30
+ Serves lattice content over IPFS-compatible protocols.
31
+
32
+ Can run in multiple modes:
33
+ 1. Gateway mode: HTTP endpoints that mirror IPFS gateway API
34
+ 2. DHT mode: Announce content to IPFS DHT (needs daemon)
35
+ 3. Hybrid: Both
36
+ """
37
+
38
+ def __init__(self, lattice_dir: Path = None):
39
+ if lattice_dir is None:
40
+ # Try relative to this file first, then cwd
41
+ candidate = Path(__file__).resolve().parent.parent / "lattice"
42
+ if not candidate.exists():
43
+ candidate = Path.cwd() / "lattice"
44
+ self.lattice_dir = candidate
45
+ else:
46
+ self.lattice_dir = lattice_dir
47
+ self.ipld_dir = self.lattice_dir / "ipld"
48
+ self._index: Dict[str, Path] = {} # CID -> file path
49
+ self._build_index()
50
+
51
+ def _build_index(self):
52
+ """Index all known CIDs to their local files."""
53
+ # Index CBOR files
54
+ if self.ipld_dir.exists():
55
+ for cbor_file in self.ipld_dir.glob("*.cbor"):
56
+ ipld_json = cbor_file.with_suffix(".ipld.json")
57
+ if ipld_json.exists():
58
+ meta = json.loads(ipld_json.read_text())
59
+ # Try both 'cid' and '_cid' keys
60
+ cid = meta.get("cid") or meta.get("_cid")
61
+ if cid:
62
+ self._index[cid] = cbor_file
63
+
64
+ # Index JSON chain files (compute CID on the fly)
65
+ for json_file in self.lattice_dir.glob("*.json"):
66
+ if json_file.name == "README.md":
67
+ continue
68
+ try:
69
+ chain_data = json.loads(json_file.read_text())
70
+ cid = chain_to_cid(chain_data)
71
+ self._index[cid] = json_file
72
+ except:
73
+ pass
74
+
75
+ print(f"Indexed {len(self._index)} CIDs")
76
+
77
+ def resolve(self, cid: str) -> Optional[bytes]:
78
+ """Resolve a CID to its content."""
79
+ if cid in self._index:
80
+ filepath = self._index[cid]
81
+ if filepath.suffix == ".cbor":
82
+ return filepath.read_bytes()
83
+ else:
84
+ # JSON file - return as CBOR for consistency
85
+ chain_data = json.loads(filepath.read_text())
86
+ ipld_data = chain_to_ipld(chain_data)
87
+ return encode_to_dag_cbor(ipld_data)
88
+ return None
89
+
90
+ def list_cids(self) -> list:
91
+ """List all available CIDs."""
92
+ return list(self._index.keys())
93
+
94
+ def get_gateway_response(self, cid: str) -> tuple:
95
+ """
96
+ Return (content, content_type, status_code) for gateway-style serving.
97
+ """
98
+ content = self.resolve(cid)
99
+ if content:
100
+ return (content, "application/cbor", 200)
101
+ return (b"CID not found", "text/plain", 404)
102
+
103
+ def announce_to_dht(self, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001"):
104
+ """
105
+ Announce all CIDs to IPFS DHT.
106
+ Requires running IPFS daemon.
107
+ """
108
+ if not HAS_IPFS_CLIENT:
109
+ print("ipfshttpclient not installed. Run: pip install ipfshttpclient")
110
+ return
111
+
112
+ try:
113
+ client = ipfshttpclient.connect(ipfs_api)
114
+ except Exception as e:
115
+ print(f"Could not connect to IPFS daemon: {e}")
116
+ print("Start daemon with: ipfs daemon")
117
+ return
118
+
119
+ for cid, filepath in self._index.items():
120
+ try:
121
+ # Add file to local IPFS node
122
+ if filepath.suffix == ".cbor":
123
+ result = client.add(str(filepath))
124
+ print(f"Announced {filepath.name}: {result['Hash']}")
125
+ except Exception as e:
126
+ print(f"Failed to announce {cid}: {e}")
127
+
128
+ def start_gateway(self, host: str = "0.0.0.0", port: int = 8080):
129
+ """
130
+ Start a simple HTTP gateway for serving lattice content.
131
+
132
+ Compatible with IPFS gateway URL format:
133
+ GET /ipfs/{cid}
134
+ """
135
+ from http.server import HTTPServer, BaseHTTPRequestHandler
136
+
137
+ server = self
138
+
139
+ class GatewayHandler(BaseHTTPRequestHandler):
140
+ def do_GET(self):
141
+ # Parse /ipfs/{cid} or just /{cid}
142
+ path = self.path.strip("/")
143
+ if path.startswith("ipfs/"):
144
+ cid = path[5:]
145
+ else:
146
+ cid = path
147
+
148
+ content, content_type, status = server.get_gateway_response(cid)
149
+
150
+ self.send_response(status)
151
+ self.send_header("Content-Type", content_type)
152
+ self.send_header("Content-Length", len(content))
153
+ self.send_header("Access-Control-Allow-Origin", "*")
154
+ self.end_headers()
155
+ self.wfile.write(content)
156
+
157
+ def do_HEAD(self):
158
+ path = self.path.strip("/")
159
+ if path.startswith("ipfs/"):
160
+ cid = path[5:]
161
+ else:
162
+ cid = path
163
+
164
+ _, content_type, status = server.get_gateway_response(cid)
165
+
166
+ self.send_response(status)
167
+ self.send_header("Content-Type", content_type)
168
+ self.send_header("Access-Control-Allow-Origin", "*")
169
+ self.end_headers()
170
+
171
+ def log_message(self, format, *args):
172
+ print(f"[Gateway] {args[0]}")
173
+
174
+ httpd = HTTPServer((host, port), GatewayHandler)
175
+ print(f"Lattice gateway running at http://{host}:{port}")
176
+ print(f"Serving {len(self._index)} CIDs")
177
+ print(f"\nTry: http://localhost:{port}/ipfs/bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei")
178
+ httpd.serve_forever()
179
+
180
+
181
+ def create_gradio_gateway():
182
+ """
183
+ Create a Gradio interface that serves as IPFS gateway.
184
+ Suitable for HuggingFace Spaces deployment.
185
+ """
186
+ try:
187
+ import gradio as gr
188
+ except ImportError:
189
+ print("Gradio not installed. Run: pip install gradio")
190
+ return None
191
+
192
+ server = LatticeServer()
193
+
194
+ def resolve_cid(cid: str) -> str:
195
+ """Resolve CID and return content as hex + JSON decode attempt."""
196
+ content = server.resolve(cid.strip())
197
+ if content is None:
198
+ return f"❌ CID not found: {cid}\n\nAvailable CIDs:\n" + "\n".join(server.list_cids())
199
+
200
+ # Try to decode as CBOR → JSON for display
201
+ try:
202
+ import dag_cbor
203
+ decoded = dag_cbor.decode(content)
204
+ return f"✓ Found! ({len(content)} bytes)\n\n{json.dumps(decoded, indent=2, default=str)}"
205
+ except:
206
+ return f"✓ Found! ({len(content)} bytes)\n\nRaw hex: {content.hex()[:200]}..."
207
+
208
+ def list_all() -> str:
209
+ """List all available CIDs."""
210
+ cids = server.list_cids()
211
+ lines = [f"=== Lattice Index ({len(cids)} chains) ===\n"]
212
+ for cid in cids:
213
+ filepath = server._index[cid]
214
+ lines.append(f"• {filepath.stem}")
215
+ lines.append(f" {cid}\n")
216
+ return "\n".join(lines)
217
+
218
+ with gr.Blocks(title="CASCADE Lattice Gateway") as app:
219
+ gr.Markdown("# 🌐 CASCADE Lattice Gateway")
220
+ gr.Markdown("*The neural internetwork, content-addressed.*")
221
+
222
+ with gr.Tab("Resolve CID"):
223
+ cid_input = gr.Textbox(
224
+ label="CID",
225
+ placeholder="bafyrei...",
226
+ value="bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"
227
+ )
228
+ resolve_btn = gr.Button("Resolve")
229
+ output = gr.Textbox(label="Content", lines=20)
230
+ resolve_btn.click(resolve_cid, inputs=cid_input, outputs=output)
231
+
232
+ with gr.Tab("Browse Lattice"):
233
+ list_btn = gr.Button("List All CIDs")
234
+ list_output = gr.Textbox(label="Available Chains", lines=20)
235
+ list_btn.click(list_all, outputs=list_output)
236
+
237
+ gr.Markdown("""
238
+ ---
239
+ **What is this?**
240
+
241
+ This gateway serves the CASCADE lattice — a cryptographic provenance network for AI agents.
242
+
243
+ Every chain has a CID (Content IDentifier). Same content = same CID. Forever.
244
+
245
+ - **Genesis**: `bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei`
246
+ - Protocol: [IPLD](https://ipld.io/) (InterPlanetary Linked Data)
247
+ """)
248
+
249
+ return app
250
+
251
+
252
+ if __name__ == "__main__":
253
+ import sys
254
+
255
+ if "--gradio" in sys.argv:
256
+ app = create_gradio_gateway()
257
+ if app:
258
+ app.launch()
259
+ elif "--announce" in sys.argv:
260
+ server = LatticeServer()
261
+ server.announce_to_dht()
262
+ else:
263
+ # Default: run HTTP gateway
264
+ server = LatticeServer()
265
+ server.start_gateway(port=8080)
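The gateway above serves `GET /ipfs/{cid}` with DAG-CBOR bytes. A small client-side sketch, assuming the gateway is already running locally (for example via `python -m cascade.bridge`, which defaults to port 8080) and using the genesis CID that the module itself prints:

```python
# Sketch: fetch one chain from a locally running LatticeServer gateway.
# Assumes the gateway was started separately and is listening on :8080.
import urllib.request

cid = "bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"  # genesis
with urllib.request.urlopen(f"http://localhost:8080/ipfs/{cid}") as resp:
    content_type = resp.headers.get("Content-Type")  # "application/cbor"
    payload = resp.read()

print(content_type, len(payload), "bytes")

# Decode if the optional dag_cbor package is installed, mirroring the
# Gradio resolver above; otherwise just show a hex preview.
try:
    import dag_cbor
    print(dag_cbor.decode(payload))
except ImportError:
    print(payload.hex()[:80], "...")
```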
cascade/cli_main.py ADDED
@@ -0,0 +1,851 @@
1
+ """
2
+ CASCADE CLI - Full-featured Rich TUI for cascade-ai.
3
+
4
+ Exposes all CASCADE capabilities:
5
+ - Lattice: stats, list, inspect, chains, pin, export, watch
6
+ - Model: observe, fingerprint
7
+ - Data: entities, provenance, pii scan
8
+ - System: logs, analyze, ingest
9
+ - Proxy: start intercepting proxy
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ import json
15
+ from pathlib import Path
16
+ from datetime import datetime
17
+
18
+ # Rich imports with fallback
19
+ try:
20
+ from rich.console import Console
21
+ from rich.table import Table
22
+ from rich.panel import Panel
23
+ from rich.tree import Tree
24
+ from rich.progress import Progress, SpinnerColumn, TextColumn
25
+ from rich.text import Text
26
+ from rich.markdown import Markdown
27
+ from rich.syntax import Syntax
28
+ from rich import box
29
+ HAS_RICH = True
30
+ except ImportError:
31
+ HAS_RICH = False
32
+
33
+ console = Console() if HAS_RICH else None
34
+
35
+
36
+ # ═══════════════════════════════════════════════════════════════════════════════
37
+ # LATTICE COMMANDS
38
+ # ═══════════════════════════════════════════════════════════════════════════════
39
+
40
+ def cmd_stats(args):
41
+ """Show lattice statistics with Rich panels."""
42
+ from cascade.observation import ObservationManager
43
+
44
+ manager = ObservationManager()
45
+ stats = manager.get_stats()
46
+
47
+ if HAS_RICH:
48
+ stats_table = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
49
+ stats_table.add_column("Key", style="cyan")
50
+ stats_table.add_column("Value", style="green")
51
+
52
+ stats_table.add_row("Genesis Root", f"[bold magenta]{stats['genesis_root']}[/]")
53
+ stats_table.add_row("", "")
54
+ stats_table.add_row("Total Observations", str(stats['total_observations']))
55
+ stats_table.add_row(" └─ Model", str(stats['model_observations']))
56
+ stats_table.add_row(" └─ Data", str(stats['data_observations']))
57
+ stats_table.add_row(" └─ System", str(stats['system_observations']))
58
+ stats_table.add_row("", "")
59
+ stats_table.add_row("Registered Models", str(stats['registered_models']))
60
+ stats_table.add_row("Unique Models Observed", str(stats['unique_models']))
61
+
62
+ panel = Panel(
63
+ stats_table,
64
+ title="[bold cyan]CASCADE LATTICE[/]",
65
+ subtitle="[dim]The Neural Internetwork[/]",
66
+ border_style="cyan",
67
+ )
68
+ console.print(panel)
69
+ else:
70
+ print(f"""
71
+ CASCADE LATTICE STATS
72
+ ═════════════════════
73
+ Genesis Root: {stats['genesis_root']}
74
+
75
+ Observations:
76
+ Total: {stats['total_observations']}
77
+ Model: {stats['model_observations']}
78
+ Data: {stats['data_observations']}
79
+ System: {stats['system_observations']}
80
+
81
+ Models:
82
+ Registered: {stats['registered_models']}
83
+ Observed: {stats['unique_models']}
84
+ """)
85
+
86
+
87
+ def cmd_list(args):
88
+ """List recent observations."""
89
+ from cascade.observation import ObservationManager
90
+
91
+ manager = ObservationManager()
92
+ observations = manager.list_observations(limit=args.limit)
93
+
94
+ if not observations:
95
+ if HAS_RICH:
96
+ console.print("[yellow]No observations yet.[/]")
97
+ else:
98
+ print("No observations yet.")
99
+ return
100
+
101
+ if HAS_RICH:
102
+ table = Table(title=f"Recent Observations", box=box.ROUNDED)
103
+ table.add_column("Type", style="cyan", width=8)
104
+ table.add_column("Source", style="white", max_width=40)
105
+ table.add_column("Merkle Root", style="magenta")
106
+ table.add_column("Time", style="dim")
107
+
108
+ for obs in observations:
109
+ obs_type = obs.get('observation_type', '?')[:7]
110
+ source = obs.get('source_id', 'unknown')[:39]
111
+ merkle = obs.get('merkle_root', '?')[:16]
112
+ timestamp = obs.get('timestamp', '')
113
+ if timestamp:
114
+ try:
115
+ if isinstance(timestamp, (int, float)):
116
+ timestamp = datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')
117
+ else:
118
+ timestamp = str(timestamp)[:8]
119
+ except Exception:
120
+ timestamp = '?'
121
+
122
+ table.add_row(obs_type, source, merkle, timestamp)
123
+
124
+ console.print(table)
125
+ console.print(f"[dim]Showing {len(observations)} of {manager.get_stats()['total_observations']}[/]")
126
+ else:
127
+ print(f"\n{'TYPE':<8} {'SOURCE':<40} {'MERKLE ROOT':<20}")
128
+ print("─" * 70)
129
+ for obs in observations:
130
+ print(f"{obs.get('observation_type', '?')[:7]:<8} {obs.get('source_id', '?')[:39]:<40} {obs.get('merkle_root', '?')[:19]:<20}")
131
+
132
+
133
+ def cmd_inspect(args):
134
+ """Inspect a specific observation by merkle root."""
135
+ from cascade.observation import ObservationManager
136
+
137
+ manager = ObservationManager()
138
+ obs = manager.get_observation(args.root)
139
+
140
+ if not obs:
141
+ if HAS_RICH:
142
+ console.print(f"[red]Observation not found:[/] {args.root}")
143
+ else:
144
+ print(f"Observation not found: {args.root}")
145
+ return
146
+
147
+ if HAS_RICH:
148
+ tree = Tree(f"[bold magenta]{args.root}[/]")
149
+
150
+ for key, value in obs.items():
151
+ if isinstance(value, dict):
152
+ branch = tree.add(f"[cyan]{key}[/]")
153
+ for k, v in value.items():
154
+ branch.add(f"[dim]{k}:[/] {v}")
155
+ elif isinstance(value, list):
156
+ branch = tree.add(f"[cyan]{key}[/] ({len(value)} items)")
157
+ for item in value[:5]:
158
+ branch.add(str(item)[:60])
159
+ if len(value) > 5:
160
+ branch.add(f"[dim]... and {len(value) - 5} more[/]")
161
+ else:
162
+ tree.add(f"[cyan]{key}:[/] {value}")
163
+
164
+ console.print(Panel(tree, title="Observation Details", border_style="magenta"))
165
+ else:
166
+ print(json.dumps(obs, indent=2, default=str))
167
+
168
+
169
+ def cmd_chains(args):
170
+ """List all chains in the lattice."""
171
+ from cascade.viz.lattice_gateway import load_lattice_data
172
+
173
+ data = load_lattice_data()
174
+ chains = data.get('chains', [])
175
+
176
+ if HAS_RICH:
177
+ table = Table(title="Lattice Chains", box=box.ROUNDED)
178
+ table.add_column("Name", style="cyan")
179
+ table.add_column("Merkle Root", style="magenta")
180
+ table.add_column("Records", justify="right")
181
+ table.add_column("CID", style="dim")
182
+
183
+ for chain in chains:
184
+ name = chain.get('name', '?')
185
+ root = chain.get('merkle_root', '?')[:16]
186
+ records = len(chain.get('records', {}))
187
+ cid = chain.get('cid', 'Not pinned')
188
+ if cid and cid != 'Not pinned':
189
+ cid = cid[:20] + '...'
190
+
191
+ style = "bold green" if name == 'genesis' else None
192
+ table.add_row(name, root, str(records), cid, style=style)
193
+
194
+ console.print(table)
195
+ console.print(f"\n[dim]Genesis: {data.get('genesis_root', 'N/A')}[/]")
196
+ else:
197
+ print(f"Chains in lattice: {len(chains)}")
198
+ for chain in chains:
199
+ print(f" {chain.get('name')}: {chain.get('merkle_root', '?')[:16]} ({len(chain.get('records', {}))} records)")
200
+
201
+
202
+ def cmd_pin(args):
203
+ """Pin observation to IPFS."""
204
+ from cascade.observation import ObservationManager
205
+
206
+ manager = ObservationManager()
207
+ obs = manager.get_observation(args.root)
208
+
209
+ if not obs:
210
+ if HAS_RICH:
211
+ console.print(f"[red]Observation not found:[/] {args.root}")
212
+ else:
213
+ print(f"Observation not found: {args.root}")
214
+ return
215
+
216
+ if HAS_RICH:
217
+ with console.status("[cyan]Pinning to IPFS...[/]"):
218
+ cid = manager.pin_to_ipfs(obs)
219
+
220
+ if cid:
221
+ console.print(f"[green]✓ Pinned to IPFS[/]")
222
+ console.print(f" CID: [magenta]{cid}[/]")
223
+ console.print(f" URL: https://storacha.link/ipfs/{cid}")
224
+ else:
225
+ console.print("[red]✗ Failed to pin[/]")
226
+ else:
227
+ print(f"Pinning {args.root}...")
228
+ cid = manager.pin_to_ipfs(obs)
229
+ if cid:
230
+ print(f"✓ Pinned: {cid}")
231
+ else:
232
+ print("✗ Failed")
233
+
234
+
235
+ def cmd_export(args):
236
+ """Export lattice or chain to file."""
237
+ from cascade.viz.lattice_gateway import load_lattice_data
238
+
239
+ data = load_lattice_data()
240
+
241
+ if args.chain:
242
+ chains = [c for c in data.get('chains', []) if c['name'] == args.chain]
243
+ if not chains:
244
+ msg = f"Chain not found: {args.chain}"
245
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
246
+ return
247
+ export_data = chains[0]
248
+ else:
249
+ export_data = data
250
+
251
+ output = Path(args.output)
252
+ output.write_text(json.dumps(export_data, indent=2, default=str))
253
+
254
+ msg = f"✓ Exported to {output}"
255
+ console.print(f"[green]{msg}[/]") if HAS_RICH else print(msg)
256
+
257
+
258
+ def cmd_watch(args):
259
+ """Watch live observations in real-time."""
260
+ from cascade.observation import ObservationManager
261
+ import time
262
+
263
+ manager = ObservationManager()
264
+ last_count = 0
265
+
266
+ if HAS_RICH:
267
+ console.print("[cyan]Watching for observations... (Ctrl+C to stop)[/]\n")
268
+ else:
269
+ print("Watching... (Ctrl+C to stop)")
270
+
271
+ try:
272
+ while True:
273
+ stats = manager.get_stats()
274
+ current = stats['total_observations']
275
+
276
+ if current > last_count:
277
+ new_obs = manager.list_observations(limit=current - last_count)
278
+ for obs in reversed(new_obs):
279
+ if HAS_RICH:
280
+ console.print(
281
+ f"[green]●[/] [{datetime.now().strftime('%H:%M:%S')}] "
282
+ f"[cyan]{obs.get('observation_type', '?')}[/] "
283
+ f"[white]{obs.get('source_id', '?')[:40]}[/] "
284
+ f"[magenta]{obs.get('merkle_root', '?')[:16]}[/]"
285
+ )
286
+ else:
287
+ print(f"● {obs.get('observation_type', '?')} {obs.get('merkle_root', '?')[:16]}")
288
+ last_count = current
289
+
290
+ time.sleep(1)
291
+ except KeyboardInterrupt:
292
+ msg = "\nStopped watching."
293
+ console.print(f"[yellow]{msg}[/]") if HAS_RICH else print(msg)
294
+
295
+
296
+ # ═══════════════════════════════════════════════════════════════════════════════
297
+ # MODEL COMMANDS
298
+ # ═══════════════════════════════════════════════════════════════════════════════
299
+
300
+ def cmd_observe(args):
301
+ """Manually observe a model interaction."""
302
+ from cascade import observe
303
+
304
+ result = observe(
305
+ model_id=args.model,
306
+ input_data=args.input,
307
+ output_data=args.output,
308
+ observation_type='model',
309
+ )
310
+
311
+ if HAS_RICH:
312
+ console.print(f"[green]✓ Observed[/]")
313
+ console.print(f" Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
314
+ else:
315
+ print(f"Observed: {result.get('merkle_root', 'N/A')}")
316
+
317
+
318
+ def cmd_fingerprint(args):
319
+ """Generate model fingerprint."""
320
+ try:
321
+ from cascade.forensics.fingerprints import ModelFingerprinter
322
+
323
+ if HAS_RICH:
324
+ with console.status(f"[cyan]Fingerprinting {args.model}...[/]"):
325
+ fp = ModelFingerprinter()
326
+ result = fp.fingerprint(args.model)
327
+
328
+ if result:
329
+ table = Table(title=f"Fingerprint: {args.model}", box=box.ROUNDED)
330
+ table.add_column("Property", style="cyan")
331
+ table.add_column("Value", style="white")
332
+
333
+ for key, value in result.items():
334
+ if isinstance(value, dict):
335
+ value = json.dumps(value)[:50] + '...'
336
+ table.add_row(str(key), str(value)[:60])
337
+
338
+ console.print(table)
339
+ else:
340
+ console.print("[yellow]Could not fingerprint model[/]")
341
+ else:
342
+ fp = ModelFingerprinter()
343
+ result = fp.fingerprint(args.model)
344
+ print(json.dumps(result, indent=2, default=str))
345
+ except Exception as e:
346
+ msg = f"Error: {e}"
347
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
348
+
349
+
350
+ # ═══════════════════════════════════════════════════════════════════════════════
351
+ # DATA COMMANDS
352
+ # ═══════════════════════════════════════════════════════════════════════════════
353
+
354
+ def cmd_entities(args):
355
+ """Run entity resolution on a file."""
356
+ try:
357
+ from cascade.data.entities import EntityResolver
358
+
359
+ if HAS_RICH:
360
+ with console.status(f"[cyan]Resolving entities in {args.file}...[/]"):
361
+ resolver = EntityResolver()
362
+ result = resolver.resolve_file(args.file)
363
+
364
+ if result:
365
+ console.print(f"[green]✓ Found {len(result)} entities[/]")
366
+
367
+ table = Table(box=box.SIMPLE)
368
+ table.add_column("Entity", style="cyan")
369
+ table.add_column("Type", style="magenta")
370
+ table.add_column("Count", justify="right")
371
+
372
+ for entity in result[:20]:
373
+ table.add_row(
374
+ str(entity.get('name', '?'))[:30],
375
+ entity.get('type', '?'),
376
+ str(entity.get('count', 1))
377
+ )
378
+
379
+ console.print(table)
380
+ if len(result) > 20:
381
+ console.print(f"[dim]... and {len(result) - 20} more[/]")
382
+ else:
383
+ resolver = EntityResolver()
384
+ result = resolver.resolve_file(args.file)
385
+ print(f"Found {len(result)} entities")
386
+ except Exception as e:
387
+ msg = f"Error: {e}"
388
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
389
+
390
+
391
+ def cmd_pii(args):
392
+ """Scan for PII in a file."""
393
+ try:
394
+ from cascade.data.pii import PIIScanner
395
+
396
+ if HAS_RICH:
397
+ with console.status(f"[cyan]Scanning {args.file} for PII...[/]"):
398
+ scanner = PIIScanner()
399
+ results = scanner.scan_file(args.file)
400
+
401
+ if results:
402
+ console.print(f"[yellow]⚠ Found {len(results)} potential PII instances[/]")
403
+
404
+ table = Table(box=box.ROUNDED)
405
+ table.add_column("Type", style="red")
406
+ table.add_column("Value", style="yellow")
407
+ table.add_column("Location", style="dim")
408
+
409
+ for pii in results[:20]:
410
+ val = pii.get('value', '?')
411
+ table.add_row(
412
+ pii.get('type', '?'),
413
+ val[:30] + '...' if len(val) > 30 else val,
414
+ str(pii.get('location', '?'))
415
+ )
416
+
417
+ console.print(table)
418
+ else:
419
+ console.print("[green]✓ No PII detected[/]")
420
+ else:
421
+ scanner = PIIScanner()
422
+ results = scanner.scan_file(args.file)
423
+ print(f"Found {len(results)} PII instances")
424
+ except Exception as e:
425
+ msg = f"Error: {e}"
426
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
427
+
428
+
429
+ def cmd_provenance(args):
430
+ """Show data provenance for a file/dataset."""
431
+ try:
432
+ from cascade.data.provenance import DataProvenance
433
+
434
+ if HAS_RICH:
435
+ with console.status(f"[cyan]Analyzing provenance...[/]"):
436
+ prov = DataProvenance()
437
+ result = prov.analyze(args.path)
438
+
439
+ if result:
440
+ tree = Tree(f"[bold cyan]{args.path}[/]")
441
+
442
+ if 'hash' in result:
443
+ tree.add(f"[magenta]Hash:[/] {result['hash']}")
444
+ if 'sources' in result:
445
+ sources = tree.add("[cyan]Sources[/]")
446
+ for src in result['sources']:
447
+ sources.add(str(src))
448
+ if 'transformations' in result:
449
+ transforms = tree.add("[cyan]Transformations[/]")
450
+ for t in result['transformations']:
451
+ transforms.add(str(t))
452
+
453
+ console.print(Panel(tree, title="Data Provenance", border_style="cyan"))
454
+ else:
455
+ prov = DataProvenance()
456
+ result = prov.analyze(args.path)
457
+ print(json.dumps(result, indent=2, default=str))
458
+ except Exception as e:
459
+ msg = f"Error: {e}"
460
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
461
+
462
+
463
+ # ═══════════════════════════════════════════════════════════════════════════════
464
+ # SYSTEM COMMANDS
465
+ # ═══════════════════════════════════════════════════════════════════════════════
466
+
467
+ def cmd_ingest(args):
468
+ """Ingest logs/files into the lattice."""
469
+ try:
470
+ from cascade.system.repo_ingester import RepoIngester
471
+
472
+ if HAS_RICH:
473
+ with console.status(f"[cyan]Ingesting {args.path}...[/]"):
474
+ ingester = RepoIngester()
475
+ result = ingester.ingest(args.path)
476
+
477
+ console.print(f"[green]✓ Ingested[/]")
478
+ console.print(f" Files: {result.get('files', 0)}")
479
+ console.print(f" Observations: {result.get('observations', 0)}")
480
+ console.print(f" Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
481
+ else:
482
+ ingester = RepoIngester()
483
+ result = ingester.ingest(args.path)
484
+ print(f"Ingested: {result}")
485
+ except Exception as e:
486
+ msg = f"Error: {e}"
487
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
488
+
489
+
490
+ def cmd_analyze(args):
491
+ """Analyze a log file or folder."""
492
+ try:
493
+ from cascade.system.omnidirectional_analyzer import OmnidirectionalAnalyzer
494
+
495
+ if HAS_RICH:
496
+ with console.status(f"[cyan]Analyzing {args.path}...[/]"):
497
+ analyzer = OmnidirectionalAnalyzer()
498
+ result = analyzer.analyze(args.path)
499
+
500
+ if result:
501
+ console.print(Panel(
502
+ Syntax(json.dumps(result, indent=2, default=str), "json"),
503
+ title="Analysis Result",
504
+ border_style="cyan"
505
+ ))
506
+ else:
507
+ analyzer = OmnidirectionalAnalyzer()
508
+ result = analyzer.analyze(args.path)
509
+ print(json.dumps(result, indent=2, default=str))
510
+ except Exception as e:
511
+ msg = f"Error: {e}"
512
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
513
+
514
+
515
+ # ═══════════════════════════════════════════════════════════════════════════════
516
+ # PROXY & INIT
517
+ # ═══════════════════════════════════════════════════════════════════════════════
518
+
519
+ def cmd_proxy(args):
520
+ """Start the CASCADE proxy server."""
521
+ if HAS_RICH:
522
+ console.print(Panel(
523
+ f"""[cyan]CASCADE Proxy Server[/]
524
+
525
+ Listening on [bold]{args.host}:{args.port}[/]
526
+
527
+ Set these environment variables in your app:
528
+ [green]
529
+ OPENAI_BASE_URL=http://localhost:{args.port}/v1
530
+ ANTHROPIC_BASE_URL=http://localhost:{args.port}/anthropic
531
+ [/]
532
+ Press Ctrl+C to stop.""",
533
+ title="🌐 Proxy Mode",
534
+ border_style="cyan",
535
+ ))
536
+ else:
537
+ print(f"CASCADE Proxy on {args.host}:{args.port}")
538
+
539
+ from cascade.proxy import run_proxy
540
+ run_proxy(host=args.host, port=args.port, verbose=not args.quiet)
541
+
542
+
543
+ def cmd_init(args):
544
+ """Show initialization instructions."""
545
+ if HAS_RICH:
546
+ md = """
547
+ # CASCADE Setup
548
+
549
+ ## Option 1: Auto-Patch (Python)
550
+ ```python
551
+ import cascade
552
+ cascade.init()
553
+
554
+ # Now every call emits a receipt
555
+ from openai import OpenAI
556
+ client = OpenAI()
557
+ client.chat.completions.create(...) # ← automatically observed
558
+ ```
559
+
560
+ ## Option 2: Proxy Mode (Any Language)
561
+ ```bash
562
+ cascade proxy --port 7777
563
+ ```
564
+ Then set environment variables:
565
+ ```bash
566
+ export OPENAI_BASE_URL=http://localhost:7777/v1
567
+ export ANTHROPIC_BASE_URL=http://localhost:7777/anthropic
568
+ ```
569
+
570
+ ## Option 3: Manual Observation
571
+ ```python
572
+ from cascade import observe
573
+ observe(model_id="my-model", input_data="prompt", output_data="response")
574
+ ```
575
+
576
+ ---
577
+ **Genesis Root:** `89f940c1a4b7aa65`
578
+ """
579
+ console.print(Panel(Markdown(md), title="[bold cyan]CASCADE[/]", border_style="cyan"))
580
+ else:
581
+ print("""
582
+ CASCADE - Universal AI Provenance Layer
583
+
584
+ OPTION 1: Auto-Patch (Python)
585
+ import cascade
586
+ cascade.init()
587
+
588
+ OPTION 2: Proxy Mode (Any Language)
589
+ cascade proxy
590
+ export OPENAI_BASE_URL=http://localhost:7777/v1
591
+
592
+ OPTION 3: Manual
593
+ from cascade import observe
594
+ observe(model_id="...", input_data="...", output_data="...")
595
+ """)
596
+
597
+
598
+ def cmd_version(args):
599
+ """Show version."""
600
+ try:
601
+ from cascade import __version__
602
+ version = __version__
603
+ except Exception:
604
+ version = "0.1.1"
605
+
606
+ if HAS_RICH:
607
+ console.print(f"[cyan]cascade-ai[/] [bold]{version}[/]")
608
+ console.print(f"[dim]Genesis: 89f940c1a4b7aa65[/]")
609
+ else:
610
+ print(f"cascade-ai {version}")
611
+
612
+
613
+ # ═══════════════════════════════════════════════════════════════════════════════
614
+ # HOLD COMMANDS - Inference-Level Halt Protocol
615
+ # ═══════════════════════════════════════════════════════════════════════════════
616
+
617
+ def cmd_hold_status(args):
618
+ """Show HOLD system status."""
619
+ try:
620
+ from cascade.hold import Hold
621
+ hold = Hold.get()
622
+
623
+ if HAS_RICH:
624
+ from rich.table import Table
625
+
626
+ table = Table(title="🛑 HOLD System Status", box=box.SIMPLE)
627
+ table.add_column("Property", style="cyan")
628
+ table.add_column("Value", style="green")
629
+
630
+ table.add_row("Hold Count", str(hold._hold_count))
631
+ table.add_row("Override Count", str(hold._override_count))
632
+ table.add_row("Timeout", f"{hold.timeout}s")
633
+ table.add_row("Auto Accept", str(hold.auto_accept))
634
+ table.add_row("Listeners", str(len(hold._listeners)))
635
+ table.add_row("Last Merkle", hold._last_merkle or "None")
636
+ table.add_row("Current Hold", "Active" if hold._current_hold else "None")
637
+
638
+ console.print(table)
639
+ else:
640
+ print(f"HOLD Count: {hold._hold_count}")
641
+ print(f"Override Count: {hold._override_count}")
642
+ print(f"Timeout: {hold.timeout}s")
643
+ print(f"Listeners: {len(hold._listeners)}")
644
+ except Exception as e:
645
+ if HAS_RICH:
646
+ console.print(f"[red]Error: {e}[/]")
647
+ else:
648
+ print(f"Error: {e}")
649
+
650
+
651
+ def cmd_hold_info(args):
652
+ """Show HOLD usage information."""
653
+ info = """
654
+ 🛑 HOLD - Inference-Level Halt Protocol
655
+
656
+ HOLD pauses AI inference so humans can observe and intervene.
657
+
658
+ USAGE IN YOUR CODE:
659
+ from cascade.hold import Hold
660
+
661
+ hold = Hold.get()
662
+
663
+ # In your inference loop:
664
+ probs = model.predict(observation)
665
+
666
+ resolution = hold.yield_point(
667
+ action_probs=probs,
668
+ value=value_estimate,
669
+ observation=obs,
670
+ brain_id="my_model",
671
+ # Optional informational wealth:
672
+ action_labels=["up", "down", "left", "right"],
673
+ latent=model.latent,
674
+ attention=model.attention,
675
+ features=model.features,
676
+ imagination=model.imagine(),
677
+ )
678
+
679
+ action = resolution.action # Final action (AI or override)
680
+ was_override = resolution.was_override # True if human intervened
681
+
682
+ REGISTERING LISTENERS:
683
+ def my_handler(hold_point):
684
+ print(f"HOLD: {hold_point.action_probs}")
685
+ # Send to UI, game engine, logger, etc.
686
+
687
+ hold.register_listener(my_handler)
688
+
689
+ RESOLVING HOLDS:
690
+ hold.resolve(action=3, source="human") # Override with action 3
691
+ hold.accept() # Accept AI's choice
692
+ """
693
+ if HAS_RICH:
694
+ console.print(Panel(info, title="[bold red]HOLD[/]", border_style="red"))
695
+ else:
696
+ print(info)
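# Illustrative sketch only (not part of this commit): a minimal agent loop wired
# into HOLD using just the API surface documented above. The fake model outputs,
# the brain_id, and the assumption that plain Python lists are accepted for
# action_probs/observation are hypothetical; depending on configuration,
# yield_point() may block until a listener resolves the hold, auto-accept kicks
# in, or the timeout elapses. See cascade/hold/ for the authoritative signatures.
from cascade.hold import Hold

hold = Hold.get()

def log_hold(hold_point):
    # Called whenever inference pauses; forward to a UI, game engine, or logger.
    print(f"HOLD raised, action_probs={hold_point.action_probs}")

hold.register_listener(log_hold)

for obs in ([0.1, 0.9], [0.7, 0.3]):               # stand-in observations
    probs = [0.25, 0.25, 0.25, 0.25]               # stand-in policy output
    resolution = hold.yield_point(
        action_probs=probs,
        value=0.0,
        observation=obs,
        brain_id="sketch_agent",
    )
    action = resolution.action                     # AI's choice, or the override
    if resolution.was_override:
        print(f"human overrode with action {action}")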
697
+
698
+
699
+ # ═══════════════════════════════════════════════════════════════════════════════
700
+ # MAIN
701
+ # ═══════════════════════════════════════════════════════════════════════════════
702
+
703
+ def main():
704
+ """Main CLI entry point."""
705
+ parser = argparse.ArgumentParser(
706
+ prog="cascade",
707
+ description="CASCADE - Universal AI Provenance Layer",
708
+ formatter_class=argparse.RawDescriptionHelpFormatter,
709
+ epilog="""
710
+ Examples:
711
+ cascade stats Show lattice statistics
712
+ cascade list -n 20 List recent observations
713
+ cascade chains List all chains
714
+ cascade inspect <root> Inspect an observation
715
+ cascade watch Live observation feed
716
+ cascade proxy Start proxy server
717
+ cascade fingerprint <model> Fingerprint a model
718
+ cascade pii <file> Scan file for PII
719
+ cascade ingest <path> Ingest logs/files
720
+ """
721
+ )
722
+ parser.add_argument("--version", "-v", action="store_true", help="Show version")
723
+
724
+ subparsers = parser.add_subparsers(dest="command", help="Commands")
725
+
726
+ # ─── Lattice commands ───
727
+ subparsers.add_parser("stats", help="Show lattice statistics").set_defaults(func=cmd_stats)
728
+ subparsers.add_parser("chains", help="List all chains").set_defaults(func=cmd_chains)
729
+ subparsers.add_parser("init", help="Show setup instructions").set_defaults(func=cmd_init)
730
+ subparsers.add_parser("watch", help="Watch live observations").set_defaults(func=cmd_watch)
731
+
732
+ list_p = subparsers.add_parser("list", help="List recent observations")
733
+ list_p.add_argument("--limit", "-n", type=int, default=10, help="Number to show")
734
+ list_p.set_defaults(func=cmd_list)
735
+
736
+ inspect_p = subparsers.add_parser("inspect", help="Inspect an observation")
737
+ inspect_p.add_argument("root", help="Merkle root to inspect")
738
+ inspect_p.set_defaults(func=cmd_inspect)
739
+
740
+ pin_p = subparsers.add_parser("pin", help="Pin observation to IPFS")
741
+ pin_p.add_argument("root", help="Merkle root to pin")
742
+ pin_p.set_defaults(func=cmd_pin)
743
+
744
+ export_p = subparsers.add_parser("export", help="Export lattice/chain to JSON")
745
+ export_p.add_argument("--chain", "-c", help="Export specific chain")
746
+ export_p.add_argument("--output", "-o", default="cascade_export.json", help="Output file")
747
+ export_p.set_defaults(func=cmd_export)
748
+
749
+ # ─── Model commands ───
750
+ observe_p = subparsers.add_parser("observe", help="Manual observation")
751
+ observe_p.add_argument("--model", "-m", required=True, help="Model ID")
752
+ observe_p.add_argument("--input", "-i", required=True, help="Input data")
753
+ observe_p.add_argument("--output", "-o", required=True, help="Output data")
754
+ observe_p.set_defaults(func=cmd_observe)
755
+
756
+ fp_p = subparsers.add_parser("fingerprint", help="Fingerprint a model")
757
+ fp_p.add_argument("model", help="Model name/path")
758
+ fp_p.set_defaults(func=cmd_fingerprint)
759
+
760
+ # ─── Data commands ───
761
+ entities_p = subparsers.add_parser("entities", help="Entity resolution")
762
+ entities_p.add_argument("file", help="File to analyze")
763
+ entities_p.set_defaults(func=cmd_entities)
764
+
765
+ pii_p = subparsers.add_parser("pii", help="Scan for PII")
766
+ pii_p.add_argument("file", help="File to scan")
767
+ pii_p.set_defaults(func=cmd_pii)
768
+
769
+ prov_p = subparsers.add_parser("provenance", help="Data provenance")
770
+ prov_p.add_argument("path", help="File or dataset path")
771
+ prov_p.set_defaults(func=cmd_provenance)
772
+
773
+ # ─── System commands ───
774
+ ingest_p = subparsers.add_parser("ingest", help="Ingest logs/files")
775
+ ingest_p.add_argument("path", help="Path to ingest")
776
+ ingest_p.set_defaults(func=cmd_ingest)
777
+
778
+ analyze_p = subparsers.add_parser("analyze", help="Analyze logs/files")
779
+ analyze_p.add_argument("path", help="Path to analyze")
780
+ analyze_p.set_defaults(func=cmd_analyze)
781
+
782
+ # ─── Proxy ───
783
+ proxy_p = subparsers.add_parser("proxy", help="Start proxy server")
784
+ proxy_p.add_argument("--host", default="0.0.0.0", help="Host to bind")
785
+ proxy_p.add_argument("--port", "-p", type=int, default=7777, help="Port")
786
+ proxy_p.add_argument("--quiet", "-q", action="store_true", help="Quiet mode")
787
+ proxy_p.set_defaults(func=cmd_proxy)
788
+
789
+ # ─── HOLD - Inference-Level Halt Protocol ───
790
+ hold_p = subparsers.add_parser("hold", help="Show HOLD usage and API info")
791
+ hold_p.set_defaults(func=cmd_hold_info)
792
+
793
+ hold_status_p = subparsers.add_parser("hold-status", help="Show HOLD system status")
794
+ hold_status_p.set_defaults(func=cmd_hold_status)
795
+
796
+ # Parse
797
+ args = parser.parse_args()
798
+
799
+ if args.version:
800
+ cmd_version(args)
801
+ return
802
+
803
+ if not args.command:
804
+ if HAS_RICH:
805
+ console.print(Panel(
806
+ """[cyan]CASCADE[/] - Universal AI Provenance Layer
807
+
808
+ [bold]Lattice Commands:[/]
809
+ [green]stats[/] Show lattice statistics
810
+ [green]chains[/] List all chains
811
+ [green]list[/] List recent observations
812
+ [green]inspect[/] Inspect an observation
813
+ [green]watch[/] Live observation feed
814
+ [green]pin[/] Pin to IPFS
815
+ [green]export[/] Export to JSON
816
+
817
+ [bold]Model Commands:[/]
818
+ [green]observe[/] Manual observation
819
+ [green]fingerprint[/] Fingerprint a model
820
+
821
+ [bold]Data Commands:[/]
822
+ [green]entities[/] Entity resolution
823
+ [green]pii[/] PII scanner
824
+ [green]provenance[/] Data provenance
825
+
826
+ [bold]System Commands:[/]
827
+ [green]ingest[/] Ingest files/logs
828
+ [green]analyze[/] Analyze files
829
+
830
+ [bold]HOLD (Inference Halt):[/]
831
+ [green]hold[/] Show HOLD usage and API info
832
+ [green]hold-status[/] Show HOLD system status
833
+
834
+ [bold]Other:[/]
835
+ [green]proxy[/] Start proxy server
836
+ [green]init[/] Setup instructions
837
+
838
+ Use [cyan]cascade <command> --help[/] for details.""",
839
+ title="[bold magenta]🌀 CASCADE[/]",
840
+ subtitle="[dim]pip install cascade-ai[/]",
841
+ border_style="magenta",
842
+ ))
843
+ else:
844
+ parser.print_help()
845
+ return
846
+
847
+ args.func(args)
848
+
849
+
850
+ if __name__ == "__main__":
851
+ main()
cascade/core/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Cascade Core module - fundamental data structures and algorithms."""
2
+
3
+ from cascade.core.event import Event, CausationLink, CausationChain
4
+ from cascade.core.graph import CausationGraph
5
+ from cascade.core.adapter import SymbioticAdapter
6
+
7
+ __all__ = [
8
+ "Event",
9
+ "CausationLink",
10
+ "CausationChain",
11
+ "CausationGraph",
12
+ "SymbioticAdapter",
13
+ ]
cascade/core/adapter.py ADDED
@@ -0,0 +1,470 @@
1
+ """
2
+ Cascade Core - Symbiotic Adapter.
3
+
4
+ The heart of Cascade's system-agnostic design. The adapter uses Kleene fixed-point
5
+ convergence to interpret ANY signal format and convert it to Events.
6
+
7
+ "It doesn't hook into your system — it becomes part of it."
8
+ """
9
+
10
+ import time
11
+ import json
12
+ import re
13
+ from typing import Any, Dict, List, Optional, Callable, Type
14
+ from dataclasses import dataclass
15
+
16
+ from cascade.core.event import Event
17
+
18
+
19
+ @dataclass
20
+ class SignalPattern:
21
+ """A learned pattern for interpreting signals."""
22
+ pattern_type: str # 'dict', 'string', 'tensor', 'protobuf', etc.
23
+ component: str
24
+ event_type: str
25
+ extractor: Optional[Callable[[Any], Dict[str, Any]]] = None
26
+ confidence: float = 0.0
27
+ match_count: int = 0
28
+
29
+
30
+ class SymbioticAdapter:
31
+ """
32
+ Self-interpreting adapter that converges to any signal format.
33
+
34
+ The adapter observes signals from the host system and learns how to
35
+ interpret them through fixed-point iteration. It starts with naive
36
+ interpretations and refines them until stable.
37
+
38
+ This is the key to Cascade's system-agnostic design:
39
+ - No framework-specific hooks required
40
+ - No configuration needed
41
+ - Feed it ANY signal format, it adapts
42
+
43
+ Example:
44
+ >>> adapter = SymbioticAdapter()
45
+ >>>
46
+ >>> # Feed it different signal formats
47
+ >>> adapter.interpret({"loss": 0.5, "epoch": 10})
48
+ >>> adapter.interpret("2024-01-01 12:00:00 ERROR training failed")
49
+ >>> adapter.interpret(torch.tensor([0.1, 0.2, 0.3]))
50
+ >>>
51
+ >>> # It learns patterns and gets better at interpretation
52
+ >>> print(adapter.learned_patterns)
53
+ """
54
+
55
+ def __init__(self):
56
+ """Initialize the symbiotic adapter."""
57
+ self._patterns: List[SignalPattern] = []
58
+ self._signal_count = 0
59
+ self._interpretation_cache: Dict[str, SignalPattern] = {}
60
+
61
+ # Built-in interpreters for common formats
62
+ self._builtin_interpreters = {
63
+ dict: self._interpret_dict,
64
+ str: self._interpret_string,
65
+ list: self._interpret_list,
66
+ }
67
+
68
+ # Regex patterns for log line parsing
69
+ self._log_patterns = [
70
+ # ISO timestamp with level: "2024-01-01 12:00:00 ERROR message"
71
+ re.compile(r'^(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
72
+ # Simple timestamp: "12:00:00.123 component message"
73
+ re.compile(r'^(\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
74
+ # Pipe-delimited: "timestamp|level|component|key:value"
75
+ re.compile(r'^([^|]+)\|(\w+)\|(\w+)\|(.*)$'),
76
+ ]
77
+
78
+ # Metric extraction patterns - ONLY extract real training metrics
79
+ # Be strict to avoid extracting garbage from config lines
80
+ self._metric_patterns = [
81
+ # Standard training metrics with = or :
82
+ re.compile(r'\b(loss|val_loss|train_loss|accuracy|acc|val_acc|lr|learning_rate|epoch|step|iter|iteration|mfu|tokens_per_sec|samples_per_sec|grad_norm|perplexity|ppl)[=:]\s*([+-]?\d+\.?\d*(?:e[+-]?\d+)?)', re.I),
83
+ # "iter X: loss=Y" format from nanoGPT
84
+ re.compile(r'iter\s+(\d+).*loss[=:]?\s*([+-]?\d+\.?\d*)', re.I),
85
+ # "step X loss Y" format
86
+ re.compile(r'step\s+(\d+).*loss\s*[=:]?\s*([+-]?\d+\.?\d*)', re.I),
87
+ ]
88
+
89
+ def interpret(self, signal: Any) -> Event:
90
+ """
91
+ Interpret any signal into a Cascade Event.
92
+
93
+ Uses Kleene fixed-point iteration to converge on the best interpretation.
94
+
95
+ Args:
96
+ signal: Any signal from the host system
97
+
98
+ Returns:
99
+ Event: The interpreted event
100
+ """
101
+ self._signal_count += 1
102
+
103
+ # Get signal type
104
+ signal_type = type(signal)
105
+
106
+ # Try cached pattern first
107
+ cache_key = self._get_cache_key(signal)
108
+ if cache_key in self._interpretation_cache:
109
+ pattern = self._interpretation_cache[cache_key]
110
+ pattern.match_count += 1
111
+ return self._apply_pattern(signal, pattern)
112
+
113
+ # Try built-in interpreter
114
+ if signal_type in self._builtin_interpreters:
115
+ event = self._builtin_interpreters[signal_type](signal)
116
+ self._learn_pattern(signal, event)
117
+ return event
118
+
119
+ # Try tensor-like objects (duck typing)
120
+ if hasattr(signal, 'numpy') or hasattr(signal, 'detach'):
121
+ event = self._interpret_tensor(signal)
122
+ self._learn_pattern(signal, event)
123
+ return event
124
+
125
+ # Try protobuf-like objects
126
+ if hasattr(signal, 'SerializeToString'):
127
+ event = self._interpret_protobuf(signal)
128
+ self._learn_pattern(signal, event)
129
+ return event
130
+
131
+ # Fallback: convert to string and interpret
132
+ event = self._interpret_string(str(signal))
133
+ return event
134
+
135
+ def _interpret_dict(self, signal: Dict[str, Any]) -> Event:
136
+ """Interpret a dictionary signal."""
137
+ # Extract common fields
138
+ timestamp = signal.get('timestamp', signal.get('time', time.time()))
139
+ if isinstance(timestamp, str):
140
+ try:
141
+ from datetime import datetime
142
+ timestamp = datetime.fromisoformat(timestamp).timestamp()
143
+ except (ValueError, TypeError):
144
+ timestamp = time.time()
145
+
146
+ component = signal.get('component', signal.get('source', 'unknown'))
147
+ event_type = signal.get('event_type', signal.get('type', 'state_change'))
148
+
149
+ # Everything else goes in data
150
+ reserved = {'timestamp', 'time', 'component', 'source', 'event_type', 'type'}
151
+ data = {k: v for k, v in signal.items() if k not in reserved}
152
+
153
+ return Event(
154
+ timestamp=timestamp,
155
+ component=component,
156
+ event_type=event_type,
157
+ data=data,
158
+ source_signal=signal,
159
+ )
160
+
161
+ def _interpret_string(self, signal: str) -> Event:
162
+ """Interpret a string signal (log line, message, etc.)."""
163
+ signal = signal.strip()
164
+
165
+ # Try each log pattern
166
+ for pattern in self._log_patterns:
167
+ match = pattern.match(signal)
168
+ if match:
169
+ groups = match.groups()
170
+ if len(groups) >= 3:
171
+ timestamp_str, level_or_component, rest = groups[0], groups[1], groups[-1]
172
+
173
+ # Parse timestamp
174
+ try:
175
+ from datetime import datetime
176
+ timestamp = datetime.fromisoformat(timestamp_str.replace(' ', 'T')).timestamp()
177
+ except (ValueError, TypeError):
178
+ timestamp = time.time()
179
+
180
+ # Extract metrics from the rest
181
+ data = self._extract_metrics(rest)
182
+ data['raw_message'] = rest
183
+
184
+ # Determine event type from keywords
185
+ event_type = self._infer_event_type(signal)
186
+
187
+ return Event(
188
+ timestamp=timestamp,
189
+ component=level_or_component.lower(),
190
+ event_type=event_type,
191
+ data=data,
192
+ source_signal=signal,
193
+ )
194
+
195
+ # Fallback: extract what we can with smarter component detection
196
+ data = self._extract_metrics(signal)
197
+ data['raw_message'] = signal
198
+
199
+ # Infer component from content
200
+ component = self._infer_component(signal)
201
+
202
+ return Event(
203
+ timestamp=time.time(),
204
+ component=component,
205
+ event_type=self._infer_event_type(signal),
206
+ data=data,
207
+ source_signal=signal,
208
+ )
209
+
210
+ def _interpret_list(self, signal: List[Any]) -> Event:
211
+ """Interpret a list signal."""
212
+ # Convert to dict with indices
213
+ data = {f'item_{i}': v for i, v in enumerate(signal)}
214
+ data['length'] = len(signal)
215
+
216
+ # Check if it looks like numeric data
217
+ if all(isinstance(x, (int, float)) for x in signal):
218
+ data['mean'] = sum(signal) / len(signal) if signal else 0
219
+ data['min'] = min(signal) if signal else 0
220
+ data['max'] = max(signal) if signal else 0
221
+
222
+ return Event(
223
+ timestamp=time.time(),
224
+ component='data',
225
+ event_type='list_signal',
226
+ data=data,
227
+ source_signal=signal,
228
+ )
229
+
230
+ def _interpret_tensor(self, signal: Any) -> Event:
231
+ """Interpret a tensor-like signal (PyTorch, NumPy, etc.)."""
232
+ # Try to get numpy array
233
+ try:
234
+ if hasattr(signal, 'detach'):
235
+ arr = signal.detach().cpu().numpy()
236
+ elif hasattr(signal, 'numpy'):
237
+ arr = signal.numpy()
238
+ else:
239
+ arr = signal
240
+
241
+ data = {
242
+ 'shape': list(arr.shape) if hasattr(arr, 'shape') else [],
243
+ 'dtype': str(arr.dtype) if hasattr(arr, 'dtype') else 'unknown',
244
+ 'mean': float(arr.mean()) if hasattr(arr, 'mean') else 0,
245
+ 'std': float(arr.std()) if hasattr(arr, 'std') else 0,
246
+ 'min': float(arr.min()) if hasattr(arr, 'min') else 0,
247
+ 'max': float(arr.max()) if hasattr(arr, 'max') else 0,
248
+ }
249
+
250
+ # Check for NaN/Inf (common in gradient explosions)
251
+ if hasattr(arr, 'isnan'):
252
+ data['has_nan'] = bool(arr.isnan().any())
253
+ if hasattr(arr, 'isinf'):
254
+ data['has_inf'] = bool(arr.isinf().any())
255
+
256
+ except Exception as e:
257
+ data = {'error': str(e), 'type': str(type(signal))}
258
+
259
+ return Event(
260
+ timestamp=time.time(),
261
+ component='tensor',
262
+ event_type='tensor_signal',
263
+ data=data,
264
+ source_signal=None, # Don't store tensor to save memory
265
+ )
266
+
267
+ def _interpret_protobuf(self, signal: Any) -> Event:
268
+ """Interpret a protobuf-like signal."""
269
+ try:
270
+ # Try to convert to dict
271
+ if hasattr(signal, 'DESCRIPTOR'):
272
+ from google.protobuf.json_format import MessageToDict
273
+ data = MessageToDict(signal)
274
+ else:
275
+ data = {'raw': str(signal)}
276
+ except Exception:
277
+ data = {'raw': str(signal)}
278
+
279
+ return Event(
280
+ timestamp=time.time(),
281
+ component='protobuf',
282
+ event_type='protobuf_signal',
283
+ data=data,
284
+ source_signal=None,
285
+ )
286
+
287
+ def _extract_metrics(self, text: str) -> Dict[str, Any]:
288
+ """Extract numeric metrics from text - STRICT, only real training metrics."""
289
+ metrics = {}
290
+
291
+ # nanoGPT format: "iter 0: loss=4.2176, time 46.76ms, mfu 0.62%"
292
+ nano_match = re.search(r'iter\s+(\d+).*loss[=:]?\s*([\d.]+)', text, re.I)
293
+ if nano_match:
294
+ metrics['iter'] = int(nano_match.group(1))
295
+ metrics['loss'] = float(nano_match.group(2))
296
+
297
+ # Diffusers/tqdm format: "step_loss=0.1234" or "step_loss: 0.1234"
298
+ step_loss_match = re.search(r'step_loss[=:]\s*([\d.e+-]+)', text, re.I)
299
+ if step_loss_match:
300
+ metrics['loss'] = float(step_loss_match.group(1))
301
+
302
+ # train_loss format from accelerator.log
303
+ train_loss_match = re.search(r'train_loss[=:]\s*([\d.e+-]+)', text, re.I)
304
+ if train_loss_match:
305
+ metrics['loss'] = float(train_loss_match.group(1))
306
+
307
+ # tqdm progress format: " 5%|█ | 5/100 [00:30<09:30, step_loss=0.234, lr=1e-5]"
308
+ tqdm_match = re.search(r'(\d+)%\|.*\|\s*(\d+)/(\d+)', text)
309
+ if tqdm_match:
310
+ metrics['progress_pct'] = int(tqdm_match.group(1))
311
+ metrics['step'] = int(tqdm_match.group(2))
312
+ metrics['total_steps'] = int(tqdm_match.group(3))
313
+
314
+ # Generic loss patterns
315
+ generic_loss = re.search(r'\bloss[=:]\s*([\d.e+-]+)', text, re.I)
316
+ if generic_loss and 'loss' not in metrics:
317
+ metrics['loss'] = float(generic_loss.group(1))
318
+
319
+ # mfu extraction
320
+ mfu_match = re.search(r'mfu\s*[=:]?\s*([\d.]+)%?', text, re.I)
321
+ if mfu_match:
322
+ metrics['mfu'] = float(mfu_match.group(1))
323
+
324
+ # time extraction (ms)
325
+ time_match = re.search(r'time\s*[=:]?\s*([\d.]+)\s*ms', text, re.I)
326
+ if time_match:
327
+ metrics['time_ms'] = float(time_match.group(1))
328
+
329
+ # learning rate - multiple formats
330
+ lr_match = re.search(r'\b(?:lr|learning_rate)\s*[=:]\s*([\d.e+-]+)', text, re.I)
331
+ if lr_match:
332
+ metrics['lr'] = float(lr_match.group(1))
333
+
334
+ # epoch/step for other frameworks
335
+ epoch_match = re.search(r'\bepoch\s*[=:]\s*(\d+)', text, re.I)
336
+ if epoch_match:
337
+ metrics['epoch'] = int(epoch_match.group(1))
338
+
339
+ step_match = re.search(r'\bstep\s*[=:]\s*(\d+)', text, re.I)
340
+ if step_match and 'step' not in metrics:
341
+ metrics['step'] = int(step_match.group(1))
342
+
343
+ # global_step from diffusers
344
+ global_step_match = re.search(r'global_step[=:]\s*(\d+)', text, re.I)
345
+ if global_step_match:
346
+ metrics['step'] = int(global_step_match.group(1))
347
+
348
+ return metrics
349
+
350
+ def _infer_event_type(self, text: str) -> str:
351
+ """Infer event type from text content."""
352
+ text_lower = text.lower()
353
+
354
+ # Training iteration logs (highest priority)
355
+ if re.search(r'iter\s+\d+.*loss', text_lower):
356
+ return 'training_step'
357
+ if re.search(r'step\s+\d+.*loss', text_lower):
358
+ return 'training_step'
359
+
360
+ if any(kw in text_lower for kw in ['error', 'exception', 'failed', 'crash']):
361
+ return 'error'
362
+ if any(kw in text_lower for kw in ['warning', 'warn']):
363
+ return 'warning'
364
+ if any(kw in text_lower for kw in ['gradient', 'backward']):
365
+ return 'training'
366
+ if 'loss' in text_lower and 'val' in text_lower:
367
+ return 'validation'
368
+ if any(kw in text_lower for kw in ['inference', 'predict', 'forward']):
369
+ return 'inference'
370
+ if any(kw in text_lower for kw in ['epoch', 'step', 'iteration', 'iter']):
371
+ return 'progress'
372
+ if any(kw in text_lower for kw in ['nan', 'inf', 'explode', 'overflow']):
373
+ return 'anomaly'
374
+ if any(kw in text_lower for kw in ['save', 'checkpoint', 'load', 'saving']):
375
+ return 'checkpoint'
376
+ if any(kw in text_lower for kw in ['config', 'setting', 'parameter', 'device', 'gpu', 'cuda']):
377
+ return 'config'
378
+ if any(kw in text_lower for kw in ['initializ', 'loading model', 'compiling']):
379
+ return 'init'
380
+
381
+ return 'state_change'
382
+
383
+ def _infer_component(self, text: str) -> str:
384
+ """Infer component from text content - NO MORE 'unknown'."""
385
+ text_lower = text.lower()
386
+
387
+ # Training/optimizer related
388
+ if any(kw in text_lower for kw in ['iter', 'step', 'epoch', 'batch']):
389
+ return 'trainer'
390
+ if any(kw in text_lower for kw in ['loss', 'backward', 'gradient']):
391
+ return 'loss'
392
+ if any(kw in text_lower for kw in ['optim', 'adam', 'sgd', 'lr', 'learning']):
393
+ return 'optimizer'
394
+ if any(kw in text_lower for kw in ['model', 'layer', 'param', 'weight']):
395
+ return 'model'
396
+ if any(kw in text_lower for kw in ['data', 'batch', 'loader', 'dataset']):
397
+ return 'data'
398
+ if any(kw in text_lower for kw in ['cuda', 'gpu', 'device', 'memory']):
399
+ return 'device'
400
+ if any(kw in text_lower for kw in ['checkpoint', 'save', 'load']):
401
+ return 'checkpoint'
402
+ if any(kw in text_lower for kw in ['config', 'setting', 'override']):
403
+ return 'config'
404
+ if any(kw in text_lower for kw in ['eval', 'valid', 'test']):
405
+ return 'evaluator'
406
+ if any(kw in text_lower for kw in ['token', 'vocab', 'embed']):
407
+ return 'tokenizer'
408
+
409
+ return 'system' # Generic fallback, not "unknown"
410
+
411
+ def _get_cache_key(self, signal: Any) -> str:
412
+ """Generate a cache key for a signal's structure."""
413
+ if isinstance(signal, dict):
414
+ # Key based on dict keys
415
+ return f"dict:{':'.join(sorted(signal.keys()))}"
416
+ elif isinstance(signal, str):
417
+ # Key based on first word
418
+ first_word = signal.split()[0] if signal.split() else ''
419
+ return f"str:{first_word[:20]}"
420
+ else:
421
+ return f"type:{type(signal).__name__}"
422
+
423
+ def _learn_pattern(self, signal: Any, event: Event) -> None:
424
+ """Learn a pattern from a successful interpretation."""
425
+ cache_key = self._get_cache_key(signal)
426
+ pattern = SignalPattern(
427
+ pattern_type=type(signal).__name__,
428
+ component=event.component,
429
+ event_type=event.event_type,
430
+ confidence=0.5,
431
+ match_count=1,
432
+ )
433
+ self._interpretation_cache[cache_key] = pattern
434
+ self._patterns.append(pattern)
435
+
436
+ def _apply_pattern(self, signal: Any, pattern: SignalPattern) -> Event:
437
+ """Apply a learned pattern to interpret a signal."""
438
+ # Re-interpret with learned hints - use direct interpreters to avoid recursion
439
+ if isinstance(signal, dict):
440
+ event = self._interpret_dict(signal)
441
+ # Apply learned component/type if more confident
442
+ if pattern.confidence > 0.7:
443
+ return Event(
444
+ timestamp=event.timestamp,
445
+ component=pattern.component,
446
+ event_type=pattern.event_type,
447
+ data=event.data,
448
+ source_signal=signal,
449
+ )
450
+ return event
451
+ elif isinstance(signal, str):
452
+ return self._interpret_string(signal)
453
+ elif isinstance(signal, list):
454
+ return self._interpret_list(signal)
455
+ else:
456
+ # Fallback: interpret as string without recursion
457
+ return self._interpret_string(str(signal))
458
+
459
+ @property
460
+ def learned_patterns(self) -> List[SignalPattern]:
461
+ """Get all learned signal patterns."""
462
+ return sorted(self._patterns, key=lambda p: p.match_count, reverse=True)
463
+
464
+ @property
465
+ def signal_count(self) -> int:
466
+ """Total number of signals interpreted."""
467
+ return self._signal_count
468
+
469
+ def __repr__(self) -> str:
470
+ return f"<SymbioticAdapter | {self._signal_count} signals, {len(self._patterns)} patterns>"
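# Illustrative sketch only (not part of this commit): feeding the adapter two of
# the signal shapes handled above. The log line is made up; the values noted in
# the comments follow from the regexes and inference rules defined in this file.
from cascade.core.adapter import SymbioticAdapter

adapter = SymbioticAdapter()

# A nanoGPT-style log line takes the fallback string path:
evt = adapter.interpret("iter 100: loss=2.3100, time 45.20ms, mfu 38.20%")
# evt.component == "trainer", evt.event_type == "training_step"
# evt.data includes iter=100, loss=2.31, time_ms=45.2, mfu=38.2

# A dict signal maps reserved keys onto Event fields and keeps the rest as data:
evt2 = adapter.interpret({"loss": 0.42, "epoch": 3, "component": "trainer"})
# evt2.component == "trainer", evt2.data == {"loss": 0.42, "epoch": 3}

print(adapter)   # <SymbioticAdapter | 2 signals, 2 patterns>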
cascade/core/event.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ Cascade Core - Event and CausationLink primitives.
3
+
4
+ These are the fundamental data structures that represent causation.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, List, Any, Optional
9
+ from datetime import datetime
10
+ import time
11
+ import uuid
12
+
13
+
14
+ def _generate_event_id() -> str:
15
+ """Generate a unique event ID with timestamp prefix for ordering."""
16
+ timestamp = int(time.time() * 1000000)
17
+ unique = uuid.uuid4().hex[:8]
18
+ return f"evt_{timestamp}_{unique}"
19
+
20
+
21
+ @dataclass
22
+ class Event:
23
+ """
24
+ A discrete event in the causation graph.
25
+
26
+ Events are the nodes in your causation graph. Each event represents
27
+ something that happened in your system at a point in time.
28
+
29
+ Attributes:
30
+ event_id: Unique identifier (auto-generated if not provided)
31
+ timestamp: Unix timestamp when event occurred
32
+ component: Which system component generated this event
33
+ event_type: Category of event (e.g., 'training', 'inference', 'error')
34
+ data: Arbitrary key-value data associated with the event
35
+ source_signal: The original signal that created this event (for debugging)
36
+
37
+ Example:
38
+ >>> event = Event(
39
+ ... timestamp=time.time(),
40
+ ... component="neural_network",
41
+ ... event_type="gradient_explosion",
42
+ ... data={"layer": "fc3", "magnitude": 1e12}
43
+ ... )
44
+ """
45
+ timestamp: float
46
+ component: str
47
+ event_type: str
48
+ data: Dict[str, Any] = field(default_factory=dict)
49
+ event_id: str = field(default_factory=_generate_event_id)
50
+ source_signal: Optional[Any] = field(default=None, repr=False)
51
+
52
+ def __post_init__(self):
53
+ """Ensure timestamp is float."""
54
+ if isinstance(self.timestamp, datetime):
55
+ self.timestamp = self.timestamp.timestamp()
56
+
57
+ def to_dict(self) -> Dict[str, Any]:
58
+ """Serialize event to dictionary."""
59
+ return {
60
+ "event_id": self.event_id,
61
+ "timestamp": self.timestamp,
62
+ "component": self.component,
63
+ "event_type": self.event_type,
64
+ "data": self.data,
65
+ }
66
+
67
+ @classmethod
68
+ def from_dict(cls, d: Dict[str, Any]) -> "Event":
69
+ """Deserialize event from dictionary."""
70
+ return cls(
71
+ event_id=d.get("event_id", _generate_event_id()),
72
+ timestamp=d["timestamp"],
73
+ component=d["component"],
74
+ event_type=d["event_type"],
75
+ data=d.get("data", {}),
76
+ )
77
+
78
+ def __hash__(self):
79
+ return hash(self.event_id)
80
+
81
+ def __eq__(self, other):
82
+ if isinstance(other, Event):
83
+ return self.event_id == other.event_id
84
+ return False
85
+
86
+
87
+ @dataclass
88
+ class CausationLink:
89
+ """
90
+ A causal relationship between two events.
91
+
92
+ Links are the edges in your causation graph. Each link represents
93
+ a cause-effect relationship: event A caused event B.
94
+
95
+ Attributes:
96
+ from_event: ID of the causing event
97
+ to_event: ID of the caused event
98
+ causation_type: How the causation was detected
99
+ - 'temporal': A happened shortly before B
100
+ - 'correlation': A and B metrics moved together
101
+ - 'threshold': A crossed a threshold triggering B
102
+ - 'direct': Explicit causation declared in code
103
+ strength: Confidence in the causal relationship (0.0 to 1.0)
104
+ explanation: Human-readable explanation of the link
105
+ metrics_involved: Which metrics connect these events
106
+
107
+ Example:
108
+ >>> link = CausationLink(
109
+ ... from_event="evt_123",
110
+ ... to_event="evt_456",
111
+ ... causation_type="threshold",
112
+ ... strength=0.95,
113
+ ... explanation="Loss exceeded 10.0, triggering gradient clipping"
114
+ ... )
115
+ """
116
+ from_event: str
117
+ to_event: str
118
+ causation_type: str # 'temporal', 'correlation', 'threshold', 'direct'
119
+ strength: float = 1.0
120
+ explanation: str = ""
121
+ metrics_involved: List[str] = field(default_factory=list)
122
+
123
+ def __post_init__(self):
124
+ """Validate strength is in range."""
125
+ self.strength = max(0.0, min(1.0, self.strength))
126
+
127
+ def to_dict(self) -> Dict[str, Any]:
128
+ """Serialize link to dictionary."""
129
+ return {
130
+ "from_event": self.from_event,
131
+ "to_event": self.to_event,
132
+ "causation_type": self.causation_type,
133
+ "strength": self.strength,
134
+ "explanation": self.explanation,
135
+ "metrics_involved": self.metrics_involved,
136
+ }
137
+
138
+ @classmethod
139
+ def from_dict(cls, d: Dict[str, Any]) -> "CausationLink":
140
+ """Deserialize link from dictionary."""
141
+ return cls(
142
+ from_event=d["from_event"],
143
+ to_event=d["to_event"],
144
+ causation_type=d["causation_type"],
145
+ strength=d.get("strength", 1.0),
146
+ explanation=d.get("explanation", ""),
147
+ metrics_involved=d.get("metrics_involved", []),
148
+ )
149
+
150
+
151
+ @dataclass
152
+ class CausationChain:
153
+ """
154
+ A chain of causal events from origin to destination.
155
+
156
+ Represents a full causal path through the graph.
157
+
158
+ Attributes:
159
+ events: List of events in causal order
160
+ links: List of links connecting the events
161
+ total_strength: Combined strength of all links
162
+ depth: Number of hops in the chain
163
+ narrative: Human-readable story of what happened
164
+ """
165
+ events: List[Event]
166
+ links: List[CausationLink]
167
+ total_strength: float = 1.0
168
+ depth: int = 0
169
+ narrative: str = ""
170
+
171
+ def __post_init__(self):
172
+ self.depth = len(self.links)
173
+ if not self.total_strength and self.links:
174
+ # Calculate combined strength
175
+ self.total_strength = 1.0
176
+ for link in self.links:
177
+ self.total_strength *= link.strength
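# Illustrative sketch only (not part of this commit): constructing the primitives
# above and round-tripping an Event through its dict form. All values are made up.
import time
from cascade.core.event import Event, CausationLink, CausationChain

spike = Event(
    timestamp=time.time(),
    component="loss",
    event_type="anomaly",
    data={"loss": 1e6},
)
clip = Event(
    timestamp=time.time(),
    component="optimizer",
    event_type="state_change",
    data={"action": "clip_gradients"},
)
link = CausationLink(
    from_event=spike.event_id,
    to_event=clip.event_id,
    causation_type="threshold",
    strength=1.7,                       # clamped to 1.0 in __post_init__
    explanation="loss spike triggered gradient clipping",
)
assert link.strength == 1.0
assert Event.from_dict(spike.to_dict()) == spike    # equality is by event_id

chain = CausationChain(events=[spike, clip], links=[link])
assert chain.depth == 1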
cascade/core/graph.py ADDED
@@ -0,0 +1,292 @@
1
+ """
2
+ Cascade Core - Causation Graph Engine.
3
+
4
+ The graph stores events and their causal relationships, enabling
5
+ bidirectional traversal through time.
6
+ """
7
+
8
+ import threading
9
+ from typing import Dict, List, Optional, Set, Any, Iterator, Tuple
10
+ from collections import defaultdict
11
+ from datetime import datetime
12
+
13
+ try:
14
+ import networkx as nx
15
+ HAS_NETWORKX = True
16
+ except ImportError:
17
+ HAS_NETWORKX = False
18
+
19
+ from cascade.core.event import Event, CausationLink
20
+
21
+
22
+ class CausationGraph:
23
+ """
24
+ A directed graph of causal relationships between events.
25
+
26
+ The graph enables bidirectional traversal:
27
+ - Backwards: "What caused this event?"
28
+ - Forwards: "What did this event cause?"
29
+
30
+ Thread-safe for concurrent event ingestion.
31
+
32
+ Example:
33
+ >>> graph = CausationGraph()
34
+ >>> graph.add_event(event1)
35
+ >>> graph.add_event(event2)
36
+ >>> graph.add_link(CausationLink(
37
+ ... from_event=event1.event_id,
38
+ ... to_event=event2.event_id,
39
+ ... causation_type="temporal",
40
+ ... strength=0.9
41
+ ... ))
42
+ >>>
43
+ >>> # Find what caused event2
44
+ >>> causes = graph.get_causes(event2.event_id)
45
+ """
46
+
47
+ def __init__(self):
48
+ """Initialize an empty causation graph."""
49
+ self._lock = threading.RLock()
50
+
51
+ # Event storage
52
+ self._events: Dict[str, Event] = {}
53
+ self._events_by_component: Dict[str, List[str]] = defaultdict(list)
54
+ self._events_by_type: Dict[str, List[str]] = defaultdict(list)
55
+ self._events_by_time: List[str] = [] # Ordered by timestamp
56
+
57
+ # Link storage
58
+ self._links: Dict[str, CausationLink] = {} # link_id -> link
59
+ self._causes: Dict[str, Set[str]] = defaultdict(set) # event_id -> set of cause event_ids
60
+ self._effects: Dict[str, Set[str]] = defaultdict(set) # event_id -> set of effect event_ids
61
+
62
+ # NetworkX graph for advanced algorithms (optional)
63
+ if HAS_NETWORKX:
64
+ self._nx_graph = nx.DiGraph()
65
+ else:
66
+ self._nx_graph = None
67
+
68
+ # Statistics
69
+ self._event_count = 0
70
+ self._link_count = 0
71
+
72
+ def add_event(self, event: Event) -> None:
73
+ """
74
+ Add an event to the graph.
75
+
76
+ Thread-safe. Automatically detects potential causations with recent events.
77
+
78
+ Args:
79
+ event: The event to add
80
+ """
81
+ with self._lock:
82
+ if event.event_id in self._events:
83
+ return # Already exists
84
+
85
+ self._events[event.event_id] = event
86
+ self._events_by_component[event.component].append(event.event_id)
87
+ self._events_by_type[event.event_type].append(event.event_id)
88
+ self._events_by_time.append(event.event_id)
89
+ self._event_count += 1
90
+
91
+ if self._nx_graph is not None:
92
+ self._nx_graph.add_node(event.event_id, **event.to_dict())
93
+
94
+ def add_link(self, link: CausationLink) -> None:
95
+ """
96
+ Add a causal link between two events.
97
+
98
+ Thread-safe.
99
+
100
+ Args:
101
+ link: The causation link to add
102
+ """
103
+ with self._lock:
104
+ link_id = f"{link.from_event}->{link.to_event}"
105
+
106
+ if link_id in self._links:
107
+ # Update existing link if new one is stronger
108
+ if link.strength > self._links[link_id].strength:
109
+ self._links[link_id] = link
110
+ return
111
+
112
+ self._links[link_id] = link
113
+ self._causes[link.to_event].add(link.from_event)
114
+ self._effects[link.from_event].add(link.to_event)
115
+ self._link_count += 1
116
+
117
+ if self._nx_graph is not None:
118
+ self._nx_graph.add_edge(
119
+ link.from_event,
120
+ link.to_event,
121
+ **link.to_dict()
122
+ )
123
+
124
+ def get_event(self, event_id: str) -> Optional[Event]:
125
+ """Get an event by ID."""
126
+ with self._lock:
127
+ return self._events.get(event_id)
128
+
129
+ def get_causes(self, event_id: str) -> List[Event]:
130
+ """
131
+ Get all events that directly caused this event.
132
+
133
+ Args:
134
+ event_id: ID of the effect event
135
+
136
+ Returns:
137
+ List of causing events
138
+ """
139
+ with self._lock:
140
+ cause_ids = self._causes.get(event_id, set())
141
+ return [self._events[cid] for cid in cause_ids if cid in self._events]
142
+
143
+ def get_effects(self, event_id: str) -> List[Event]:
144
+ """
145
+ Get all events that were directly caused by this event.
146
+
147
+ Args:
148
+ event_id: ID of the cause event
149
+
150
+ Returns:
151
+ List of effect events
152
+ """
153
+ with self._lock:
154
+ effect_ids = self._effects.get(event_id, set())
155
+ return [self._events[eid] for eid in effect_ids if eid in self._events]
156
+
157
+ def get_link(self, from_event: str, to_event: str) -> Optional[CausationLink]:
158
+ """Get the causation link between two events."""
159
+ with self._lock:
160
+ link_id = f"{from_event}->{to_event}"
161
+ return self._links.get(link_id)
162
+
163
+ def get_all_links(self) -> List[CausationLink]:
164
+ """Get all causal links in the graph."""
165
+ with self._lock:
166
+ return list(self._links.values())
167
+
168
+ def get_component_connections(self) -> Dict[str, Dict[str, float]]:
169
+ """
170
+ Aggregate causal links into component-to-component connections.
171
+
172
+ Returns:
173
+ Dict mapping (from_component, to_component) -> total strength
174
+ """
175
+ with self._lock:
176
+ connections: Dict[tuple, float] = {}
177
+
178
+ for link in self._links.values():
179
+ from_event = self._events.get(link.from_event)
180
+ to_event = self._events.get(link.to_event)
181
+
182
+ if from_event and to_event:
183
+ from_comp = from_event.component
184
+ to_comp = to_event.component
185
+
186
+ if from_comp != to_comp: # Skip self-links
187
+ key = (from_comp, to_comp)
188
+ connections[key] = connections.get(key, 0) + link.strength
189
+
190
+ return connections
191
+
192
+ def get_recent_events(self, count: int = 100) -> List[Event]:
193
+ """Get the most recent events by timestamp."""
194
+ with self._lock:
195
+ ids = self._events_by_time[-count:]
196
+ return [self._events[eid] for eid in reversed(ids)]
197
+
198
+ def get_events_by_component(self, component: str) -> List[Event]:
199
+ """Get all events from a specific component."""
200
+ with self._lock:
201
+ ids = self._events_by_component.get(component, [])
202
+ return [self._events[eid] for eid in ids]
203
+
204
+ def get_events_by_type(self, event_type: str) -> List[Event]:
205
+ """Get all events of a specific type."""
206
+ with self._lock:
207
+ ids = self._events_by_type.get(event_type, [])
208
+ return [self._events[eid] for eid in ids]
209
+
210
+ def find_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
211
+ """
212
+ Find the shortest causal path between two events.
213
+
214
+ Uses NetworkX if available, otherwise falls back to BFS.
215
+
216
+ Args:
217
+ from_event: Starting event ID
218
+ to_event: Target event ID
219
+
220
+ Returns:
221
+ List of event IDs in the path, or None if no path exists
222
+ """
223
+ with self._lock:
224
+ if self._nx_graph is not None:
225
+ try:
226
+ return nx.shortest_path(self._nx_graph, from_event, to_event)
227
+ except nx.NetworkXNoPath:
228
+ return None
229
+ except nx.NodeNotFound:
230
+ return None
231
+ else:
232
+ # BFS fallback
233
+ return self._bfs_path(from_event, to_event)
234
+
235
+ def _bfs_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
236
+ """BFS path finding without NetworkX."""
237
+ from collections import deque
238
+
239
+ if from_event not in self._events or to_event not in self._events:
240
+ return None
241
+
242
+ queue = deque([(from_event, [from_event])])
243
+ visited = {from_event}
244
+
245
+ while queue:
246
+ current, path = queue.popleft()
247
+
248
+ if current == to_event:
249
+ return path
250
+
251
+ for effect_id in self._effects.get(current, set()):
252
+ if effect_id not in visited:
253
+ visited.add(effect_id)
254
+ queue.append((effect_id, path + [effect_id]))
255
+
256
+ return None
257
+
258
+ def get_root_events(self) -> List[Event]:
259
+ """Get events with no causes (entry points)."""
260
+ with self._lock:
261
+ roots = []
262
+ for event_id, event in self._events.items():
263
+ if not self._causes.get(event_id):
264
+ roots.append(event)
265
+ return sorted(roots, key=lambda e: e.timestamp)
266
+
267
+ def get_leaf_events(self) -> List[Event]:
268
+ """Get events with no effects (endpoints)."""
269
+ with self._lock:
270
+ leaves = []
271
+ for event_id, event in self._events.items():
272
+ if not self._effects.get(event_id):
273
+ leaves.append(event)
274
+ return sorted(leaves, key=lambda e: e.timestamp, reverse=True)
275
+
276
+ def get_stats(self) -> Dict[str, Any]:
277
+ """Get statistics about the graph."""
278
+ with self._lock:
279
+ return {
280
+ "event_count": self._event_count,
281
+ "link_count": self._link_count,
282
+ "components": list(self._events_by_component.keys()),
283
+ "event_types": list(self._events_by_type.keys()),
284
+ "root_count": len(self.get_root_events()),
285
+ "leaf_count": len(self.get_leaf_events()),
286
+ }
287
+
288
+ def __len__(self) -> int:
289
+ return self._event_count
290
+
291
+ def __repr__(self) -> str:
292
+ return f"<CausationGraph | {self._event_count} events, {self._link_count} links>"
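# Illustrative sketch only (not part of this commit): a three-event chain walked
# in both directions. Events, links, and strengths are made up.
import time
from cascade.core.event import Event, CausationLink
from cascade.core.graph import CausationGraph

graph = CausationGraph()
events = [
    Event(timestamp=time.time(), component="data", event_type="state_change", data={"batch": 7}),
    Event(timestamp=time.time(), component="loss", event_type="anomaly", data={"loss": float("inf")}),
    Event(timestamp=time.time(), component="trainer", event_type="error", data={"msg": "aborted"}),
]
for e in events:
    graph.add_event(e)
graph.add_link(CausationLink(events[0].event_id, events[1].event_id, "temporal", 0.8))
graph.add_link(CausationLink(events[1].event_id, events[2].event_id, "threshold", 0.95))

print([e.component for e in graph.get_causes(events[2].event_id)])    # ['loss']
print(graph.find_path(events[0].event_id, events[2].event_id))        # list of the three event IDs
print(graph.get_stats()["root_count"], graph.get_stats()["leaf_count"])  # 1 1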
cascade/core/provenance.py ADDED
@@ -0,0 +1,601 @@
1
+ """
2
+ CASCADE // PROVENANCE ENGINE
3
+ Cryptographic lineage tracking for neural network activations.
4
+
5
+ Due process infrastructure for AI - immutable evidence chains
6
+ that enable governance without prescribing decisions.
7
+
8
+ Architecture:
9
+ Input → [Layer₀] → [Layer₁] → ... → [Layerₙ] → Output
10
+ │ │ │
11
+ ▼ ▼ ▼
12
+ Hash₀ ──► Hash₁ ──► ... ──► Hashₙ
13
+ │ │
14
+ └───────── Merkle Root ─────┘
15
+
16
+ Each hash includes:
17
+ - Tensor state (sampled for efficiency)
18
+ - Parent hashes (inputs to this layer)
19
+ - Layer identity (name, params hash)
20
+ - Execution context (order, timestamp)
21
+
22
+ This creates verifiable, tamper-evident records of
23
+ what happened inside the network.
24
+ """
25
+
26
+ import hashlib
27
+ import json
28
+ import time
29
+ from dataclasses import dataclass, field, asdict
30
+ from typing import Dict, List, Optional, Any, Tuple
31
+ from collections import OrderedDict
32
+ import numpy as np
33
+
34
+
35
+ @dataclass
36
+ class ProvenanceRecord:
37
+ """Immutable record of a single layer's activation state."""
38
+
39
+ # Identity
40
+ layer_name: str
41
+ layer_idx: int
42
+
43
+ # Lineage
44
+ state_hash: str # Hash of this layer's output
45
+ parent_hashes: List[str] # Hashes of inputs (usually 1, but attention has multiple)
46
+ params_hash: Optional[str] = None # Hash of layer weights (frozen reference)
47
+
48
+ # Tensor metadata
49
+ shape: List[int] = field(default_factory=list)
50
+ dtype: str = "float32"
51
+
52
+ # Statistics (for visualization, not hashed)
53
+ stats: Dict[str, float] = field(default_factory=dict)
54
+
55
+ # Execution context
56
+ execution_order: int = 0
57
+ timestamp: float = field(default_factory=time.time)
58
+
59
+ # Merkle tree position
60
+ merkle_depth: int = 0
61
+ merkle_path: List[str] = field(default_factory=list)
62
+
63
+ def to_dict(self) -> Dict[str, Any]:
64
+ """Serialize for JSON export."""
65
+ return asdict(self)
66
+
67
+ @classmethod
68
+ def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceRecord':
69
+ """Deserialize from JSON."""
70
+ return cls(**data)
71
+
72
+
73
+ @dataclass
74
+ class ProvenanceChain:
75
+ """Complete provenance chain for a forward pass."""
76
+
77
+ # Session identity
78
+ session_id: str
79
+ model_id: str
80
+ model_hash: str
81
+
82
+ # Input/output
83
+ input_hash: str
84
+ output_hash: Optional[str] = None
85
+
86
+ # The chain itself
87
+ records: Dict[str, ProvenanceRecord] = field(default_factory=OrderedDict)
88
+
89
+ # External system roots (for inter-system linking)
90
+ # When this chain depends on another system's computation,
91
+ # include their merkle_root here. This creates the lattice.
92
+ external_roots: List[str] = field(default_factory=list)
93
+
94
+ # Merkle root (computed after chain complete)
95
+ merkle_root: Optional[str] = None
96
+
97
+ # Metadata
98
+ created_at: float = field(default_factory=time.time)
99
+ finalized: bool = False
100
+
101
+ def add_record(self, record: ProvenanceRecord) -> None:
102
+ """Add a record to the chain. Chain must not be finalized."""
103
+ if self.finalized:
104
+ raise ValueError("Cannot add to finalized chain")
105
+ self.records[record.layer_name] = record
106
+
107
+ def finalize(self) -> str:
108
+ """Compute Merkle root and lock the chain."""
109
+ if self.finalized:
110
+ return self.merkle_root
111
+
112
+ # Build Merkle tree from record hashes + external roots
113
+ # External roots create cryptographic proof of inter-system dependency
114
+ hashes = [r.state_hash for r in self.records.values()]
115
+ hashes.extend(self.external_roots) # Include external system roots
116
+ self.merkle_root = compute_merkle_root(hashes)
117
+ self.finalized = True
118
+ return self.merkle_root
119
+
120
+ def verify(self) -> Tuple[bool, Optional[str]]:
121
+ """Verify chain integrity."""
122
+ if not self.finalized:
123
+ return False, "Chain not finalized"
124
+
125
+ # Recompute Merkle root (including external roots)
126
+ hashes = [r.state_hash for r in self.records.values()]
127
+ hashes.extend(self.external_roots) # Must include external roots
128
+ computed_root = compute_merkle_root(hashes)
129
+
130
+ if computed_root != self.merkle_root:
131
+ return False, f"Merkle root mismatch: {computed_root} != {self.merkle_root}"
132
+
133
+ return True, None
134
+
135
+ def link_external(self, external_merkle_root: str, source_id: Optional[str] = None) -> None:
136
+ """
137
+ Link this chain to another system's merkle root.
138
+
139
+ This creates the neural internetwork - cryptographic proof
140
+ that this computation depended on another system's output.
141
+
142
+ Args:
143
+ external_merkle_root: The merkle root from the external system
144
+ source_id: Optional identifier of the source system
145
+ """
146
+ if self.finalized:
147
+ raise ValueError("Cannot link external root to finalized chain")
148
+ self.external_roots.append(external_merkle_root)
149
+
150
+ def get_lineage(self, layer_name: str) -> List[ProvenanceRecord]:
151
+ """Trace back from a layer to its ancestors."""
152
+ if layer_name not in self.records:
153
+ return []
154
+
155
+ lineage = []
156
+ current = self.records[layer_name]
157
+ visited = set()
158
+
159
+ def trace_back(record: ProvenanceRecord):
160
+ if record.layer_name in visited:
161
+ return
162
+ visited.add(record.layer_name)
163
+ lineage.append(record)
164
+
165
+ for parent_hash in record.parent_hashes:
166
+ # Find record with this hash
167
+ for r in self.records.values():
168
+ if r.state_hash == parent_hash:
169
+ trace_back(r)
170
+ break
171
+
172
+ trace_back(current)
173
+ return lineage
174
+
175
+ def to_dict(self) -> Dict[str, Any]:
176
+ """Serialize entire chain."""
177
+ return {
178
+ "session_id": self.session_id,
179
+ "model_id": self.model_id,
180
+ "model_hash": self.model_hash,
181
+ "input_hash": self.input_hash,
182
+ "output_hash": self.output_hash,
183
+ "external_roots": self.external_roots, # Inter-system links
184
+ "merkle_root": self.merkle_root,
185
+ "created_at": self.created_at,
186
+ "finalized": self.finalized,
187
+ "records": {k: v.to_dict() for k, v in self.records.items()}
188
+ }
189
+
190
+ def to_json(self, indent: int = 2) -> str:
191
+ """Export as JSON."""
192
+ return json.dumps(self.to_dict(), indent=indent)
193
+
194
+ @classmethod
195
+ def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceChain':
196
+ """Deserialize from dict."""
197
+ records = OrderedDict()
198
+ for k, v in data.get("records", {}).items():
199
+ records[k] = ProvenanceRecord.from_dict(v)
200
+
201
+ chain = cls(
202
+ session_id=data["session_id"],
203
+ model_id=data["model_id"],
204
+ model_hash=data["model_hash"],
205
+ input_hash=data["input_hash"],
206
+ output_hash=data.get("output_hash"),
207
+ external_roots=data.get("external_roots", []), # Inter-system links
208
+ merkle_root=data.get("merkle_root"),
209
+ created_at=data.get("created_at", time.time()),
210
+ finalized=data.get("finalized", False),
211
+ )
212
+ chain.records = records
213
+ return chain
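# Illustrative sketch only (not part of this commit): two hand-built chains where
# the downstream chain commits to the upstream chain's Merkle root via
# link_external(). All identifiers and hashes here are arbitrary placeholders,
# not real layer states.
from cascade.core.provenance import ProvenanceChain, ProvenanceRecord

upstream = ProvenanceChain(session_id="up01", model_id="encoder",
                           model_hash="m-enc", input_hash="in-enc")
upstream.add_record(ProvenanceRecord(layer_name="enc.block0", layer_idx=0,
                                     state_hash="h-enc-0", parent_hashes=["in-enc"]))
upstream_root = upstream.finalize()

downstream = ProvenanceChain(session_id="dn01", model_id="decoder",
                             model_hash="m-dec", input_hash="in-dec")
downstream.link_external(upstream_root, source_id="encoder")
downstream.add_record(ProvenanceRecord(layer_name="dec.block0", layer_idx=0,
                                       state_hash="h-dec-0", parent_hashes=["in-dec"]))
downstream.finalize()

print(downstream.external_roots == [upstream_root])    # True
print(downstream.verify())                             # (True, None)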
214
+
215
+
216
+ # =============================================================================
217
+ # HASHING FUNCTIONS
218
+ # =============================================================================
219
+
220
+ def hash_tensor(tensor, sample_size: int = 1000) -> str:
221
+ """
222
+ Compute deterministic hash of tensor state.
223
+
224
+ Samples tensor for efficiency - full hash would be too slow
225
+ for large activations. Sample is deterministic (first N elements
226
+ after flatten) so hash is reproducible.
227
+
228
+ Args:
229
+ tensor: PyTorch tensor or numpy array
230
+ sample_size: Number of elements to sample
231
+
232
+ Returns:
233
+ 16-character hex hash
234
+ """
235
+ # Convert to numpy if needed
236
+ if hasattr(tensor, 'detach'):
237
+ # PyTorch tensor
238
+ arr = tensor.detach().cpu().float().numpy()
239
+ elif hasattr(tensor, 'numpy'):
240
+ arr = tensor.numpy()
241
+ else:
242
+ arr = np.array(tensor)
243
+
244
+ # Flatten and sample
245
+ flat = arr.flatten()
246
+ sample = flat[:min(sample_size, len(flat))]
247
+
248
+ # Hash the bytes
249
+ # Include shape in hash so same values in different shapes hash differently
250
+ shape_bytes = str(arr.shape).encode('utf-8')
251
+ tensor_bytes = sample.astype(np.float32).tobytes()
252
+
253
+ combined = shape_bytes + tensor_bytes
254
+ return hashlib.sha256(combined).hexdigest()[:16]
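# Illustrative sketch only (not part of this commit): the hash is deterministic
# for the same values and shape, but shape participates in the hash, so a reshape
# changes it. Plain NumPy arrays are used; no deep-learning framework required.
import numpy as np
from cascade.core.provenance import hash_tensor

a = np.arange(12, dtype=np.float32)
b = a.reshape(3, 4)

assert hash_tensor(a) == hash_tensor(a.copy())   # same values, same shape
assert hash_tensor(a) != hash_tensor(b)          # same values, different shape
print(hash_tensor(a))                            # 16-char hex digest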
255
+
256
+
257
+ def hash_params(module) -> str:
258
+ """
259
+ Hash a module's parameters (weights, biases).
260
+
261
+ This creates a frozen reference to the model state at observation time.
262
+ If weights change, this hash changes.
263
+ """
264
+ param_hashes = []
265
+
266
+ for name, param in module.named_parameters(recurse=False):
267
+ if param is not None:
268
+ h = hash_tensor(param.data, sample_size=500)
269
+ param_hashes.append(f"{name}:{h}")
270
+
271
+ if not param_hashes:
272
+ return "no_params"
273
+
274
+ combined = "|".join(sorted(param_hashes))
275
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
276
+
277
+
278
+ def hash_model(model) -> str:
279
+ """
280
+ Hash entire model state.
281
+
282
+ This is the model's identity hash - changes if any weight changes.
283
+ """
284
+ all_hashes = []
285
+
286
+ for name, param in model.named_parameters():
287
+ h = hash_tensor(param.data, sample_size=100)
288
+ all_hashes.append(f"{name}:{h}")
289
+
290
+ combined = "|".join(all_hashes)
291
+ return hashlib.sha256(combined.encode()).hexdigest()[:32]
292
+
293
+
294
+ def hash_input(data: Any) -> str:
295
+ """
296
+ Hash input data (text, tokens, images, etc).
297
+ """
298
+ if isinstance(data, str):
299
+ return hashlib.sha256(data.encode('utf-8')).hexdigest()[:16]
300
+ elif hasattr(data, 'detach'):
301
+ return hash_tensor(data)
302
+ elif isinstance(data, dict):
303
+ # Tokenizer output
304
+ combined = json.dumps({k: str(v) for k, v in sorted(data.items())})
305
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
306
+ else:
307
+ return hashlib.sha256(str(data).encode()).hexdigest()[:16]
308
+
309
+
310
+ def compute_merkle_root(hashes: List[str]) -> str:
311
+ """
312
+ Compute Merkle root from list of hashes.
313
+
314
+ Standard Merkle tree construction - pairs hashes bottom-up
315
+ until single root remains.
316
+ """
317
+ if not hashes:
318
+ return hashlib.sha256(b"empty").hexdigest()[:16]
319
+
320
+ if len(hashes) == 1:
321
+ return hashes[0]
322
+
323
+ # Pad to even length
324
+ if len(hashes) % 2 == 1:
325
+ hashes = hashes + [hashes[-1]]
326
+
327
+ # Compute next level
328
+ next_level = []
329
+ for i in range(0, len(hashes), 2):
330
+ combined = hashes[i] + hashes[i + 1]
331
+ next_hash = hashlib.sha256(combined.encode()).hexdigest()[:16]
332
+ next_level.append(next_hash)
333
+
334
+ return compute_merkle_root(next_level)
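# Illustrative sketch only (not part of this commit): the root is stable for a
# given ordered list of layer hashes and changes if any single hash is altered.
# The input hashes here are arbitrary hex strings, not real layer states.
from cascade.core.provenance import compute_merkle_root

layer_hashes = ["aa11", "bb22", "cc33"]          # odd count -> last hash is duplicated
root = compute_merkle_root(layer_hashes)

assert root == compute_merkle_root(list(layer_hashes))        # deterministic
assert root != compute_merkle_root(["aa11", "bb22", "cc44"])  # tamper-evident
print(root)   # 16-char hex digest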
335
+
336
+
337
+ # =============================================================================
338
+ # PROVENANCE TRACKER (attaches to model)
339
+ # =============================================================================
340
+
341
+ class ProvenanceTracker:
342
+ """
343
+ Tracks provenance during model forward pass.
344
+
345
+ Usage:
346
+ tracker = ProvenanceTracker(model, model_id="gpt2")
347
+ tracker.start_session(input_text)
348
+
349
+ # Run forward pass - hooks capture everything
350
+ output = model(**inputs)
351
+
352
+ chain = tracker.finalize_session()
353
+ print(chain.merkle_root)
354
+
355
+ NEW: Now writes to tape file (JSONL) for redundant logging!
356
+ Complements the Live Tracer - both systems log independently.
357
+ """
358
+
359
+ def __init__(self, model, model_id: str, log_dir: str = "./logs"):
360
+ self.model = model
361
+ self.model_id = model_id
362
+ self.model_hash = hash_model(model)
363
+
364
+ self.hooks = []
365
+ self.current_chain: Optional[ProvenanceChain] = None
366
+ self.execution_counter = 0
367
+ self.last_hash = None # Track for parent linking
368
+ self.layer_hashes: Dict[str, str] = {} # layer_name -> hash
369
+
370
+ # === TAPE FILE FOR REDUNDANT LOGGING ===
371
+ from pathlib import Path
372
+ from threading import Lock
373
+ self._log_dir = Path(log_dir)
374
+ self._log_dir.mkdir(parents=True, exist_ok=True)
375
+ self._session_id = int(time.time())
376
+ self._tape_path = self._log_dir / f"provenance_tape_{self._session_id}.jsonl"
377
+ self._tape_file = None
378
+ self._tape_lock = Lock()
379
+ self._record_count = 0
380
+
381
+ def start_session(self, input_data: Any) -> str:
382
+ """Start a new provenance tracking session."""
383
+ import uuid
384
+
385
+ session_id = str(uuid.uuid4())[:8]
386
+ input_hash = hash_input(input_data)
387
+
388
+ self.current_chain = ProvenanceChain(
389
+ session_id=session_id,
390
+ model_id=self.model_id,
391
+ model_hash=self.model_hash,
392
+ input_hash=input_hash
393
+ )
394
+
395
+ self.execution_counter = 0
396
+ self.last_hash = input_hash
397
+ self.layer_hashes = {"input": input_hash}
398
+
399
+ # Register hooks
400
+ self._register_hooks()
401
+
402
+ return session_id
403
+
404
+ def _register_hooks(self):
405
+ """Register forward hooks on all modules."""
406
+ self._remove_hooks() # Clean up any existing
407
+
408
+ for name, module in self.model.named_modules():
409
+ if name: # Skip root
410
+ hook = module.register_forward_hook(
411
+ self._make_hook(name)
412
+ )
413
+ self.hooks.append(hook)
414
+
415
+ def _make_hook(self, layer_name: str):
416
+ """Create a forward hook for a specific layer."""
417
+ def hook(module, inp, out):
418
+ # Extract tensor
419
+ tensor = None
420
+ if hasattr(out, 'detach'):
421
+ tensor = out
422
+ elif isinstance(out, tuple) and len(out) > 0 and hasattr(out[0], 'detach'):
423
+ tensor = out[0]
424
+ elif hasattr(out, 'last_hidden_state'):
425
+ tensor = out.last_hidden_state
426
+ elif hasattr(out, 'logits'):
427
+ tensor = out.logits
428
+
429
+ if tensor is None or not hasattr(tensor, 'numel') or tensor.numel() == 0:
430
+ return
431
+
432
+ # Compute hashes
433
+ state_hash = hash_tensor(tensor)
434
+ params_hash = hash_params(module)
435
+
436
+ # Determine parent hashes
437
+ # For now, use last layer's hash. More sophisticated: track actual data flow.
438
+ parent_hashes = [self.last_hash] if self.last_hash else []
439
+
440
+ # Compute stats
441
+ t = tensor.float()
442
+ stats = {
443
+ "mean": t.mean().item(),
444
+ "std": t.std().item(),
445
+ "min": t.min().item(),
446
+ "max": t.max().item(),
447
+ "sparsity": (tensor == 0).float().mean().item(),
448
+ }
449
+
450
+ # Create record
451
+ record = ProvenanceRecord(
452
+ layer_name=layer_name,
453
+ layer_idx=self.execution_counter,
454
+ state_hash=state_hash,
455
+ parent_hashes=parent_hashes,
456
+ params_hash=params_hash,
457
+ shape=list(tensor.shape),
458
+ dtype=str(tensor.dtype),
459
+ stats=stats,
460
+ execution_order=self.execution_counter,
461
+ )
462
+
463
+ # Add to chain
464
+ if self.current_chain:
465
+ self.current_chain.add_record(record)
466
+
467
+ # === WRITE TO TAPE (REDUNDANT LOGGING) ===
468
+ self._write_to_tape(record)
469
+
470
+ # Update tracking
471
+ self.last_hash = state_hash
472
+ self.layer_hashes[layer_name] = state_hash
473
+ self.execution_counter += 1
474
+ self._record_count += 1
475
+
476
+ return hook
477
+
478
+ def _write_to_tape(self, record: ProvenanceRecord):
479
+ """Write provenance record to tape file for redundant logging."""
480
+ import json
481
+ try:
482
+ with self._tape_lock:
483
+ if self._tape_file is None:
484
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
485
+ print(f"[CASCADE] 📼 Provenance tape started: {self._tape_path}")
486
+
487
+ tape_record = {
488
+ "seq": self._record_count,
489
+ "record": record.to_dict(),
490
+ "session_id": self._session_id,
491
+ "model_id": self.model_id,
492
+ }
493
+ self._tape_file.write(json.dumps(tape_record, default=str) + "\n")
494
+ self._tape_file.flush()
495
+ except Exception:
496
+ pass # Don't let tape errors break the main flow
497
+
498
+ def close_tape(self):
499
+ """Close the tape file."""
500
+ with self._tape_lock:
501
+ if self._tape_file:
502
+ self._tape_file.close()
503
+ self._tape_file = None
504
+ print(f"[CASCADE] 📼 Provenance tape closed: {self._record_count} records → {self._tape_path}")
505
+
506
+ def get_tape_path(self):
507
+ """Get the current tape file path."""
508
+ return self._tape_path
509
+
510
+ def _remove_hooks(self):
511
+ """Remove all registered hooks."""
512
+ for hook in self.hooks:
513
+ hook.remove()
514
+ self.hooks = []
515
+
516
+ def finalize_session(self, output_data: Any = None) -> ProvenanceChain:
517
+ """Finalize session, compute Merkle root, return chain."""
518
+ self._remove_hooks()
519
+
520
+ if self.current_chain is None:
521
+ raise ValueError("No active session")
522
+
523
+ if output_data is not None:
524
+ self.current_chain.output_hash = hash_input(output_data)
525
+
526
+ self.current_chain.finalize()
527
+
528
+ # Close tape (session complete)
529
+ self.close_tape()
530
+
531
+ chain = self.current_chain
532
+ self.current_chain = None
533
+
534
+ return chain
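
A minimal sketch of replaying the tape file after a session, assuming a tape produced by ProvenanceTracker; the field names ("seq", "record", "state_hash") mirror what _write_to_tape emits, and the path below is illustrative.

import json

def replay_tape(tape_path: str):
    """Re-read a provenance tape and return (seq, layer_name, state_hash) tuples in order."""
    rows = []
    with open(tape_path, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)       # keys: seq, record, session_id, model_id
            rec = entry["record"]          # ProvenanceRecord.to_dict() payload
            rows.append((entry["seq"], rec["layer_name"], rec["state_hash"]))
    return rows

# Illustrative path from a tracker run:
# for seq, layer, h in replay_tape("./logs/provenance_tape_1700000000.jsonl"):
#     print(seq, layer, h[:12])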
535
+
536
+
537
+ # =============================================================================
538
+ # VERIFICATION & COMPARISON
539
+ # =============================================================================
540
+
541
+ def verify_chain(chain: ProvenanceChain) -> Tuple[bool, str]:
542
+ """Verify a provenance chain's integrity."""
543
+ return chain.verify()
544
+
545
+
546
+ def compare_chains(chain_a: ProvenanceChain, chain_b: ProvenanceChain) -> Dict[str, Any]:
547
+ """
548
+ Compare two provenance chains.
549
+
550
+ Useful for:
551
+ - Same model, different inputs (where did outputs diverge?)
552
+ - Different models, same input (structural comparison)
553
+ - Same everything (reproducibility check)
554
+ """
555
+ result = {
556
+ "model_match": chain_a.model_hash == chain_b.model_hash,
557
+ "input_match": chain_a.input_hash == chain_b.input_hash,
558
+ "output_match": chain_a.output_hash == chain_b.output_hash,
559
+ "merkle_match": chain_a.merkle_root == chain_b.merkle_root,
560
+ "divergence_points": [],
561
+ "a_only_layers": [],
562
+ "b_only_layers": [],
563
+ "matching_layers": [],
564
+ }
565
+
566
+ a_layers = set(chain_a.records.keys())
567
+ b_layers = set(chain_b.records.keys())
568
+
569
+ result["a_only_layers"] = list(a_layers - b_layers)
570
+ result["b_only_layers"] = list(b_layers - a_layers)
571
+
572
+ # Compare matching layers
573
+ for layer in a_layers & b_layers:
574
+ rec_a = chain_a.records[layer]
575
+ rec_b = chain_b.records[layer]
576
+
577
+ if rec_a.state_hash == rec_b.state_hash:
578
+ result["matching_layers"].append(layer)
579
+ else:
580
+ result["divergence_points"].append({
581
+ "layer": layer,
582
+ "hash_a": rec_a.state_hash,
583
+ "hash_b": rec_b.state_hash,
584
+ "stats_a": rec_a.stats,
585
+ "stats_b": rec_b.stats,
586
+ })
587
+
588
+ return result
589
+
590
+
591
+ def export_chain_for_audit(chain: ProvenanceChain, filepath: str) -> None:
592
+ """Export chain to file for external audit."""
593
+ with open(filepath, 'w') as f:
594
+ f.write(chain.to_json(indent=2))
595
+
596
+
597
+ def import_chain_for_audit(filepath: str) -> ProvenanceChain:
598
+ """Import chain from audit file."""
599
+ with open(filepath, 'r') as f:
600
+ data = json.load(f)
601
+ return ProvenanceChain.from_dict(data)
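
A short usage sketch of the verification helpers above; chain_a and chain_b are assumed to be finalized ProvenanceChain objects from two tracker sessions.

# Integrity check on a single chain.
ok, reason = verify_chain(chain_a)
print("chain_a valid:", ok, reason)

# Layer-by-layer comparison of two runs.
diff = compare_chains(chain_a, chain_b)
print("merkle roots match:", diff["merkle_match"])
for point in diff["divergence_points"]:
    print("diverged at", point["layer"], point["hash_a"][:12], "!=", point["hash_b"][:12])

# Round-trip through the audit file format.
export_chain_for_audit(chain_a, "chain_a_audit.json")
restored = import_chain_for_audit("chain_a_audit.json")
assert restored.merkle_root == chain_a.merkle_root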
cascade/core/web3_bridge.py ADDED
@@ -0,0 +1,846 @@
1
+ """
2
+ CASCADE // WEB3 BRIDGE
3
+ Blockchain integration for AI provenance.
4
+
5
+ The bridge between neural networks and decentralized infrastructure.
6
+
7
+ ┌─────────────────────────────────────────────────────────────────┐
8
+ │ THE IMMUTABLE RECORD │
9
+ │ │
10
+ │ AI Inference ──► Provenance Chain ──► Merkle Root ──► Chain │
11
+ │ │ │
12
+ │ ▼ │
13
+ │ ┌─────────────────────────────────┐ │
14
+ │ │ ETHEREUM / SOLANA / etc │ │
15
+ │ │ ┌───────────────────────────┐ │ │
16
+ │ │ │ Attestation Contract │ │ │
17
+ │ │ │ - Model hash │ │ │
18
+ │ │ │ - Input hash │ │ │
19
+ │ │ │ - Merkle root │ │ │
20
+ │ │ │ - Timestamp │ │ │
21
+ │ │ └───────────────────────────┘ │ │
22
+ │ └─────────────────────────────────┘ │
23
+ │ │ │
24
+ │ ▼ │
25
+ │ IPFS / Arweave / Filecoin │
26
+ │ (Full provenance chain storage) │
27
+ └─────────────────────────────────────────────────────────────────┘
28
+
29
+ Web3 provides:
30
+ - Timestamping (block finality)
31
+ - Immutability (blockchain consensus)
32
+ - Decentralized storage (IPFS)
33
+ - Public verifiability (anyone can audit)
34
+ - Economic incentives (staking, reputation)
35
+
36
+ This module provides:
37
+ - EIP-712 typed data signatures (Ethereum standard)
38
+ - IPFS CID computation (content addressing)
39
+ - Smart contract ABI for attestation
40
+ - Multi-chain attestation format
41
+ - NFT metadata for provenance tokens
42
+ """
43
+
44
+ import hashlib
45
+ import json
46
+ import time
47
+ import struct
48
+ from typing import Dict, List, Optional, Any, Tuple
49
+ from dataclasses import dataclass, field, asdict
50
+ import base64
51
+
52
+ try:
53
+ from .provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root
54
+ except ImportError:
55
+ from provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root
56
+
57
+
58
+ # =============================================================================
59
+ # CONSTANTS
60
+ # =============================================================================
61
+
62
+ # EIP-712 Domain for CASCADE attestations
63
+ CASCADE_DOMAIN = {
64
+ "name": "CASCADE Provenance",
65
+ "version": "1",
66
+ "chainId": 1, # Ethereum mainnet, override for other chains
67
+ "verifyingContract": "0x0000000000000000000000000000000000000000", # Set on deployment
68
+ }
69
+
70
+ # Attestation type definition for EIP-712
71
+ ATTESTATION_TYPES = {
72
+ "Attestation": [
73
+ {"name": "model_hash", "type": "bytes32"},
74
+ {"name": "input_hash", "type": "bytes32"},
75
+ {"name": "merkle_root", "type": "bytes32"},
76
+ {"name": "timestamp", "type": "uint256"},
77
+ {"name": "session_id", "type": "string"},
78
+ {"name": "layer_count", "type": "uint256"},
79
+ ]
80
+ }
81
+
82
+
83
+ # =============================================================================
84
+ # ATTESTATION RECORD
85
+ # =============================================================================
86
+
87
+ @dataclass
88
+ class Web3Attestation:
89
+ """
90
+ Blockchain-ready attestation of AI inference provenance.
91
+
92
+ This is the "receipt" that can be posted on-chain.
93
+ Minimal data for on-chain storage, full data on IPFS.
94
+ """
95
+
96
+ # Core identity
97
+ model_hash: str # 32-byte hash of model weights
98
+ input_hash: str # 32-byte hash of input data
99
+ output_hash: str # 32-byte hash of output
100
+ merkle_root: str # Merkle root of provenance chain
101
+
102
+ # Metadata
103
+ session_id: str # Unique session identifier
104
+ timestamp: int # Unix timestamp
105
+ layer_count: int # Number of layers in chain
106
+
107
+ # Content addressing
108
+ ipfs_cid: Optional[str] = None # IPFS CID for full chain
109
+ arweave_id: Optional[str] = None # Arweave transaction ID
110
+
111
+ # Signatures (set by wallet)
112
+ signature: Optional[str] = None # EIP-712 signature
113
+ signer: Optional[str] = None # Ethereum address
114
+
115
+ # Chain info
116
+ chain_id: int = 1 # 1=Ethereum, 137=Polygon, etc.
117
+ contract_address: Optional[str] = None
118
+ tx_hash: Optional[str] = None # Transaction hash after posting
119
+
120
+ def to_eip712_message(self, domain: Optional[Dict] = None) -> Dict[str, Any]:
121
+ """
122
+ Format as EIP-712 typed data for signing.
123
+
124
+ This is the standard Ethereum signing format that wallets understand.
125
+ """
126
+ domain = domain or CASCADE_DOMAIN
127
+
128
+ return {
129
+ "types": {
130
+ "EIP712Domain": [
131
+ {"name": "name", "type": "string"},
132
+ {"name": "version", "type": "string"},
133
+ {"name": "chainId", "type": "uint256"},
134
+ {"name": "verifyingContract", "type": "address"},
135
+ ],
136
+ **ATTESTATION_TYPES
137
+ },
138
+ "primaryType": "Attestation",
139
+ "domain": domain,
140
+ "message": {
141
+ "model_hash": self._to_bytes32(self.model_hash),
142
+ "input_hash": self._to_bytes32(self.input_hash),
143
+ "merkle_root": self._to_bytes32(self.merkle_root),
144
+ "timestamp": self.timestamp,
145
+ "session_id": self.session_id,
146
+ "layer_count": self.layer_count,
147
+ }
148
+ }
149
+
150
+ def _to_bytes32(self, hex_str: str) -> str:
151
+ """Pad hash to bytes32 format."""
152
+ # Remove 0x prefix if present
153
+ clean = hex_str.replace("0x", "")
154
+ # Pad to 64 chars (32 bytes)
155
+ padded = clean.zfill(64)
156
+ return "0x" + padded
157
+
158
+ def to_contract_args(self) -> Tuple:
159
+ """
160
+ Format for smart contract function call.
161
+
162
+ Returns tuple matching:
163
+ function attest(bytes32 modelHash, bytes32 inputHash, bytes32 merkleRoot,
164
+ string memory sessionId, uint256 layerCount)
165
+ """
166
+ return (
167
+ bytes.fromhex(self.model_hash.replace("0x", "").zfill(64)),
168
+ bytes.fromhex(self.input_hash.replace("0x", "").zfill(64)),
169
+ bytes.fromhex(self.merkle_root.replace("0x", "").zfill(64)),
170
+ self.session_id,
171
+ self.layer_count,
172
+ )
173
+
174
+ def to_dict(self) -> Dict[str, Any]:
175
+ """Serialize for storage/transmission."""
176
+ return asdict(self)
177
+
178
+ def to_json(self) -> str:
179
+ """JSON export."""
180
+ return json.dumps(self.to_dict(), indent=2)
181
+
182
+ @classmethod
183
+ def from_chain(cls, chain: ProvenanceChain) -> 'Web3Attestation':
184
+ """Create attestation from provenance chain."""
185
+ if not chain.finalized:
186
+ chain.finalize()
187
+
188
+ return cls(
189
+ model_hash=chain.model_hash,
190
+ input_hash=chain.input_hash,
191
+ output_hash=chain.output_hash or "0" * 16,
192
+ merkle_root=chain.merkle_root,
193
+ session_id=chain.session_id,
194
+ timestamp=int(chain.created_at),
195
+ layer_count=len(chain.records),
196
+ )
197
+
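
A sketch of turning a finalized chain into a signing-ready payload; `chain` is an assumed finalized ProvenanceChain, and the chainId override (137 for Polygon) follows the CHAIN_CONFIGS table later in this module.

attestation = Web3Attestation.from_chain(chain)

# EIP-712 payload suitable for eth_signTypedData_v4.
typed_data = attestation.to_eip712_message({**CASCADE_DOMAIN, "chainId": 137})
print(typed_data["primaryType"])             # "Attestation"
print(typed_data["message"]["merkle_root"])  # 0x-prefixed bytes32

# Positional arguments for the on-chain attest(...) call.
args = attestation.to_contract_args()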
198
+
199
+ # =============================================================================
200
+ # IPFS CONTENT ADDRESSING
201
+ # =============================================================================
202
+
203
+ def compute_ipfs_cid_v0(data: bytes) -> str:
204
+ """
205
+ Compute IPFS CID v0 (Qm...) for data.
206
+
207
+ This is a simplified computation; a real IPFS node applies UnixFS
+ wrapping and chunking, so the CID here may not exactly match the CID
+ reported by an IPFS node. Intended for content-addressing JSON chain data.
209
+
210
+ CIDv0 format: Base58(0x12 || 0x20 || SHA256(data))
211
+ """
212
+ # SHA-256 hash
213
+ sha_hash = hashlib.sha256(data).digest()
214
+
215
+ # Multihash prefix: 0x12 (sha2-256), 0x20 (32 bytes)
216
+ multihash = bytes([0x12, 0x20]) + sha_hash
217
+
218
+ # Base58 encode (Bitcoin alphabet)
219
+ return base58_encode(multihash)
220
+
221
+
222
+ def compute_ipfs_cid_v1(data: bytes) -> str:
223
+ """
224
+ Compute IPFS CID v1 (bafkrei..., raw codec) for data.
225
+
226
+ CIDv1 format: multibase || version || codec || multihash
227
+ """
228
+ # SHA-256 hash
229
+ sha_hash = hashlib.sha256(data).digest()
230
+
231
+ # Build CIDv1:
232
+ # 0x01 = CID version 1
233
+ # 0x55 = raw binary codec (could also use 0x71 for dag-cbor)
234
+ # 0x12 = sha2-256
235
+ # 0x20 = 32 bytes
236
+ cid_bytes = bytes([0x01, 0x55, 0x12, 0x20]) + sha_hash
237
+
238
+ # Base32 lower with 'b' prefix (multibase)
239
+ import base64
240
+ b32 = base64.b32encode(cid_bytes).decode('ascii').lower().rstrip('=')
241
+ return 'b' + b32
242
+
243
+
244
+ def base58_encode(data: bytes) -> str:
245
+ """Base58 encoding (Bitcoin alphabet)."""
246
+ ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
247
+
248
+ # Count leading zeros
249
+ leading_zeros = 0
250
+ for byte in data:
251
+ if byte == 0:
252
+ leading_zeros += 1
253
+ else:
254
+ break
255
+
256
+ # Convert to integer
257
+ num = int.from_bytes(data, 'big')
258
+
259
+ # Convert to base58
260
+ result = ""
261
+ while num > 0:
262
+ num, remainder = divmod(num, 58)
263
+ result = ALPHABET[remainder] + result
264
+
265
+ # Add leading '1's for each leading zero byte
266
+ return '1' * leading_zeros + result
267
+
268
+
269
+ def chain_to_ipfs_ready(chain: ProvenanceChain) -> Tuple[bytes, str]:
270
+ """
271
+ Prepare provenance chain for IPFS upload.
272
+
273
+ Returns:
274
+ (data_bytes, cid) - The data to upload and its expected CID
275
+ """
276
+ json_data = chain.to_json().encode('utf-8')
277
+ cid = compute_ipfs_cid_v0(json_data)
278
+ return json_data, cid
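
A small check of the CID helpers on arbitrary example bytes; as noted above, these simplified CIDs may not match what an IPFS node reports for the same content.

payload = b'{"hello": "cascade"}'

cid_v0 = compute_ipfs_cid_v0(payload)   # "Qm..."      (base58btc multihash)
cid_v1 = compute_ipfs_cid_v1(payload)   # "bafkrei..." (base32, raw codec)
print(cid_v0)
print(cid_v1)

# For a finalized chain (assumed variable `chain`):
# data, cid = chain_to_ipfs_ready(chain)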
279
+
280
+
281
+ # =============================================================================
282
+ # SMART CONTRACT ABI
283
+ # =============================================================================
284
+
285
+ CASCADE_ATTESTATION_ABI = [
286
+ {
287
+ "name": "Attest",
288
+ "type": "event",
289
+ "inputs": [
290
+ {"name": "attester", "type": "address", "indexed": True},
291
+ {"name": "modelHash", "type": "bytes32", "indexed": True},
292
+ {"name": "merkleRoot", "type": "bytes32", "indexed": False},
293
+ {"name": "sessionId", "type": "string", "indexed": False},
294
+ {"name": "timestamp", "type": "uint256", "indexed": False},
295
+ ]
296
+ },
297
+ {
298
+ "name": "attest",
299
+ "type": "function",
300
+ "stateMutability": "nonpayable",
301
+ "inputs": [
302
+ {"name": "modelHash", "type": "bytes32"},
303
+ {"name": "inputHash", "type": "bytes32"},
304
+ {"name": "merkleRoot", "type": "bytes32"},
305
+ {"name": "sessionId", "type": "string"},
306
+ {"name": "layerCount", "type": "uint256"},
307
+ ],
308
+ "outputs": [{"name": "attestationId", "type": "uint256"}]
309
+ },
310
+ {
311
+ "name": "verify",
312
+ "type": "function",
313
+ "stateMutability": "view",
314
+ "inputs": [
315
+ {"name": "attestationId", "type": "uint256"},
316
+ ],
317
+ "outputs": [
318
+ {"name": "valid", "type": "bool"},
319
+ {"name": "attester", "type": "address"},
320
+ {"name": "modelHash", "type": "bytes32"},
321
+ {"name": "merkleRoot", "type": "bytes32"},
322
+ ]
323
+ },
324
+ {
325
+ "name": "getAttestation",
326
+ "type": "function",
327
+ "stateMutability": "view",
328
+ "inputs": [
329
+ {"name": "attestationId", "type": "uint256"},
330
+ ],
331
+ "outputs": [
332
+ {"name": "attester", "type": "address"},
333
+ {"name": "modelHash", "type": "bytes32"},
334
+ {"name": "inputHash", "type": "bytes32"},
335
+ {"name": "merkleRoot", "type": "bytes32"},
336
+ {"name": "sessionId", "type": "string"},
337
+ {"name": "layerCount", "type": "uint256"},
338
+ {"name": "timestamp", "type": "uint256"},
339
+ ]
340
+ },
341
+ {
342
+ "name": "attestationsByModel",
343
+ "type": "function",
344
+ "stateMutability": "view",
345
+ "inputs": [
346
+ {"name": "modelHash", "type": "bytes32"},
347
+ ],
348
+ "outputs": [
349
+ {"name": "attestationIds", "type": "uint256[]"},
350
+ ]
351
+ },
352
+ ]
353
+
354
+
355
+ # Solidity source for the attestation contract
356
+ CASCADE_ATTESTATION_SOLIDITY = '''
357
+ // SPDX-License-Identifier: MIT
358
+ pragma solidity ^0.8.19;
359
+
360
+ /**
361
+ * @title CascadeAttestation
362
+ * @notice On-chain attestation of AI inference provenance
363
+ * @dev Stores Merkle roots for off-chain provenance chains
364
+ */
365
+ contract CascadeAttestation {
366
+
367
+ struct Attestation {
368
+ address attester;
369
+ bytes32 modelHash;
370
+ bytes32 inputHash;
371
+ bytes32 merkleRoot;
372
+ string sessionId;
373
+ uint256 layerCount;
374
+ uint256 timestamp;
375
+ string ipfsCid; // Optional: full chain on IPFS
376
+ }
377
+
378
+ // Attestation storage
379
+ mapping(uint256 => Attestation) public attestations;
380
+ uint256 public attestationCount;
381
+
382
+ // Index by model
383
+ mapping(bytes32 => uint256[]) public attestationsByModel;
384
+
385
+ // Index by attester
386
+ mapping(address => uint256[]) public attestationsByAttester;
387
+
388
+ // Events
389
+ event Attested(
390
+ uint256 indexed attestationId,
391
+ address indexed attester,
392
+ bytes32 indexed modelHash,
393
+ bytes32 merkleRoot,
394
+ string sessionId
395
+ );
396
+
397
+ /**
398
+ * @notice Create a new attestation
399
+ * @param modelHash Hash of the model weights
400
+ * @param inputHash Hash of the input data
401
+ * @param merkleRoot Merkle root of the provenance chain
402
+ * @param sessionId Unique session identifier
403
+ * @param layerCount Number of layers in the chain
404
+ * @return attestationId The ID of the new attestation
405
+ */
406
+ function attest(
407
+ bytes32 modelHash,
408
+ bytes32 inputHash,
409
+ bytes32 merkleRoot,
410
+ string memory sessionId,
411
+ uint256 layerCount
412
+ ) public returns (uint256 attestationId) {
413
+ attestationId = attestationCount++;
414
+
415
+ attestations[attestationId] = Attestation({
416
+ attester: msg.sender,
417
+ modelHash: modelHash,
418
+ inputHash: inputHash,
419
+ merkleRoot: merkleRoot,
420
+ sessionId: sessionId,
421
+ layerCount: layerCount,
422
+ timestamp: block.timestamp,
423
+ ipfsCid: ""
424
+ });
425
+
426
+ attestationsByModel[modelHash].push(attestationId);
427
+ attestationsByAttester[msg.sender].push(attestationId);
428
+
429
+ emit Attested(attestationId, msg.sender, modelHash, merkleRoot, sessionId);
430
+
431
+ return attestationId;
432
+ }
433
+
434
+ /**
435
+ * @notice Attest with IPFS CID for full chain data
436
+ */
437
+ function attestWithIPFS(
438
+ bytes32 modelHash,
439
+ bytes32 inputHash,
440
+ bytes32 merkleRoot,
441
+ string memory sessionId,
442
+ uint256 layerCount,
443
+ string memory ipfsCid
444
+ ) external returns (uint256 attestationId) {
445
+ // Direct internal call (not this.attest) so msg.sender stays the original attester.
+ attestationId = attest(modelHash, inputHash, merkleRoot, sessionId, layerCount);
446
+ attestations[attestationId].ipfsCid = ipfsCid;
447
+ return attestationId;
448
+ }
449
+
450
+ /**
451
+ * @notice Verify an attestation exists and return core data
452
+ */
453
+ function verify(uint256 attestationId) external view returns (
454
+ bool valid,
455
+ address attester,
456
+ bytes32 modelHash,
457
+ bytes32 merkleRoot
458
+ ) {
459
+ if (attestationId >= attestationCount) {
460
+ return (false, address(0), bytes32(0), bytes32(0));
461
+ }
462
+
463
+ Attestation storage a = attestations[attestationId];
464
+ return (true, a.attester, a.modelHash, a.merkleRoot);
465
+ }
466
+
467
+ /**
468
+ * @notice Get all attestations for a model
469
+ */
470
+ function getModelAttestations(bytes32 modelHash) external view returns (uint256[] memory) {
471
+ return attestationsByModel[modelHash];
472
+ }
473
+
474
+ /**
475
+ * @notice Get all attestations by an address
476
+ */
477
+ function getAttesterAttestations(address attester) external view returns (uint256[] memory) {
478
+ return attestationsByAttester[attester];
479
+ }
480
+ }
481
+ '''
482
+
483
+
484
+ # =============================================================================
485
+ # NFT METADATA (for provenance tokens)
486
+ # =============================================================================
487
+
488
+ def generate_nft_metadata(chain: ProvenanceChain,
489
+ image_url: Optional[str] = None,
490
+ animation_url: Optional[str] = None) -> Dict[str, Any]:
491
+ """
492
+ Generate ERC-721 compatible metadata for a provenance NFT.
493
+
494
+ Each unique model×input×output combination could be an NFT,
495
+ proving that this specific inference happened.
496
+ """
497
+ if not chain.finalized:
498
+ chain.finalize()
499
+
500
+ # Generate attributes from chain
501
+ attributes = [
502
+ {"trait_type": "Model Hash", "value": chain.model_hash[:16]},
503
+ {"trait_type": "Input Hash", "value": chain.input_hash},
504
+ {"trait_type": "Merkle Root", "value": chain.merkle_root},
505
+ {"trait_type": "Layer Count", "value": len(chain.records)},
506
+ {"trait_type": "Timestamp", "value": int(chain.created_at)},
507
+ ]
508
+
509
+ # Add layer statistics as traits
510
+ if chain.records:
511
+ total_params = 0
512
+ layer_types = set()
513
+ for record in chain.records.values():
514
+ if record.params_hash != "no_params":
515
+ total_params += 1
516
+ # Extract layer type from name
517
+ parts = record.layer_name.split('.')
518
+ if len(parts) >= 2:
519
+ layer_types.add(parts[-1])
520
+
521
+ attributes.append({"trait_type": "Parameterized Layers", "value": total_params})
522
+ for lt in list(layer_types)[:5]: # Max 5 layer types
523
+ attributes.append({"trait_type": f"Has {lt}", "value": "Yes"})
524
+
525
+ return {
526
+ "name": f"CASCADE Provenance #{chain.session_id}",
527
+ "description": f"Cryptographic proof of AI inference. Model: {chain.model_id}. "
528
+ f"This NFT attests that a specific input was processed through "
529
+ f"the model, producing a verifiable Merkle root of all layer activations.",
530
+ "image": image_url or "ipfs://QmDefaultCascadeImage", # Placeholder
531
+ "animation_url": animation_url, # Could link to 3D visualization
532
+ "external_url": f"https://cascade.ai/verify/{chain.session_id}",
533
+ "attributes": attributes,
534
+ "properties": {
535
+ "model_id": chain.model_id,
536
+ "model_hash": chain.model_hash,
537
+ "input_hash": chain.input_hash,
538
+ "output_hash": chain.output_hash,
539
+ "merkle_root": chain.merkle_root,
540
+ "session_id": chain.session_id,
541
+ "layer_count": len(chain.records),
542
+ "created_at": chain.created_at,
543
+ }
544
+ }
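
A sketch of writing the ERC-721 metadata to disk for a finalized chain; the image CID and output filename are illustrative placeholders.

import json

metadata = generate_nft_metadata(chain, image_url="ipfs://<image-cid>")  # placeholder image CID
with open(f"provenance_{chain.session_id}.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)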
545
+
546
+
547
+ # =============================================================================
548
+ # MULTI-CHAIN SUPPORT
549
+ # =============================================================================
550
+
551
+ CHAIN_CONFIGS = {
552
+ "ethereum": {
553
+ "chain_id": 1,
554
+ "name": "Ethereum Mainnet",
555
+ "explorer": "https://etherscan.io",
556
+ "native_token": "ETH",
557
+ },
558
+ "polygon": {
559
+ "chain_id": 137,
560
+ "name": "Polygon",
561
+ "explorer": "https://polygonscan.com",
562
+ "native_token": "MATIC",
563
+ },
564
+ "arbitrum": {
565
+ "chain_id": 42161,
566
+ "name": "Arbitrum One",
567
+ "explorer": "https://arbiscan.io",
568
+ "native_token": "ETH",
569
+ },
570
+ "optimism": {
571
+ "chain_id": 10,
572
+ "name": "Optimism",
573
+ "explorer": "https://optimistic.etherscan.io",
574
+ "native_token": "ETH",
575
+ },
576
+ "base": {
577
+ "chain_id": 8453,
578
+ "name": "Base",
579
+ "explorer": "https://basescan.org",
580
+ "native_token": "ETH",
581
+ },
582
+ "solana": {
583
+ "chain_id": -1, # Not EVM
584
+ "name": "Solana",
585
+ "explorer": "https://solscan.io",
586
+ "native_token": "SOL",
587
+ },
588
+ }
589
+
590
+
591
+ def get_chain_config(chain_name: str) -> Dict[str, Any]:
592
+ """Get configuration for a specific blockchain."""
593
+ return CHAIN_CONFIGS.get(chain_name.lower(), CHAIN_CONFIGS["ethereum"])
594
+
595
+
596
+ # =============================================================================
597
+ # WEB3 EXPORT UTILITIES
598
+ # =============================================================================
599
+
600
+ def export_for_web3(chain: ProvenanceChain,
601
+ chain_name: str = "ethereum",
602
+ include_full_chain: bool = True) -> Dict[str, Any]:
603
+ """
604
+ Export provenance chain in Web3-ready format.
605
+
606
+ Returns everything needed to post attestation on-chain.
607
+ """
608
+ attestation = Web3Attestation.from_chain(chain)
609
+ chain_config = get_chain_config(chain_name)
610
+
611
+ result = {
612
+ "attestation": attestation.to_dict(),
613
+ "eip712": attestation.to_eip712_message({
614
+ **CASCADE_DOMAIN,
615
+ "chainId": chain_config["chain_id"]
616
+ }),
617
+ "contract_abi": CASCADE_ATTESTATION_ABI,
618
+ "chain_config": chain_config,
619
+ }
620
+
621
+ if include_full_chain:
622
+ data, cid = chain_to_ipfs_ready(chain)
623
+ result["ipfs"] = {
624
+ "data": base64.b64encode(data).decode('ascii'),
625
+ "cid": cid,
626
+ "size_bytes": len(data),
627
+ }
628
+
629
+ return result
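
Putting the export together for an assumed finalized chain; the keys printed below are exactly the ones assembled above.

bundle = export_for_web3(chain, chain_name="polygon")

print(bundle["chain_config"]["name"])           # "Polygon"
print(bundle["attestation"]["merkle_root"])
print(bundle["ipfs"]["cid"], bundle["ipfs"]["size_bytes"])

# bundle["eip712"]       -> payload for eth_signTypedData_v4
# bundle["contract_abi"] -> ABI for building the attest() transaction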
630
+
631
+
632
+ def generate_verification_page(attestation: Web3Attestation,
633
+ chain: Optional[ProvenanceChain] = None) -> str:
634
+ """
635
+ Generate an HTML verification page for an attestation.
636
+
637
+ This can be hosted anywhere and allows public verification.
638
+ """
639
+ records_html = ""
640
+ if chain:
641
+ for record in chain.records.values():
642
+ records_html += f"""
643
+ <tr>
644
+ <td>{record.layer_name}</td>
645
+ <td><code>{record.state_hash}</code></td>
646
+ <td>{record.shape}</td>
647
+ <td>{record.stats.get('mean', 0):.4f}</td>
648
+ </tr>
649
+ """
650
+
651
+ return f"""<!DOCTYPE html>
652
+ <html>
653
+ <head>
654
+ <title>CASCADE Provenance Verification</title>
655
+ <meta charset="utf-8">
656
+ <style>
657
+ body {{ font-family: 'Courier New', monospace; background: #0a0a0a; color: #00ff88; padding: 40px; }}
658
+ .container {{ max-width: 900px; margin: 0 auto; }}
659
+ h1 {{ color: #00ffcc; border-bottom: 2px solid #00ff88; padding-bottom: 10px; }}
660
+ .hash {{ font-family: monospace; background: #1a1a2e; padding: 10px; border-radius: 4px; word-break: break-all; }}
661
+ .verified {{ color: #00ff88; }}
662
+ .label {{ color: #888; font-size: 0.9em; }}
663
+ table {{ width: 100%; border-collapse: collapse; margin-top: 20px; }}
664
+ th, td {{ padding: 8px; border: 1px solid #333; text-align: left; }}
665
+ th {{ background: #1a1a2e; }}
666
+ code {{ background: #1a1a2e; padding: 2px 6px; border-radius: 3px; }}
667
+ .merkle {{ font-size: 1.5em; color: #ffcc00; text-align: center; padding: 20px; background: #1a1a2e; border-radius: 8px; margin: 20px 0; }}
668
+ </style>
669
+ </head>
670
+ <body>
671
+ <div class="container">
672
+ <h1>🔗 CASCADE Provenance Verification</h1>
673
+
674
+ <div class="merkle">
675
+ Merkle Root: <code>{attestation.merkle_root}</code>
676
+ </div>
677
+
678
+ <h2>Attestation Details</h2>
679
+ <p class="label">Session ID</p>
680
+ <div class="hash">{attestation.session_id}</div>
681
+
682
+ <p class="label">Model Hash</p>
683
+ <div class="hash">{attestation.model_hash}</div>
684
+
685
+ <p class="label">Input Hash</p>
686
+ <div class="hash">{attestation.input_hash}</div>
687
+
688
+ <p class="label">Output Hash</p>
689
+ <div class="hash">{attestation.output_hash}</div>
690
+
691
+ <p class="label">Timestamp</p>
692
+ <div class="hash">{attestation.timestamp} ({time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(attestation.timestamp))})</div>
693
+
694
+ <p class="label">Layer Count</p>
695
+ <div class="hash">{attestation.layer_count} layers</div>
696
+
697
+ {"<h2>Provenance Chain</h2><table><tr><th>Layer</th><th>State Hash</th><th>Shape</th><th>Mean</th></tr>" + records_html + "</table>" if chain else ""}
698
+
699
+ <h2>On-Chain Verification</h2>
700
+ <p>{"<span class='verified'>✓ Verified on " + get_chain_config('ethereum')['name'] + "</span>" if attestation.tx_hash else "⏳ Pending on-chain attestation"}</p>
701
+ {f"<p class='label'>Transaction</p><div class='hash'><a href='{get_chain_config('ethereum')['explorer']}/tx/{attestation.tx_hash}' style='color: #00ff88;'>{attestation.tx_hash}</a></div>" if attestation.tx_hash else ""}
702
+
703
+ <h2>IPFS Storage</h2>
704
+ <p>{f"<a href='https://ipfs.io/ipfs/{attestation.ipfs_cid}' style='color: #00ff88;'>{attestation.ipfs_cid}</a>" if attestation.ipfs_cid else "Full chain not yet pinned to IPFS"}</p>
705
+
706
+ <hr style="border-color: #333; margin: 40px 0;">
707
+ <p style="color: #666; text-align: center;">CASCADE Provenance Engine • Due process infrastructure for AI</p>
708
+ </div>
709
+ </body>
710
+ </html>
711
+ """
712
+
713
+
714
+ # =============================================================================
715
+ # SIGNATURE UTILITIES (for wallet integration)
716
+ # =============================================================================
717
+
718
+ def prepare_for_signing(attestation: Web3Attestation,
719
+ chain_name: str = "ethereum") -> Dict[str, Any]:
720
+ """
721
+ Prepare attestation for wallet signing (MetaMask, etc).
722
+
723
+ Returns the EIP-712 message that wallets can sign.
724
+ """
725
+ chain_config = get_chain_config(chain_name)
726
+
727
+ eip712 = attestation.to_eip712_message({
728
+ **CASCADE_DOMAIN,
729
+ "chainId": chain_config["chain_id"]
730
+ })
731
+
732
+ return {
733
+ "method": "eth_signTypedData_v4",
734
+ "params": [
735
+ None, # Address filled by wallet
736
+ json.dumps(eip712)
737
+ ],
738
+ "display": {
739
+ "title": "Sign CASCADE Attestation",
740
+ "description": f"Attest that model {attestation.model_hash[:16]}... "
741
+ f"processed input {attestation.input_hash[:16]}...",
742
+ "merkle_root": attestation.merkle_root,
743
+ }
744
+ }
745
+
746
+
747
+ def verify_signature(attestation: Web3Attestation,
748
+ signature: str,
749
+ expected_signer: str) -> Tuple[bool, str]:
750
+ """
751
+ Verify an EIP-712 signature.
752
+
753
+ Note: Full verification requires eth_utils/web3.py.
754
+ This is a structural check only.
755
+ """
756
+ if not signature or len(signature) < 130:
757
+ return False, "Invalid signature length"
758
+
759
+ if not signature.startswith("0x"):
760
+ return False, "Signature must start with 0x"
761
+
762
+ # Extract r, s, v components
763
+ try:
764
+ sig_bytes = bytes.fromhex(signature[2:])
765
+ if len(sig_bytes) != 65:
766
+ return False, f"Signature must be 65 bytes, got {len(sig_bytes)}"
767
+
768
+ r = sig_bytes[:32]
769
+ s = sig_bytes[32:64]
770
+ v = sig_bytes[64]
771
+
772
+ # v should be 27 or 28 (or 0/1 for some implementations)
773
+ if v not in [0, 1, 27, 28]:
774
+ return False, f"Invalid v value: {v}"
775
+
776
+ # Structural validation passed
777
+ # Full cryptographic verification requires ecrecover
778
+ return True, "Signature structure valid (full verification requires web3.py)"
779
+
780
+ except Exception as e:
781
+ return False, f"Signature parsing error: {str(e)}"
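
An illustration of the structural check only: the signature below is a dummy 65-byte value, not a real ECDSA signature, and `attestation` is assumed from earlier.

dummy_sig = "0x" + "11" * 64 + "1b"   # 32-byte r + 32-byte s + v=27, structurally valid only
ok, reason = verify_signature(
    attestation,
    dummy_sig,
    expected_signer="0x0000000000000000000000000000000000000000",
)
print(ok, reason)   # True, "Signature structure valid (full verification requires web3.py)"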
782
+
783
+
784
+ # =============================================================================
785
+ # CONVENIENCE FUNCTIONS
786
+ # =============================================================================
787
+
788
+ def attest_inference(chain: ProvenanceChain,
789
+ chain_name: str = "ethereum") -> Web3Attestation:
790
+ """
791
+ One-liner to create attestation from provenance chain.
792
+
793
+ Usage:
794
+ attestation = attest_inference(chain)
795
+ print(attestation.merkle_root)
796
+ """
797
+ if not chain.finalized:
798
+ chain.finalize()
799
+
800
+ attestation = Web3Attestation.from_chain(chain)
801
+
802
+ # Compute IPFS CID
803
+ data, cid = chain_to_ipfs_ready(chain)
804
+ attestation.ipfs_cid = cid
805
+
806
+ # Set chain
807
+ attestation.chain_id = get_chain_config(chain_name)["chain_id"]
808
+
809
+ return attestation
810
+
811
+
812
+ def quick_verify(merkle_root: str, layer_hashes: List[str]) -> bool:
813
+ """
814
+ Quick verification that layer hashes produce expected Merkle root.
815
+ """
816
+ computed = compute_merkle_root(layer_hashes)
817
+ return computed == merkle_root
818
+
819
+
820
+ # =============================================================================
821
+ # COMMAND LINE INTERFACE
822
+ # =============================================================================
823
+
824
+ if __name__ == "__main__":
825
+ import sys
826
+
827
+ print("CASCADE // WEB3 BRIDGE")
828
+ print("=" * 50)
829
+ print()
830
+ print("Smart Contract (Solidity):")
831
+ print("-" * 50)
832
+ print(CASCADE_ATTESTATION_SOLIDITY[:500] + "...")
833
+ print()
834
+ print("Contract ABI:")
835
+ print("-" * 50)
836
+ print(json.dumps(CASCADE_ATTESTATION_ABI, indent=2)[:500] + "...")
837
+ print()
838
+ print("Supported Chains:")
839
+ print("-" * 50)
840
+ for name, config in CHAIN_CONFIGS.items():
841
+ print(f" {name}: Chain ID {config['chain_id']}")
842
+ print()
843
+ print("Usage:")
844
+ print(" from cascade.core.web3_bridge import attest_inference, export_for_web3")
845
+ print(" attestation = attest_inference(provenance_chain)")
846
+ print(" web3_data = export_for_web3(provenance_chain, 'polygon')")
cascade/data/__init__.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ CASCADE Data Observatory
3
+
4
+ Dataset observation with the same rigor as model observation.
5
+ Tracks provenance, schema, lineage using W3C PROV-O standard.
6
+ """
7
+
8
+ from .entities import (
9
+ DatasetEntity,
10
+ Activity,
11
+ Agent,
12
+ Relationship,
13
+ RelationType,
14
+ ActivityType,
15
+ AgentType,
16
+ create_system_agent,
17
+ create_model_agent,
18
+ create_user_agent,
19
+ )
20
+ from .observer import DatasetObserver, ObservationContext
21
+ from .provenance import ProvenanceGraph
22
+ from .schema import SchemaObserver, DatasetSchema, FieldSchema, hash_content
23
+ from .croissant import CroissantExporter, export_to_croissant
24
+ from .hub import HubIntegration, AccountabilityBundle, push_to_hub, pull_from_hub
25
+ from .license import (
26
+ SPDXLicense,
27
+ LicenseCategory,
28
+ LicenseRestriction,
29
+ LicenseCompatibility,
30
+ LicenseAnalyzer,
31
+ SPDX_LICENSES,
32
+ get_license,
33
+ check_license_compatibility,
34
+ get_derived_license,
35
+ )
36
+ from .pii import (
37
+ PIIType,
38
+ PIISeverity,
39
+ PIIMatch,
40
+ PIIScanResult,
41
+ PIIScanner,
42
+ scan_for_pii,
43
+ quick_pii_check,
44
+ )
45
+ from .live import (
46
+ LiveDocumentTracer,
47
+ TraceEvent,
48
+ TraceEventType,
49
+ DocumentSpan,
50
+ DocumentAssociation,
51
+ ConsoleTraceRenderer,
52
+ create_live_tracer,
53
+ )
54
+
55
+ __all__ = [
56
+ # Entities (PROV-O)
57
+ "DatasetEntity",
58
+ "Activity",
59
+ "Agent",
60
+ "Relationship",
61
+ "RelationType",
62
+ "ActivityType",
63
+ "AgentType",
64
+ "create_system_agent",
65
+ "create_model_agent",
66
+ "create_user_agent",
67
+ # Observer
68
+ "DatasetObserver",
69
+ "ObservationContext",
70
+ # Provenance
71
+ "ProvenanceGraph",
72
+ # Schema
73
+ "SchemaObserver",
74
+ "DatasetSchema",
75
+ "FieldSchema",
76
+ "hash_content",
77
+ # Export
78
+ "CroissantExporter",
79
+ "export_to_croissant",
80
+ # Accountability
81
+ "AccountabilityBundle",
82
+ # Hub
83
+ "HubIntegration",
84
+ "push_to_hub",
85
+ "pull_from_hub",
86
+ # License
87
+ "SPDXLicense",
88
+ "LicenseCategory",
89
+ "LicenseRestriction",
90
+ "LicenseCompatibility",
91
+ "LicenseAnalyzer",
92
+ "SPDX_LICENSES",
93
+ "get_license",
94
+ "check_license_compatibility",
95
+ "get_derived_license",
96
+ # PII Detection
97
+ "PIIType",
98
+ "PIISeverity",
99
+ "PIIMatch",
100
+ "PIIScanResult",
101
+ "PIIScanner",
102
+ "scan_for_pii",
103
+ "quick_pii_check",
104
+ # Live Document Tracing
105
+ "LiveDocumentTracer",
106
+ "TraceEvent",
107
+ "TraceEventType",
108
+ "DocumentSpan",
109
+ "DocumentAssociation",
110
+ "ConsoleTraceRenderer",
111
+ "create_live_tracer",
112
+ ]
cascade/data/croissant.py ADDED
@@ -0,0 +1,289 @@
1
+ """
2
+ Croissant Exporter
3
+
4
+ Exports provenance graph to MLCommons Croissant format.
5
+ Croissant is the emerging standard for ML dataset metadata.
6
+
7
+ Reference: https://github.com/mlcommons/croissant
8
+ """
9
+
10
+ import json
11
+ import time
12
+ from datetime import datetime
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from .entities import DatasetEntity, Activity, Agent
16
+ from .provenance import ProvenanceGraph
17
+
18
+
19
+ class CroissantExporter:
20
+ """
21
+ Export provenance to Croissant JSON-LD format.
22
+
23
+ Croissant layers:
24
+ 1. Metadata - description, license, citation
25
+ 2. Resources - file descriptions
26
+ 3. Structure - record sets and fields
27
+ 4. ML Semantics - task types, splits
28
+
29
+ We add provenance as an extension.
30
+ """
31
+
32
+ CROISSANT_VERSION = "1.0"
33
+ CROISSANT_CONTEXT = "http://mlcommons.org/croissant/1.0"
34
+
35
+ def __init__(self, graph: ProvenanceGraph):
36
+ self.graph = graph
37
+
38
+ def export(
39
+ self,
40
+ name: str = None,
41
+ description: str = None,
42
+ license_url: str = None,
43
+ citation: str = None,
44
+ url: str = None,
45
+ include_provenance: bool = True,
46
+ ) -> Dict[str, Any]:
47
+ """
48
+ Export to Croissant JSON-LD.
49
+
50
+ Args:
51
+ name: Dataset name (defaults to graph name)
52
+ description: Dataset description
53
+ license_url: License URL
54
+ citation: Citation text
55
+ url: Dataset URL
56
+ include_provenance: Whether to include CASCADE provenance extension
57
+
58
+ Returns:
59
+ Croissant JSON-LD document
60
+ """
61
+ name = name or self.graph.name
62
+
63
+ doc = {
64
+ "@context": {
65
+ "@vocab": "http://schema.org/",
66
+ "sc": "http://schema.org/",
67
+ "cr": "http://mlcommons.org/croissant/",
68
+ "rai": "http://mlcommons.org/croissant/RAI/",
69
+ "spdx": "http://spdx.org/rdf/terms#",
70
+ },
71
+ "@type": "sc:Dataset",
72
+ "name": name,
73
+ "conformsTo": self.CROISSANT_CONTEXT,
74
+ "dateCreated": datetime.fromtimestamp(self.graph.created_at).isoformat(),
75
+ "dateModified": datetime.now().isoformat(),
76
+ }
77
+
78
+ if description:
79
+ doc["description"] = description
80
+ if license_url:
81
+ doc["license"] = license_url
82
+ if citation:
83
+ doc["citation"] = citation
84
+ if url:
85
+ doc["url"] = url
86
+
87
+ # Add distributions (file objects)
88
+ doc["distribution"] = self._build_distributions()
89
+
90
+ # Add record sets
91
+ doc["recordSet"] = self._build_record_sets()
92
+
93
+ # Add provenance extension
94
+ if include_provenance:
95
+ doc["cr:provenance"] = self._build_provenance_extension()
96
+
97
+ return doc
98
+
99
+ def _build_distributions(self) -> List[Dict[str, Any]]:
100
+ """Build distribution (FileObject) entries."""
101
+ distributions = []
102
+
103
+ for entity in self.graph.list_entities():
104
+ dist = {
105
+ "@type": "cr:FileObject",
106
+ "@id": entity.id,
107
+ "name": entity.name,
108
+ }
109
+
110
+ if entity.source_uri:
111
+ dist["contentUrl"] = entity.source_uri
112
+
113
+ if entity.content_hash:
114
+ dist["sha256"] = entity.content_hash
115
+
116
+ # License information (SPDX)
117
+ if entity.license_id:
118
+ dist["spdx:license"] = entity.license_id
119
+ if entity.license_url:
120
+ dist["sc:license"] = entity.license_url
121
+ else:
122
+ # Auto-generate SPDX license URL
123
+ dist["sc:license"] = f"https://spdx.org/licenses/{entity.license_id}.html"
124
+
125
+ # Infer encoding format from source type
126
+ format_map = {
127
+ "hf_dataset": "application/x-arrow",
128
+ "hf_hub": "application/x-arrow",
129
+ "parquet": "application/x-parquet",
130
+ "csv": "text/csv",
131
+ "json": "application/json",
132
+ "jsonl": "application/x-jsonlines",
133
+ }
134
+ if entity.source_type in format_map:
135
+ dist["encodingFormat"] = format_map[entity.source_type]
136
+
137
+ if entity.size_bytes:
138
+ dist["contentSize"] = f"{entity.size_bytes} bytes"
139
+
140
+ distributions.append(dist)
141
+
142
+ return distributions
143
+
144
+ def _build_record_sets(self) -> List[Dict[str, Any]]:
145
+ """Build RecordSet entries from entity schemas."""
146
+ record_sets = []
147
+
148
+ for entity in self.graph.list_entities():
149
+ schema = entity.attributes.get("schema")
150
+ if not schema:
151
+ continue
152
+
153
+ fields = []
154
+ for field_name, field_info in schema.get("fields", {}).items():
155
+ field_entry = {
156
+ "@type": "cr:Field",
157
+ "name": field_name,
158
+ "dataType": self._map_dtype_to_croissant(field_info.get("dtype", "string")),
159
+ }
160
+
161
+ if field_info.get("description"):
162
+ field_entry["description"] = field_info["description"]
163
+
164
+ # Source reference
165
+ field_entry["source"] = {
166
+ "fileObject": {"@id": entity.id},
167
+ "extract": {"column": field_name},
168
+ }
169
+
170
+ fields.append(field_entry)
171
+
172
+ if fields:
173
+ record_set = {
174
+ "@type": "cr:RecordSet",
175
+ "@id": f"recordset_{entity.id}",
176
+ "name": f"{entity.name}_records",
177
+ "field": fields,
178
+ }
179
+
180
+ if entity.record_count:
181
+ record_set["cr:recordCount"] = entity.record_count
182
+
183
+ record_sets.append(record_set)
184
+
185
+ return record_sets
186
+
187
+ def _map_dtype_to_croissant(self, dtype: str) -> str:
188
+ """Map internal dtype to Croissant/schema.org type."""
189
+ type_map = {
190
+ "string": "sc:Text",
191
+ "int8": "sc:Integer",
192
+ "int16": "sc:Integer",
193
+ "int32": "sc:Integer",
194
+ "int64": "sc:Integer",
195
+ "uint8": "sc:Integer",
196
+ "uint16": "sc:Integer",
197
+ "uint32": "sc:Integer",
198
+ "uint64": "sc:Integer",
199
+ "float16": "sc:Float",
200
+ "float32": "sc:Float",
201
+ "float64": "sc:Float",
202
+ "bool": "sc:Boolean",
203
+ "binary": "sc:Text", # Base64 encoded
204
+ "image": "sc:ImageObject",
205
+ "audio": "sc:AudioObject",
206
+ "categorical": "sc:Text", # With enumeration
207
+ "list": "sc:ItemList",
208
+ "struct": "sc:StructuredValue",
209
+ }
210
+ return type_map.get(dtype, "sc:Text")
211
+
212
+ def _build_provenance_extension(self) -> Dict[str, Any]:
213
+ """Build CASCADE provenance extension."""
214
+ return {
215
+ "@type": "cascade:ProvenanceGraph",
216
+ "cascade:rootHash": self.graph.root_hash,
217
+ "cascade:createdAt": datetime.fromtimestamp(self.graph.created_at).isoformat(),
218
+
219
+ # Entities with lineage
220
+ "cascade:entities": [
221
+ {
222
+ "@id": e.id,
223
+ "cascade:name": e.name,
224
+ "cascade:contentHash": e.content_hash,
225
+ "cascade:schemaHash": e.schema_hash,
226
+ "cascade:version": e.version,
227
+ "cascade:recordCount": e.record_count,
228
+ "cascade:derivedFrom": self.graph.get_lineage(e.id, "upstream"),
229
+ }
230
+ for e in self.graph.list_entities()
231
+ ],
232
+
233
+ # Activities
234
+ "cascade:activities": [
235
+ {
236
+ "@id": a.id,
237
+ "cascade:type": a.activity_type.value,
238
+ "cascade:name": a.name,
239
+ "cascade:startedAt": datetime.fromtimestamp(a.started_at).isoformat() if a.started_at else None,
240
+ "cascade:endedAt": datetime.fromtimestamp(a.ended_at).isoformat() if a.ended_at else None,
241
+ "cascade:inputs": a.inputs,
242
+ "cascade:outputs": a.outputs,
243
+ "cascade:parameters": a.parameters,
244
+ }
245
+ for a in self.graph.list_activities()
246
+ ],
247
+
248
+ # Agents
249
+ "cascade:agents": [
250
+ {
251
+ "@id": a.id,
252
+ "cascade:type": a.agent_type.value,
253
+ "cascade:name": a.name,
254
+ "cascade:version": a.version,
255
+ }
256
+ for a in self.graph.list_agents()
257
+ ],
258
+ }
259
+
260
+ def to_json(self, **kwargs) -> str:
261
+ """Export to JSON string."""
262
+ return json.dumps(self.export(**kwargs), indent=2, default=str)
263
+
264
+ def save(self, path: str, **kwargs):
265
+ """Save to file."""
266
+ with open(path, "w", encoding="utf-8") as f:
267
+ f.write(self.to_json(**kwargs))
268
+
269
+
270
+ def export_to_croissant(
271
+ graph: ProvenanceGraph,
272
+ name: str = None,
273
+ description: str = None,
274
+ **kwargs,
275
+ ) -> Dict[str, Any]:
276
+ """
277
+ Convenience function to export provenance to Croissant.
278
+
279
+ Args:
280
+ graph: The provenance graph to export
281
+ name: Dataset name
282
+ description: Dataset description
283
+ **kwargs: Additional export options
284
+
285
+ Returns:
286
+ Croissant JSON-LD document
287
+ """
288
+ exporter = CroissantExporter(graph)
289
+ return exporter.export(name=name, description=description, **kwargs)
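
A sketch of exporting an observed graph, assuming a populated ProvenanceGraph named `graph` (for example one built by DatasetObserver); the name, description, and license URL are illustrative.

doc = export_to_croissant(
    graph,
    name="my-dataset",
    description="Cleaned and deduplicated text corpus.",
    license_url="https://spdx.org/licenses/CC-BY-4.0.html",
    include_provenance=True,
)
print(doc["@type"], doc["conformsTo"])

# Or write straight to disk:
CroissantExporter(graph).save("croissant.json", name="my-dataset")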
cascade/data/entities.py ADDED
@@ -0,0 +1,349 @@
1
+ """
2
+ PROV-O Entities for Dataset Observation
3
+
4
+ W3C PROV Data Model:
5
+ - Entity: A physical, digital, or conceptual thing (the dataset)
6
+ - Activity: Something that occurs over time and acts upon entities
7
+ - Agent: Something that bears responsibility for an activity
8
+
9
+ Relationships:
10
+ - wasGeneratedBy: Entity → Activity
11
+ - wasDerivedFrom: Entity → Entity
12
+ - wasAttributedTo: Entity → Agent
13
+ - used: Activity → Entity
14
+ - wasAssociatedWith: Activity → Agent
15
+ """
16
+
17
+ import hashlib
18
+ import json
19
+ import time
20
+ from dataclasses import dataclass, field
21
+ from datetime import datetime
22
+ from enum import Enum
23
+ from typing import Any, Dict, List, Optional, Union
24
+
25
+
26
+ class RelationType(Enum):
27
+ """W3C PROV-O relationship types."""
28
+ # Entity relationships
29
+ WAS_GENERATED_BY = "wasGeneratedBy" # Entity → Activity
30
+ WAS_DERIVED_FROM = "wasDerivedFrom" # Entity → Entity
31
+ WAS_ATTRIBUTED_TO = "wasAttributedTo" # Entity → Agent
32
+ WAS_REVISION_OF = "wasRevisionOf" # Entity → Entity (versioning)
33
+ HAD_PRIMARY_SOURCE = "hadPrimarySource" # Entity → Entity
34
+
35
+ # Activity relationships
36
+ USED = "used" # Activity → Entity
37
+ WAS_ASSOCIATED_WITH = "wasAssociatedWith" # Activity → Agent
38
+ WAS_INFORMED_BY = "wasInformedBy" # Activity → Activity
39
+ WAS_STARTED_BY = "wasStartedBy" # Activity → Entity
40
+ WAS_ENDED_BY = "wasEndedBy" # Activity → Entity
41
+
42
+ # Agent relationships
43
+ ACTED_ON_BEHALF_OF = "actedOnBehalfOf" # Agent → Agent
44
+
45
+
46
+ @dataclass
47
+ class Relationship:
48
+ """A provenance relationship between two nodes."""
49
+ relation_type: RelationType
50
+ source_id: str
51
+ target_id: str
52
+ timestamp: float = field(default_factory=time.time)
53
+ attributes: Dict[str, Any] = field(default_factory=dict)
54
+
55
+ def to_dict(self) -> Dict[str, Any]:
56
+ return {
57
+ "type": self.relation_type.value,
58
+ "source": self.source_id,
59
+ "target": self.target_id,
60
+ "timestamp": self.timestamp,
61
+ "attributes": self.attributes,
62
+ }
63
+
64
+ def to_prov_n(self) -> str:
65
+ """Export as PROV-N notation."""
66
+ return f"{self.relation_type.value}({self.source_id}, {self.target_id})"
67
+
68
+
69
+ @dataclass
70
+ class DatasetEntity:
71
+ """
72
+ A dataset entity in the provenance graph.
73
+
74
+ Corresponds to prov:Entity - any physical, digital, or conceptual thing.
75
+ In our case: a dataset, a version of a dataset, or a split.
76
+ """
77
+ id: str
78
+ name: str
79
+
80
+ # Content identification
81
+ content_hash: Optional[str] = None # SHA-256 of data content
82
+ schema_hash: Optional[str] = None # SHA-256 of schema/features
83
+
84
+ # Versioning
85
+ version: Optional[str] = None
86
+ previous_version: Optional[str] = None
87
+
88
+ # Source
89
+ source_type: str = "unknown" # hf_hub, local, s3, gcs, etc.
90
+ source_uri: Optional[str] = None
91
+
92
+ # License (SPDX identifier)
93
+ license_id: Optional[str] = None # e.g., "MIT", "CC-BY-4.0", "Apache-2.0"
94
+ license_url: Optional[str] = None # URL to license text
95
+
96
+ # Statistics
97
+ record_count: Optional[int] = None
98
+ size_bytes: Optional[int] = None
99
+ splits: Dict[str, int] = field(default_factory=dict) # split_name → count
100
+
101
+ # Metadata
102
+ attributes: Dict[str, Any] = field(default_factory=dict)
103
+
104
+ # Timestamps
105
+ created_at: float = field(default_factory=time.time)
106
+
107
+ def __post_init__(self):
108
+ """Generate ID if not provided."""
109
+ if not self.id:
110
+ self.id = f"entity:{self.name}:{int(self.created_at * 1000)}"
111
+
112
+ def compute_hash(self) -> str:
113
+ """Compute entity hash from content."""
114
+ content = json.dumps({
115
+ "id": self.id,
116
+ "name": self.name,
117
+ "content_hash": self.content_hash,
118
+ "schema_hash": self.schema_hash,
119
+ "version": self.version,
120
+ "record_count": self.record_count,
121
+ }, sort_keys=True)
122
+ return hashlib.sha256(content.encode()).hexdigest()
123
+
124
+ def to_dict(self) -> Dict[str, Any]:
125
+ return {
126
+ "@type": "prov:Entity",
127
+ "@id": self.id,
128
+ "name": self.name,
129
+ "content_hash": self.content_hash,
130
+ "schema_hash": self.schema_hash,
131
+ "version": self.version,
132
+ "previous_version": self.previous_version,
133
+ "source_type": self.source_type,
134
+ "source_uri": self.source_uri,
135
+ "license_id": self.license_id,
136
+ "license_url": self.license_url,
137
+ "record_count": self.record_count,
138
+ "size_bytes": self.size_bytes,
139
+ "splits": self.splits,
140
+ "attributes": self.attributes,
141
+ "created_at": self.created_at,
142
+ }
143
+
144
+ def to_prov_n(self) -> str:
145
+ """Export as PROV-N notation."""
146
+ attrs = ", ".join([
147
+ f'prov:label="{self.name}"',
148
+ f'cascade:contentHash="{self.content_hash or "unknown"}"',
149
+ f'cascade:recordCount="{self.record_count or 0}"',
150
+ f'cascade:license="{self.license_id or "unknown"}"',
151
+ ])
152
+ return f"entity({self.id}, [{attrs}])"
153
+
154
+
155
+ class ActivityType(Enum):
156
+ """Types of dataset activities."""
157
+ INGEST = "ingest" # Load from source
158
+ TRANSFORM = "transform" # Filter, map, join, etc.
159
+ SPLIT = "split" # Train/test/val split
160
+ AUGMENT = "augment" # Data augmentation
161
+ CLEAN = "clean" # Cleaning/preprocessing
162
+ MERGE = "merge" # Combining datasets
163
+ SAMPLE = "sample" # Sampling/subsetting
164
+ EXPORT = "export" # Export to format
165
+ TRAIN = "train" # Model training (consumption)
166
+ EVALUATE = "evaluate" # Model evaluation
167
+ INFERENCE = "inference" # Model inference
168
+ ENTITY_RESOLUTION = "entity_resolution" # Data Unity matching
169
+
170
+
171
+ @dataclass
172
+ class Activity:
173
+ """
174
+ An activity in the provenance graph.
175
+
176
+ Corresponds to prov:Activity - something that occurs over time
177
+ and acts upon or with entities.
178
+ """
179
+ id: str
180
+ activity_type: ActivityType
181
+ name: str
182
+
183
+ # Timing
184
+ started_at: Optional[float] = None
185
+ ended_at: Optional[float] = None
186
+
187
+ # Input/Output tracking
188
+ inputs: List[str] = field(default_factory=list) # Entity IDs
189
+ outputs: List[str] = field(default_factory=list) # Entity IDs
190
+
191
+ # Agent who performed this
192
+ agent_id: Optional[str] = None
193
+
194
+ # Parameters/configuration used
195
+ parameters: Dict[str, Any] = field(default_factory=dict)
196
+
197
+ # Metadata
198
+ attributes: Dict[str, Any] = field(default_factory=dict)
199
+
200
+ def __post_init__(self):
201
+ if not self.id:
202
+ self.id = f"activity:{self.activity_type.value}:{int(time.time() * 1000)}"
203
+ if self.started_at is None:
204
+ self.started_at = time.time()
205
+
206
+ def start(self):
207
+ """Mark activity as started."""
208
+ self.started_at = time.time()
209
+
210
+ def end(self):
211
+ """Mark activity as ended."""
212
+ self.ended_at = time.time()
213
+
214
+ @property
215
+ def duration(self) -> Optional[float]:
216
+ """Duration in seconds."""
217
+ if self.started_at and self.ended_at:
218
+ return self.ended_at - self.started_at
219
+ return None
220
+
221
+ def add_input(self, entity_id: str):
222
+ """Record an input entity."""
223
+ if entity_id not in self.inputs:
224
+ self.inputs.append(entity_id)
225
+
226
+ def add_output(self, entity_id: str):
227
+ """Record an output entity."""
228
+ if entity_id not in self.outputs:
229
+ self.outputs.append(entity_id)
230
+
231
+ def to_dict(self) -> Dict[str, Any]:
232
+ return {
233
+ "@type": "prov:Activity",
234
+ "@id": self.id,
235
+ "activity_type": self.activity_type.value,
236
+ "name": self.name,
237
+ "started_at": self.started_at,
238
+ "ended_at": self.ended_at,
239
+ "duration": self.duration,
240
+ "inputs": self.inputs,
241
+ "outputs": self.outputs,
242
+ "agent_id": self.agent_id,
243
+ "parameters": self.parameters,
244
+ "attributes": self.attributes,
245
+ }
246
+
247
+ def to_prov_n(self) -> str:
248
+ """Export as PROV-N notation."""
249
+ start = datetime.fromtimestamp(self.started_at).isoformat() if self.started_at else "-"
250
+ end = datetime.fromtimestamp(self.ended_at).isoformat() if self.ended_at else "-"
251
+ attrs = f'prov:label="{self.name}", cascade:type="{self.activity_type.value}"'
252
+ return f"activity({self.id}, {start}, {end}, [{attrs}])"
253
+
254
+
255
+ class AgentType(Enum):
256
+ """Types of agents."""
257
+ PERSON = "person"
258
+ ORGANIZATION = "organization"
259
+ SOFTWARE = "software"
260
+ MODEL = "model"
261
+ PIPELINE = "pipeline"
262
+ SYSTEM = "system"
263
+
264
+
265
+ @dataclass
266
+ class Agent:
267
+ """
268
+ An agent in the provenance graph.
269
+
270
+ Corresponds to prov:Agent - something that bears responsibility
271
+ for an activity taking place.
272
+ """
273
+ id: str
274
+ agent_type: AgentType
275
+ name: str
276
+
277
+ # For software/model agents
278
+ version: Optional[str] = None
279
+
280
+ # For organizational hierarchy
281
+ parent_agent_id: Optional[str] = None
282
+
283
+ # Contact/identification
284
+ identifier: Optional[str] = None # HF username, email, etc.
285
+
286
+ # Metadata
287
+ attributes: Dict[str, Any] = field(default_factory=dict)
288
+
289
+ # Timestamp
290
+ created_at: float = field(default_factory=time.time)
291
+
292
+ def __post_init__(self):
293
+ if not self.id:
294
+ self.id = f"agent:{self.agent_type.value}:{self.name}".replace(" ", "_").lower()
295
+
296
+ def to_dict(self) -> Dict[str, Any]:
297
+ return {
298
+ "@type": "prov:Agent",
299
+ "@id": self.id,
300
+ "agent_type": self.agent_type.value,
301
+ "name": self.name,
302
+ "version": self.version,
303
+ "parent_agent_id": self.parent_agent_id,
304
+ "identifier": self.identifier,
305
+ "attributes": self.attributes,
306
+ "created_at": self.created_at,
307
+ }
308
+
309
+ def to_prov_n(self) -> str:
310
+ """Export as PROV-N notation."""
311
+ attrs = f'prov:label="{self.name}", cascade:type="{self.agent_type.value}"'
312
+ if self.version:
313
+ attrs += f', cascade:version="{self.version}"'
314
+ return f"agent({self.id}, [{attrs}])"
315
+
316
+
317
+ # Convenience factory functions
318
+ def create_system_agent(name: str = "cascade", version: str = "1.0.0") -> Agent:
319
+ """Create a system agent for automated operations."""
320
+ return Agent(
321
+ id=f"agent:system:{name}",
322
+ agent_type=AgentType.SYSTEM,
323
+ name=name,
324
+ version=version,
325
+ )
326
+
327
+
328
+ def create_model_agent(model_id: str, version: Optional[str] = None) -> Agent:
329
+ """Create an agent representing an ML model."""
330
+ return Agent(
331
+ id=f"agent:model:{model_id.replace('/', '_')}",
332
+ agent_type=AgentType.MODEL,
333
+ name=model_id,
334
+ version=version,
335
+ identifier=model_id,
336
+ )
337
+
338
+
339
+ def create_user_agent(username: str, org: Optional[str] = None) -> Agent:
340
+ """Create an agent representing a user."""
341
+ agent = Agent(
342
+ id=f"agent:person:{username}",
343
+ agent_type=AgentType.PERSON,
344
+ name=username,
345
+ identifier=username,
346
+ )
347
+ if org:
348
+ agent.parent_agent_id = f"agent:organization:{org}"
349
+ return agent
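
A minimal usage sketch of the factory functions above; the import path cascade.data.entities is an assumption based on the file layout shown in this diff, not something stated in the code itself.

# Sketch only: module path assumed from the diff layout above.
from cascade.data.entities import create_user_agent, create_model_agent, create_system_agent

user = create_user_agent("alice", org="example-org")
model = create_model_agent("org/some-model", version="1.0")
system = create_system_agent()                      # defaults: name="cascade", version="1.0.0"

print(user.id)                  # agent:person:alice
print(user.parent_agent_id)     # agent:organization:example-org
print(model.to_prov_n())        # agent(agent:model:org_some-model, [prov:label="org/some-model", ...])
print(system.to_dict()["@type"])  # prov:Agent
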
cascade/data/hub.py ADDED
@@ -0,0 +1,533 @@
1
+ """
2
+ HuggingFace Hub Integration
3
+
4
+ Push and pull dataset provenance to/from HuggingFace Hub.
5
+
6
+ Exports complete W3C PROV-O accountability bundle:
7
+ - cascade_provenance.json (CASCADE native format)
8
+ - prov_o.jsonld (W3C PROV-O JSON-LD - interoperable)
9
+ - prov_n.txt (W3C PROV-N notation - human readable)
10
+ - activities.jsonl (Activity log for audit)
11
+ - agents.json (Agent attributions)
12
+ - integrity_manifest.json (Hash verification manifest)
+ - croissant.json (MLCommons Croissant)
13
+ """
14
+
15
+ import json
16
+ import time
17
+ from datetime import datetime, timezone
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ from .provenance import ProvenanceGraph
21
+ from .croissant import CroissantExporter
22
+
23
+
24
+ class AccountabilityBundle:
25
+ """
26
+ Complete W3C PROV-O accountability package.
27
+
28
+ When a dataset is extracted, this bundle provides a full audit trail:
29
+ - Who created/modified it (agents)
30
+ - What transformations occurred (activities)
31
+ - Where it came from (entity lineage)
32
+ - When everything happened (timestamps)
33
+ - How to verify integrity (hashes)
34
+ """
35
+
36
+ def __init__(self, graph: ProvenanceGraph):
37
+ self.graph = graph
38
+ self.created_at = datetime.now(timezone.utc).isoformat()
39
+
40
+ def to_prov_o_jsonld(self) -> Dict[str, Any]:
41
+ """Export W3C PROV-O JSON-LD (interoperable standard)."""
42
+ return self.graph.to_prov_jsonld()
43
+
44
+ def to_prov_n(self) -> str:
45
+ """Export W3C PROV-N notation (human readable)."""
46
+ return self.graph.to_prov_n()
47
+
48
+ def to_activity_log(self) -> List[Dict[str, Any]]:
49
+ """Export activity log for audit (JSONL format)."""
50
+ activities = []
51
+ for activity in self.graph.list_activities():
52
+ activities.append({
53
+ "id": activity.id,
54
+ "name": activity.name,
55
+ "type": activity.activity_type.value,
56
+ "started_at": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
57
+ "ended_at": datetime.fromtimestamp(activity.ended_at).isoformat() if activity.ended_at else None,
58
+ "duration_seconds": activity.duration,
59
+ "inputs": activity.inputs,
60
+ "outputs": activity.outputs,
61
+ "parameters": activity.parameters,
62
+ "attributes": activity.attributes,
63
+ })
64
+ return activities
65
+
66
+ def to_agent_attributions(self) -> Dict[str, Any]:
67
+ """Export agent attributions for accountability."""
68
+ agents = {}
69
+ for agent in self.graph.list_agents():
70
+ agents[agent.id] = {
71
+ "name": agent.name,
72
+ "type": agent.agent_type.value,
73
+ "version": agent.version,
74
+ "identifier": agent.identifier,
75
+ "attributes": agent.attributes,
76
+ }
77
+
78
+ # Build attribution matrix: which agent did what
79
+ attributions = []
80
+ for rel in self.graph.list_relationships():
81
+ if rel.relation_type.value == "wasAssociatedWith":
82
+ activity = self.graph.get_activity(rel.source_id)
83
+ agent = self.graph.get_agent(rel.target_id)
84
+ if activity and agent:
85
+ attributions.append({
86
+ "activity_id": activity.id,
87
+ "activity_name": activity.name,
88
+ "agent_id": agent.id,
89
+ "agent_name": agent.name,
90
+ "timestamp": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
91
+ })
92
+
93
+ return {
94
+ "agents": agents,
95
+ "attributions": attributions,
96
+ "total_agents": len(agents),
97
+ "total_attributions": len(attributions),
98
+ }
99
+
100
+ def to_integrity_manifest(self) -> Dict[str, Any]:
101
+ """Export integrity manifest for verification."""
102
+ is_valid, invalid_ids = self.graph.verify_integrity()
103
+
104
+ return {
105
+ "root_hash": self.graph.root_hash,
106
+ "created_at": self.created_at,
107
+ "is_valid": is_valid,
108
+ "invalid_entity_ids": invalid_ids,
109
+ "entity_hashes": {
110
+ entity.id: {
111
+ "content_hash": entity.content_hash,
112
+ "schema_hash": entity.schema_hash,
113
+ }
114
+ for entity in self.graph.list_entities()
115
+ },
116
+ "verification_note": (
117
+ "To verify: recompute content hashes and compare against this manifest. "
118
+ "Any mismatch indicates data tampering."
119
+ ),
120
+ }
121
+
122
+ def export(self, output_dir: str):
123
+ """Export all accountability artifacts to a directory."""
124
+ import os
125
+ os.makedirs(output_dir, exist_ok=True)
126
+
127
+ # 1. CASCADE provenance JSON
128
+ with open(os.path.join(output_dir, "cascade_provenance.json"), "w") as f:
129
+ json.dump(self.graph.to_dict(), f, indent=2, default=str)
130
+
131
+ # 2. W3C PROV-O JSON-LD
132
+ with open(os.path.join(output_dir, "prov_o.jsonld"), "w") as f:
133
+ json.dump(self.to_prov_o_jsonld(), f, indent=2, default=str)
134
+
135
+ # 3. W3C PROV-N notation
136
+ with open(os.path.join(output_dir, "prov_n.txt"), "w") as f:
137
+ f.write(self.to_prov_n())
138
+
139
+ # 4. Activity log
140
+ with open(os.path.join(output_dir, "activities.jsonl"), "w") as f:
141
+ for activity in self.to_activity_log():
142
+ f.write(json.dumps(activity, default=str) + "\n")
143
+
144
+ # 5. Agent attributions
145
+ with open(os.path.join(output_dir, "agents.json"), "w") as f:
146
+ json.dump(self.to_agent_attributions(), f, indent=2, default=str)
147
+
148
+ # 6. Integrity manifest
149
+ with open(os.path.join(output_dir, "integrity_manifest.json"), "w") as f:
150
+ json.dump(self.to_integrity_manifest(), f, indent=2, default=str)
151
+
152
+ # 7. Croissant metadata
153
+ exporter = CroissantExporter(self.graph)
154
+ croissant_content = exporter.to_json(name="dataset", url="local://")
155
+ with open(os.path.join(output_dir, "croissant.json"), "w") as f:
156
+ f.write(croissant_content)
157
+
158
+ def summary(self) -> Dict[str, Any]:
159
+ """Summary of the accountability bundle."""
160
+ stats = self.graph.stats
161
+ return {
162
+ "bundle_created_at": self.created_at,
163
+ "graph_name": self.graph.name,
164
+ "root_hash": self.graph.root_hash,
165
+ "entities": stats["entities"],
166
+ "activities": stats["activities"],
167
+ "agents": stats["agents"],
168
+ "relationships": stats["relationships"],
169
+ "files_included": [
170
+ "cascade_provenance.json",
171
+ "prov_o.jsonld",
172
+ "prov_n.txt",
173
+ "activities.jsonl",
174
+ "agents.json",
175
+ "integrity_manifest.json",
176
+ "croissant.json",
177
+ ],
178
+ }
179
+
180
+
181
+ class HubIntegration:
182
+ """
183
+ Integration with HuggingFace Hub for dataset provenance.
184
+
185
+ Stores complete accountability bundle:
186
+ 1. cascade_provenance.json - CASCADE native format
187
+ 2. prov_o.jsonld - W3C PROV-O JSON-LD (interoperable)
188
+ 3. prov_n.txt - W3C PROV-N notation (human readable)
189
+ 4. activities.jsonl - Activity log for audit
190
+ 5. agents.json - Agent attributions
191
+ 6. integrity_manifest.json - Hash verification
192
+ 7. croissant.json - MLCommons Croissant
193
+ 8. README.md - Human-readable provenance section
194
+ """
195
+
196
+ PROVENANCE_FILENAME = "cascade_provenance.json"
197
+ PROV_O_FILENAME = "prov_o.jsonld"
198
+ PROV_N_FILENAME = "prov_n.txt"
199
+ ACTIVITIES_FILENAME = "activities.jsonl"
200
+ AGENTS_FILENAME = "agents.json"
201
+ INTEGRITY_FILENAME = "integrity_manifest.json"
202
+ CROISSANT_FILENAME = "croissant.json"
203
+
204
+ def __init__(self, token: Optional[str] = None):
205
+ """
206
+ Initialize Hub integration.
207
+
208
+ Args:
209
+ token: HuggingFace API token (optional, uses cached token if not provided)
210
+ """
211
+ self.token = token
212
+
213
+ def push_provenance(
214
+ self,
215
+ graph: ProvenanceGraph,
216
+ repo_id: str,
217
+ commit_message: str = "Update provenance",
218
+ private: bool = False,
219
+ include_croissant: bool = True,
220
+ full_accountability: bool = True,
221
+ ) -> str:
222
+ """
223
+ Push complete accountability bundle to HuggingFace Hub.
224
+
225
+ Args:
226
+ graph: The provenance graph to push
227
+ repo_id: HuggingFace repo ID (e.g., "username/dataset-name")
228
+ commit_message: Commit message
229
+ private: Whether the repo should be private
230
+ include_croissant: Whether to include Croissant JSON-LD
231
+ full_accountability: Whether to include full W3C PROV-O bundle
232
+
233
+ Returns:
234
+ URL of the pushed provenance
235
+ """
236
+ from huggingface_hub import HfApi, CommitOperationAdd
237
+
238
+ api = HfApi(token=self.token)
239
+
240
+ # Ensure repo exists
241
+ api.create_repo(
242
+ repo_id=repo_id,
243
+ repo_type="dataset",
244
+ private=private,
245
+ exist_ok=True,
246
+ )
247
+
248
+ operations = []
249
+ bundle = AccountabilityBundle(graph)
250
+
251
+ # 1. CASCADE provenance JSON (native format)
252
+ provenance_content = json.dumps(graph.to_dict(), indent=2, default=str)
253
+ operations.append(CommitOperationAdd(
254
+ path_in_repo=self.PROVENANCE_FILENAME,
255
+ path_or_fileobj=provenance_content.encode("utf-8"),
256
+ ))
257
+
258
+ if full_accountability:
259
+ # 2. W3C PROV-O JSON-LD (interoperable standard)
260
+ prov_o_content = json.dumps(bundle.to_prov_o_jsonld(), indent=2, default=str)
261
+ operations.append(CommitOperationAdd(
262
+ path_in_repo=self.PROV_O_FILENAME,
263
+ path_or_fileobj=prov_o_content.encode("utf-8"),
264
+ ))
265
+
266
+ # 3. W3C PROV-N notation (human readable)
267
+ prov_n_content = bundle.to_prov_n()
268
+ operations.append(CommitOperationAdd(
269
+ path_in_repo=self.PROV_N_FILENAME,
270
+ path_or_fileobj=prov_n_content.encode("utf-8"),
271
+ ))
272
+
273
+ # 4. Activity log (JSONL for easy grep/audit)
274
+ activities = bundle.to_activity_log()
275
+ activities_content = "\n".join(json.dumps(a, default=str) for a in activities)
276
+ operations.append(CommitOperationAdd(
277
+ path_in_repo=self.ACTIVITIES_FILENAME,
278
+ path_or_fileobj=activities_content.encode("utf-8"),
279
+ ))
280
+
281
+ # 5. Agent attributions
282
+ agents_content = json.dumps(bundle.to_agent_attributions(), indent=2, default=str)
283
+ operations.append(CommitOperationAdd(
284
+ path_in_repo=self.AGENTS_FILENAME,
285
+ path_or_fileobj=agents_content.encode("utf-8"),
286
+ ))
287
+
288
+ # 6. Integrity manifest (for verification)
289
+ integrity_content = json.dumps(bundle.to_integrity_manifest(), indent=2, default=str)
290
+ operations.append(CommitOperationAdd(
291
+ path_in_repo=self.INTEGRITY_FILENAME,
292
+ path_or_fileobj=integrity_content.encode("utf-8"),
293
+ ))
294
+
295
+ # 7. Croissant JSON-LD (MLCommons standard)
296
+ if include_croissant:
297
+ exporter = CroissantExporter(graph)
298
+ croissant_content = exporter.to_json(
299
+ name=repo_id.split("/")[-1],
300
+ url=f"https://huggingface.co/datasets/{repo_id}",
301
+ )
302
+ operations.append(CommitOperationAdd(
303
+ path_in_repo=self.CROISSANT_FILENAME,
304
+ path_or_fileobj=croissant_content.encode("utf-8"),
305
+ ))
306
+
307
+ # Commit all accountability artifacts
308
+ api.create_commit(
309
+ repo_id=repo_id,
310
+ repo_type="dataset",
311
+ operations=operations,
312
+ commit_message=commit_message,
313
+ )
314
+
315
+ return f"https://huggingface.co/datasets/{repo_id}"
316
+
317
+ def pull_provenance(self, repo_id: str) -> Optional[ProvenanceGraph]:
318
+ """
319
+ Pull provenance from HuggingFace Hub.
320
+
321
+ Args:
322
+ repo_id: HuggingFace repo ID
323
+
324
+ Returns:
325
+ ProvenanceGraph if found, None otherwise
326
+ """
327
+ from huggingface_hub import hf_hub_download
328
+
329
+ try:
330
+ # Download provenance file
331
+ local_path = hf_hub_download(
332
+ repo_id=repo_id,
333
+ filename=self.PROVENANCE_FILENAME,
334
+ repo_type="dataset",
335
+ token=self.token,
336
+ )
337
+
338
+ with open(local_path, "r", encoding="utf-8") as f:
339
+ data = json.load(f)
340
+
341
+ return ProvenanceGraph.from_dict(data)
342
+
343
+ except Exception as e:
344
+ print(f"Could not pull provenance from {repo_id}: {e}")
345
+ return None
346
+
347
+ def get_dataset_provenance_url(self, repo_id: str) -> str:
348
+ """Get URL to provenance file in Hub."""
349
+ return f"https://huggingface.co/datasets/{repo_id}/blob/main/{self.PROVENANCE_FILENAME}"
350
+
351
+ def update_dataset_card(
352
+ self,
353
+ repo_id: str,
354
+ graph: ProvenanceGraph,
355
+ ) -> str:
356
+ """
357
+ Update dataset card with provenance summary.
358
+
359
+ Adds/updates YAML front-matter with:
360
+ - Lineage information
361
+ - Root hash
362
+ - Entity/activity counts
363
+
364
+ Args:
365
+ repo_id: HuggingFace repo ID
366
+ graph: Provenance graph
367
+
368
+ Returns:
369
+ URL of the updated dataset
370
+ """
371
+ from huggingface_hub import HfApi, hf_hub_download
372
+
373
+ api = HfApi(token=self.token)
374
+
375
+ # Build provenance section for README
376
+ provenance_section = self._build_readme_section(graph)
377
+
378
+ # Get current README
379
+ try:
380
+ readme_path = hf_hub_download(
381
+ repo_id=repo_id,
382
+ filename="README.md",
383
+ repo_type="dataset",
384
+ token=self.token,
385
+ )
386
+ with open(readme_path, "r", encoding="utf-8") as f:
387
+ current_readme = f.read()
388
+ except Exception:
389
+ current_readme = f"# {repo_id.split('/')[-1]}\n\n"
390
+
391
+ # Update or append provenance section
392
+ marker_start = "<!-- CASCADE_PROVENANCE_START -->"
393
+ marker_end = "<!-- CASCADE_PROVENANCE_END -->"
394
+
395
+ if marker_start in current_readme:
396
+ # Replace existing section
397
+ import re
398
+ pattern = re.escape(marker_start) + r".*?" + re.escape(marker_end)
399
+ new_readme = re.sub(
400
+ pattern,
401
+ f"{marker_start}\n{provenance_section}\n{marker_end}",
402
+ current_readme,
403
+ flags=re.DOTALL,
404
+ )
405
+ else:
406
+ # Append section
407
+ new_readme = current_readme.rstrip() + f"\n\n{marker_start}\n{provenance_section}\n{marker_end}\n"
408
+
409
+ # Push updated README
410
+ api.upload_file(
411
+ path_or_fileobj=new_readme.encode("utf-8"),
412
+ path_in_repo="README.md",
413
+ repo_id=repo_id,
414
+ repo_type="dataset",
415
+ commit_message="Update provenance in README",
416
+ )
417
+
418
+ return f"https://huggingface.co/datasets/{repo_id}"
419
+
420
+ def _build_readme_section(self, graph: ProvenanceGraph) -> str:
421
+ """Build provenance section for README."""
422
+ stats = graph.stats
423
+ bundle = AccountabilityBundle(graph)
424
+
425
+ lines = [
426
+ "## 🔗 Provenance & Accountability",
427
+ "",
428
+ "This dataset has CASCADE provenance tracking enabled with full W3C PROV-O compliance.",
429
+ "",
430
+ "### Integrity",
431
+ "",
432
+ f"| Metric | Value |",
433
+ f"|--------|-------|",
434
+ f"| Root Hash | `{graph.root_hash[:16]}...` |",
435
+ f"| Entities | {stats['entities']} |",
436
+ f"| Activities | {stats['activities']} |",
437
+ f"| Agents | {stats['agents']} |",
438
+ f"| Relationships | {stats['relationships']} |",
439
+ "",
440
+ ]
441
+
442
+ # Add lineage summary
443
+ entities = graph.list_entities()
444
+ if entities:
445
+ lines.append("### Lineage")
446
+ lines.append("")
447
+ for entity in entities[:5]: # Show first 5
448
+ upstream = graph.get_lineage(entity.id, "upstream")
449
+ if upstream:
450
+ lines.append(f"- **{entity.name}** derived from: {', '.join(upstream[:3])}")
451
+ else:
452
+ lines.append(f"- **{entity.name}** (source)")
453
+ if len(entities) > 5:
454
+ lines.append(f"- ... and {len(entities) - 5} more entities")
455
+ lines.append("")
456
+
457
+ # Add activities summary
458
+ activities = graph.list_activities()
459
+ if activities:
460
+ lines.append("### Activities")
461
+ lines.append("")
462
+ for activity in activities[:5]:
463
+ duration = f" ({activity.duration:.2f}s)" if activity.duration else ""
464
+ lines.append(f"- **{activity.name}** [{activity.activity_type.value}]{duration}")
465
+ if len(activities) > 5:
466
+ lines.append(f"- ... and {len(activities) - 5} more activities")
467
+ lines.append("")
468
+
469
+ # Add agents summary
470
+ agents = graph.list_agents()
471
+ if agents:
472
+ lines.append("### Agents (Accountability)")
473
+ lines.append("")
474
+ for agent in agents[:5]:
475
+ lines.append(f"- **{agent.name}** [{agent.agent_type.value}]")
476
+ if len(agents) > 5:
477
+ lines.append(f"- ... and {len(agents) - 5} more agents")
478
+ lines.append("")
479
+
480
+ # Accountability bundle files
481
+ lines.extend([
482
+ "### Accountability Bundle",
483
+ "",
484
+ "| File | Standard | Description |",
485
+ "|------|----------|-------------|",
486
+ f"| [{self.PROVENANCE_FILENAME}]({self.PROVENANCE_FILENAME}) | CASCADE | Native provenance format |",
487
+ f"| [{self.PROV_O_FILENAME}]({self.PROV_O_FILENAME}) | W3C PROV-O | Interoperable JSON-LD |",
488
+ f"| [{self.PROV_N_FILENAME}]({self.PROV_N_FILENAME}) | W3C PROV-N | Human-readable notation |",
489
+ f"| [{self.ACTIVITIES_FILENAME}]({self.ACTIVITIES_FILENAME}) | JSONL | Activity audit log |",
490
+ f"| [{self.AGENTS_FILENAME}]({self.AGENTS_FILENAME}) | JSON | Agent attributions |",
491
+ f"| [{self.INTEGRITY_FILENAME}]({self.INTEGRITY_FILENAME}) | JSON | Hash verification manifest |",
492
+ f"| [{self.CROISSANT_FILENAME}]({self.CROISSANT_FILENAME}) | MLCommons | Croissant metadata |",
493
+ "",
494
+ ])
495
+
496
+ return "\n".join(lines)
497
+
498
+
499
+ def push_to_hub(
500
+ graph: ProvenanceGraph,
501
+ repo_id: str,
502
+ token: Optional[str] = None,
503
+ private: bool = False,
504
+ ) -> str:
505
+ """
506
+ Convenience function to push provenance to Hub.
507
+
508
+ Args:
509
+ graph: Provenance graph to push
510
+ repo_id: HuggingFace repo ID
511
+ token: HF token (optional)
512
+ private: Whether repo should be private
513
+
514
+ Returns:
515
+ URL of the pushed provenance
516
+ """
517
+ hub = HubIntegration(token=token)
518
+ return hub.push_provenance(graph, repo_id, private=private)
519
+
520
+
521
+ def pull_from_hub(repo_id: str, token: str = None) -> Optional[ProvenanceGraph]:
522
+ """
523
+ Convenience function to pull provenance from Hub.
524
+
525
+ Args:
526
+ repo_id: HuggingFace repo ID
527
+ token: HF token (optional)
528
+
529
+ Returns:
530
+ ProvenanceGraph if found
531
+ """
532
+ hub = HubIntegration(token=token)
533
+ return hub.pull_provenance(repo_id)
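
A hedged usage sketch for the bundle and the module-level helpers above. The ProvenanceGraph construction, the repo id, and the token are placeholders: the graph API lives in cascade/data/provenance.py, which is not part of this excerpt.

# Sketch only: assumes `graph` is a populated ProvenanceGraph built elsewhere,
# and that a HuggingFace token / dataset repo are available.
from cascade.data.hub import AccountabilityBundle, push_to_hub, pull_from_hub

bundle = AccountabilityBundle(graph)           # graph: a populated ProvenanceGraph (assumed)
bundle.export("./accountability_bundle")       # writes the seven artifact files locally
print(bundle.summary()["files_included"])

url = push_to_hub(graph, "username/my-dataset", token="hf_xxx", private=True)
restored = pull_from_hub("username/my-dataset", token="hf_xxx")  # None if no provenance file exists
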
cascade/data/license.py ADDED
@@ -0,0 +1,635 @@
1
+ """
2
+ SPDX License Tracking for CASCADE
3
+
4
+ Industry standard license tracking based on:
5
+ - SPDX (Software Package Data Exchange) - Linux Foundation
6
+ - HuggingFace Dataset Cards license field
7
+ - Croissant metadata license property
8
+
9
+ License Compatibility Rules:
10
+ - Permissive (MIT, Apache-2.0) → Can derive into restrictive
11
+ - Copyleft (GPL-3.0) → Derivatives must also be copyleft
12
+ - NonCommercial (CC-BY-NC-*) → Propagates non-commercial restriction
13
+ - ShareAlike (CC-BY-SA-*) → Derivatives must use same license
14
+ - NoDerivatives (CC-BY-ND-*) → Cannot create derivatives
15
+
16
+ References:
17
+ - https://spdx.org/licenses/
18
+ - https://creativecommons.org/licenses/
19
+ """
20
+
21
+ from dataclasses import dataclass, field
22
+ from enum import Enum
23
+ from typing import Dict, List, Optional, Set, Tuple, Any
24
+
25
+
26
+ class LicenseCategory(Enum):
27
+ """License categories for compatibility analysis."""
28
+ PERMISSIVE = "permissive" # MIT, Apache, BSD
29
+ WEAK_COPYLEFT = "weak-copyleft" # LGPL, MPL
30
+ STRONG_COPYLEFT = "strong-copyleft" # GPL, AGPL
31
+ CREATIVE_COMMONS = "creative-commons"
32
+ PUBLIC_DOMAIN = "public-domain" # CC0, Unlicense
33
+ PROPRIETARY = "proprietary"
34
+ UNKNOWN = "unknown"
35
+
36
+
37
+ class LicenseRestriction(Enum):
38
+ """License restrictions that propagate to derivatives."""
39
+ NONE = "none"
40
+ ATTRIBUTION = "attribution" # Must credit original
41
+ SHARE_ALIKE = "share-alike" # Derivatives same license
42
+ NON_COMMERCIAL = "non-commercial" # No commercial use
43
+ NO_DERIVATIVES = "no-derivatives" # Cannot modify
44
+ COPYLEFT = "copyleft" # Must open source derivatives
45
+
46
+
47
+ @dataclass
48
+ class SPDXLicense:
49
+ """
50
+ SPDX License Information.
51
+
52
+ Based on SPDX License List: https://spdx.org/licenses/
53
+ """
54
+ id: str # SPDX identifier (e.g., "MIT", "Apache-2.0")
55
+ name: str # Full name
56
+ category: LicenseCategory = LicenseCategory.UNKNOWN
57
+ restrictions: Set[LicenseRestriction] = field(default_factory=set)
58
+ osi_approved: bool = False # Open Source Initiative approved
59
+ fsf_libre: bool = False # FSF Free/Libre
60
+ url: Optional[str] = None # License text URL
61
+
62
+ def allows_commercial(self) -> bool:
63
+ """Check if license allows commercial use."""
64
+ return LicenseRestriction.NON_COMMERCIAL not in self.restrictions
65
+
66
+ def allows_derivatives(self) -> bool:
67
+ """Check if license allows creating derivatives."""
68
+ return LicenseRestriction.NO_DERIVATIVES not in self.restrictions
69
+
70
+ def requires_attribution(self) -> bool:
71
+ """Check if license requires attribution."""
72
+ return LicenseRestriction.ATTRIBUTION in self.restrictions
73
+
74
+ def requires_share_alike(self) -> bool:
75
+ """Check if license requires same license for derivatives."""
76
+ return (
77
+ LicenseRestriction.SHARE_ALIKE in self.restrictions or
78
+ LicenseRestriction.COPYLEFT in self.restrictions
79
+ )
80
+
81
+ def to_dict(self) -> Dict[str, Any]:
82
+ return {
83
+ "spdx_id": self.id,
84
+ "name": self.name,
85
+ "category": self.category.value,
86
+ "restrictions": [r.value for r in self.restrictions],
87
+ "osi_approved": self.osi_approved,
88
+ "fsf_libre": self.fsf_libre,
89
+ "url": self.url,
90
+ }
91
+
92
+
93
+ # SPDX License Registry - Common ML/Data licenses
94
+ SPDX_LICENSES: Dict[str, SPDXLicense] = {
95
+ # Public Domain
96
+ "CC0-1.0": SPDXLicense(
97
+ id="CC0-1.0",
98
+ name="Creative Commons Zero v1.0 Universal",
99
+ category=LicenseCategory.PUBLIC_DOMAIN,
100
+ restrictions=set(),
101
+ osi_approved=False,
102
+ fsf_libre=True,
103
+ url="https://creativecommons.org/publicdomain/zero/1.0/",
104
+ ),
105
+ "Unlicense": SPDXLicense(
106
+ id="Unlicense",
107
+ name="The Unlicense",
108
+ category=LicenseCategory.PUBLIC_DOMAIN,
109
+ restrictions=set(),
110
+ osi_approved=True,
111
+ fsf_libre=True,
112
+ url="https://unlicense.org/",
113
+ ),
114
+
115
+ # Permissive
116
+ "MIT": SPDXLicense(
117
+ id="MIT",
118
+ name="MIT License",
119
+ category=LicenseCategory.PERMISSIVE,
120
+ restrictions={LicenseRestriction.ATTRIBUTION},
121
+ osi_approved=True,
122
+ fsf_libre=True,
123
+ url="https://opensource.org/licenses/MIT",
124
+ ),
125
+ "Apache-2.0": SPDXLicense(
126
+ id="Apache-2.0",
127
+ name="Apache License 2.0",
128
+ category=LicenseCategory.PERMISSIVE,
129
+ restrictions={LicenseRestriction.ATTRIBUTION},
130
+ osi_approved=True,
131
+ fsf_libre=True,
132
+ url="https://www.apache.org/licenses/LICENSE-2.0",
133
+ ),
134
+ "BSD-2-Clause": SPDXLicense(
135
+ id="BSD-2-Clause",
136
+ name='BSD 2-Clause "Simplified" License',
137
+ category=LicenseCategory.PERMISSIVE,
138
+ restrictions={LicenseRestriction.ATTRIBUTION},
139
+ osi_approved=True,
140
+ fsf_libre=True,
141
+ url="https://opensource.org/licenses/BSD-2-Clause",
142
+ ),
143
+ "BSD-3-Clause": SPDXLicense(
144
+ id="BSD-3-Clause",
145
+ name='BSD 3-Clause "New" or "Revised" License',
146
+ category=LicenseCategory.PERMISSIVE,
147
+ restrictions={LicenseRestriction.ATTRIBUTION},
148
+ osi_approved=True,
149
+ fsf_libre=True,
150
+ url="https://opensource.org/licenses/BSD-3-Clause",
151
+ ),
152
+
153
+ # Creative Commons
154
+ "CC-BY-4.0": SPDXLicense(
155
+ id="CC-BY-4.0",
156
+ name="Creative Commons Attribution 4.0",
157
+ category=LicenseCategory.CREATIVE_COMMONS,
158
+ restrictions={LicenseRestriction.ATTRIBUTION},
159
+ osi_approved=False,
160
+ fsf_libre=True,
161
+ url="https://creativecommons.org/licenses/by/4.0/",
162
+ ),
163
+ "CC-BY-SA-4.0": SPDXLicense(
164
+ id="CC-BY-SA-4.0",
165
+ name="Creative Commons Attribution ShareAlike 4.0",
166
+ category=LicenseCategory.CREATIVE_COMMONS,
167
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.SHARE_ALIKE},
168
+ osi_approved=False,
169
+ fsf_libre=True,
170
+ url="https://creativecommons.org/licenses/by-sa/4.0/",
171
+ ),
172
+ "CC-BY-NC-4.0": SPDXLicense(
173
+ id="CC-BY-NC-4.0",
174
+ name="Creative Commons Attribution NonCommercial 4.0",
175
+ category=LicenseCategory.CREATIVE_COMMONS,
176
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NON_COMMERCIAL},
177
+ osi_approved=False,
178
+ fsf_libre=False,
179
+ url="https://creativecommons.org/licenses/by-nc/4.0/",
180
+ ),
181
+ "CC-BY-NC-SA-4.0": SPDXLicense(
182
+ id="CC-BY-NC-SA-4.0",
183
+ name="Creative Commons Attribution NonCommercial ShareAlike 4.0",
184
+ category=LicenseCategory.CREATIVE_COMMONS,
185
+ restrictions={
186
+ LicenseRestriction.ATTRIBUTION,
187
+ LicenseRestriction.NON_COMMERCIAL,
188
+ LicenseRestriction.SHARE_ALIKE,
189
+ },
190
+ osi_approved=False,
191
+ fsf_libre=False,
192
+ url="https://creativecommons.org/licenses/by-nc-sa/4.0/",
193
+ ),
194
+ "CC-BY-ND-4.0": SPDXLicense(
195
+ id="CC-BY-ND-4.0",
196
+ name="Creative Commons Attribution NoDerivatives 4.0",
197
+ category=LicenseCategory.CREATIVE_COMMONS,
198
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NO_DERIVATIVES},
199
+ osi_approved=False,
200
+ fsf_libre=False,
201
+ url="https://creativecommons.org/licenses/by-nd/4.0/",
202
+ ),
203
+
204
+ # Weak Copyleft
205
+ "LGPL-3.0": SPDXLicense(
206
+ id="LGPL-3.0",
207
+ name="GNU Lesser General Public License v3.0",
208
+ category=LicenseCategory.WEAK_COPYLEFT,
209
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
210
+ osi_approved=True,
211
+ fsf_libre=True,
212
+ url="https://www.gnu.org/licenses/lgpl-3.0.html",
213
+ ),
214
+ "MPL-2.0": SPDXLicense(
215
+ id="MPL-2.0",
216
+ name="Mozilla Public License 2.0",
217
+ category=LicenseCategory.WEAK_COPYLEFT,
218
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
219
+ osi_approved=True,
220
+ fsf_libre=True,
221
+ url="https://www.mozilla.org/en-US/MPL/2.0/",
222
+ ),
223
+
224
+ # Strong Copyleft
225
+ "GPL-3.0": SPDXLicense(
226
+ id="GPL-3.0",
227
+ name="GNU General Public License v3.0",
228
+ category=LicenseCategory.STRONG_COPYLEFT,
229
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
230
+ osi_approved=True,
231
+ fsf_libre=True,
232
+ url="https://www.gnu.org/licenses/gpl-3.0.html",
233
+ ),
234
+ "AGPL-3.0": SPDXLicense(
235
+ id="AGPL-3.0",
236
+ name="GNU Affero General Public License v3.0",
237
+ category=LicenseCategory.STRONG_COPYLEFT,
238
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
239
+ osi_approved=True,
240
+ fsf_libre=True,
241
+ url="https://www.gnu.org/licenses/agpl-3.0.html",
242
+ ),
243
+
244
+ # ML-Specific
245
+ "OpenRAIL": SPDXLicense(
246
+ id="OpenRAIL",
247
+ name="Open RAIL License",
248
+ category=LicenseCategory.PERMISSIVE,
249
+ restrictions={LicenseRestriction.ATTRIBUTION},
250
+ osi_approved=False,
251
+ fsf_libre=False,
252
+ url="https://huggingface.co/blog/open_rail",
253
+ ),
254
+ "OpenRAIL-M": SPDXLicense(
255
+ id="OpenRAIL-M",
256
+ name="Open RAIL-M License",
257
+ category=LicenseCategory.PERMISSIVE,
258
+ restrictions={LicenseRestriction.ATTRIBUTION},
259
+ osi_approved=False,
260
+ fsf_libre=False,
261
+ url="https://www.licenses.ai/blog/2022/8/26/bigscience-open-rail-m-license",
262
+ ),
263
+
264
+ # Special
265
+ "other": SPDXLicense(
266
+ id="other",
267
+ name="Other/Custom License",
268
+ category=LicenseCategory.UNKNOWN,
269
+ restrictions=set(),
270
+ osi_approved=False,
271
+ fsf_libre=False,
272
+ url=None,
273
+ ),
274
+ "unknown": SPDXLicense(
275
+ id="unknown",
276
+ name="Unknown License",
277
+ category=LicenseCategory.UNKNOWN,
278
+ restrictions=set(),
279
+ osi_approved=False,
280
+ fsf_libre=False,
281
+ url=None,
282
+ ),
283
+ }
284
+
285
+
286
+ def get_license(spdx_id: str) -> SPDXLicense:
287
+ """
288
+ Get license by SPDX identifier.
289
+
290
+ Args:
291
+ spdx_id: SPDX license identifier (case-insensitive)
292
+
293
+ Returns:
294
+ SPDXLicense object (unknown if not found)
295
+ """
296
+ # Normalize common variants
297
+ normalized = spdx_id.strip()
298
+
299
+ # Direct lookup
300
+ if normalized in SPDX_LICENSES:
301
+ return SPDX_LICENSES[normalized]
302
+
303
+ # Case-insensitive lookup
304
+ for key, lic in SPDX_LICENSES.items():
305
+ if key.lower() == normalized.lower():
306
+ return lic
307
+
308
+ # Common aliases
309
+ aliases = {
310
+ "mit": "MIT",
311
+ "apache": "Apache-2.0",
312
+ "apache2": "Apache-2.0",
313
+ "gpl": "GPL-3.0",
314
+ "gpl3": "GPL-3.0",
315
+ "lgpl": "LGPL-3.0",
316
+ "bsd": "BSD-3-Clause",
317
+ "cc0": "CC0-1.0",
318
+ "cc-by": "CC-BY-4.0",
319
+ "cc-by-sa": "CC-BY-SA-4.0",
320
+ "cc-by-nc": "CC-BY-NC-4.0",
321
+ "cc-by-nc-sa": "CC-BY-NC-SA-4.0",
322
+ "cc-by-nd": "CC-BY-ND-4.0",
323
+ "unlicense": "Unlicense",
324
+ "public domain": "CC0-1.0",
325
+ "openrail": "OpenRAIL",
326
+ }
327
+
328
+ lower_id = normalized.lower().replace("_", "-").replace(" ", "-")
329
+ if lower_id in aliases:
330
+ return SPDX_LICENSES[aliases[lower_id]]
331
+
332
+ # Return unknown
333
+ return SPDX_LICENSES["unknown"]
334
+
335
+
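
As an aside, a small sketch of how get_license resolves loose identifiers against the registry and alias table above; the import path is assumed from the diff layout.

from cascade.data.license import get_license   # module path assumed

print(get_license("apache2").id)                  # Apache-2.0 (via the alias table)
print(get_license("cc-by-sa").id)                 # CC-BY-SA-4.0
print(get_license("WTFPL").id)                    # unknown (not in the registry)
print(get_license("MIT").requires_attribution())  # True
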
336
+ @dataclass
337
+ class LicenseCompatibility:
338
+ """Result of license compatibility check."""
339
+ compatible: bool
340
+ derived_license: Optional[SPDXLicense] = None
341
+ issues: List[str] = field(default_factory=list)
342
+ warnings: List[str] = field(default_factory=list)
343
+ attribution_required: List[str] = field(default_factory=list) # Source IDs requiring attribution
344
+
345
+
346
+ class LicenseAnalyzer:
347
+ """
348
+ Analyze license compatibility for dataset derivation.
349
+
350
+ Rules:
351
+ 1. No-Derivatives: Cannot create derivatives
352
+ 2. Share-Alike: Must use same license
353
+ 3. Copyleft: Must use compatible copyleft license
354
+ 4. Non-Commercial: Restriction propagates
355
+ 5. Attribution: Must credit all sources
356
+ """
357
+
358
+ # License compatibility matrix (can this → derive into that?)
359
+ # Rows: source license category, Columns: derived license category
360
+ COMPATIBILITY_MATRIX = {
361
+ LicenseCategory.PUBLIC_DOMAIN: {
362
+ LicenseCategory.PUBLIC_DOMAIN: True,
363
+ LicenseCategory.PERMISSIVE: True,
364
+ LicenseCategory.CREATIVE_COMMONS: True,
365
+ LicenseCategory.WEAK_COPYLEFT: True,
366
+ LicenseCategory.STRONG_COPYLEFT: True,
367
+ LicenseCategory.PROPRIETARY: True,
368
+ },
369
+ LicenseCategory.PERMISSIVE: {
370
+ LicenseCategory.PUBLIC_DOMAIN: False,
371
+ LicenseCategory.PERMISSIVE: True,
372
+ LicenseCategory.CREATIVE_COMMONS: True,
373
+ LicenseCategory.WEAK_COPYLEFT: True,
374
+ LicenseCategory.STRONG_COPYLEFT: True,
375
+ LicenseCategory.PROPRIETARY: True,
376
+ },
377
+ LicenseCategory.CREATIVE_COMMONS: {
378
+ LicenseCategory.PUBLIC_DOMAIN: False,
379
+ LicenseCategory.PERMISSIVE: False, # Depends on specific CC
380
+ LicenseCategory.CREATIVE_COMMONS: True, # Depends on specific CC
381
+ LicenseCategory.WEAK_COPYLEFT: False,
382
+ LicenseCategory.STRONG_COPYLEFT: False,
383
+ LicenseCategory.PROPRIETARY: False,
384
+ },
385
+ LicenseCategory.WEAK_COPYLEFT: {
386
+ LicenseCategory.PUBLIC_DOMAIN: False,
387
+ LicenseCategory.PERMISSIVE: False,
388
+ LicenseCategory.CREATIVE_COMMONS: False,
389
+ LicenseCategory.WEAK_COPYLEFT: True,
390
+ LicenseCategory.STRONG_COPYLEFT: True,
391
+ LicenseCategory.PROPRIETARY: False,
392
+ },
393
+ LicenseCategory.STRONG_COPYLEFT: {
394
+ LicenseCategory.PUBLIC_DOMAIN: False,
395
+ LicenseCategory.PERMISSIVE: False,
396
+ LicenseCategory.CREATIVE_COMMONS: False,
397
+ LicenseCategory.WEAK_COPYLEFT: False,
398
+ LicenseCategory.STRONG_COPYLEFT: True,
399
+ LicenseCategory.PROPRIETARY: False,
400
+ },
401
+ }
402
+
403
+ def check_compatibility(
404
+ self,
405
+ source_licenses: List[Tuple[str, str]], # List of (entity_id, spdx_id)
406
+ target_license: Optional[str] = None,
407
+ ) -> LicenseCompatibility:
408
+ """
409
+ Check if source licenses allow derivation.
410
+
411
+ Args:
412
+ source_licenses: List of (entity_id, license_id) tuples
413
+ target_license: Intended license for derived work (optional)
414
+
415
+ Returns:
416
+ LicenseCompatibility result
417
+ """
418
+ if not source_licenses:
419
+ return LicenseCompatibility(
420
+ compatible=True,
421
+ derived_license=SPDX_LICENSES["unknown"],
422
+ )
423
+
424
+ issues = []
425
+ warnings = []
426
+ attribution_required = []
427
+
428
+ # Collect all restrictions
429
+ all_restrictions: Set[LicenseRestriction] = set()
430
+ licenses = []
431
+
432
+ for entity_id, spdx_id in source_licenses:
433
+ lic = get_license(spdx_id)
434
+ licenses.append((entity_id, lic))
435
+ all_restrictions.update(lic.restrictions)
436
+
437
+ # Track attribution requirements
438
+ if lic.requires_attribution():
439
+ attribution_required.append(entity_id)
440
+
441
+ # Check No-Derivatives
442
+ for entity_id, lic in licenses:
443
+ if LicenseRestriction.NO_DERIVATIVES in lic.restrictions:
444
+ issues.append(
445
+ f"Cannot derive from '{entity_id}': license '{lic.id}' prohibits derivatives"
446
+ )
447
+
448
+ if issues:
449
+ return LicenseCompatibility(
450
+ compatible=False,
451
+ issues=issues,
452
+ warnings=warnings,
453
+ attribution_required=attribution_required,
454
+ )
455
+
456
+ # Determine derived license
457
+ derived = self._compute_derived_license(licenses, all_restrictions)
458
+
459
+ # Check target license compatibility
460
+ if target_license:
461
+ target = get_license(target_license)
462
+ if not self._can_relicense(derived, target):
463
+ issues.append(
464
+ f"Cannot license derived work as '{target.id}': "
465
+ f"must use '{derived.id}' or compatible license"
466
+ )
467
+
468
+ # Add warnings
469
+ if LicenseRestriction.NON_COMMERCIAL in all_restrictions:
470
+ warnings.append("Derived work restricted to non-commercial use only")
471
+
472
+ if LicenseRestriction.SHARE_ALIKE in all_restrictions:
473
+ warnings.append(f"Derived work must use ShareAlike-compatible license: {derived.id}")
474
+
475
+ if LicenseRestriction.COPYLEFT in all_restrictions:
476
+ warnings.append(f"Derived work must use copyleft license: {derived.id}")
477
+
478
+ return LicenseCompatibility(
479
+ compatible=len(issues) == 0,
480
+ derived_license=derived,
481
+ issues=issues,
482
+ warnings=warnings,
483
+ attribution_required=attribution_required,
484
+ )
485
+
486
+ def _compute_derived_license(
487
+ self,
488
+ licenses: List[Tuple[str, SPDXLicense]],
489
+ all_restrictions: Set[LicenseRestriction],
490
+ ) -> SPDXLicense:
491
+ """
492
+ Compute the most restrictive license for derived work.
493
+
494
+ The derived license is the "lowest common denominator" that
495
+ satisfies all source license requirements.
496
+ """
497
+ # Priority: Strong Copyleft > Weak Copyleft > CC-SA > CC-NC > Permissive > Public Domain
498
+
499
+ has_strong_copyleft = any(
500
+ lic.category == LicenseCategory.STRONG_COPYLEFT
501
+ for _, lic in licenses
502
+ )
503
+ has_weak_copyleft = any(
504
+ lic.category == LicenseCategory.WEAK_COPYLEFT
505
+ for _, lic in licenses
506
+ )
507
+ has_share_alike = LicenseRestriction.SHARE_ALIKE in all_restrictions
508
+ has_non_commercial = LicenseRestriction.NON_COMMERCIAL in all_restrictions
509
+
510
+ # Strong copyleft dominates
511
+ if has_strong_copyleft:
512
+ for _, lic in licenses:
513
+ if lic.category == LicenseCategory.STRONG_COPYLEFT:
514
+ return lic
515
+
516
+ # Weak copyleft next
517
+ if has_weak_copyleft:
518
+ for _, lic in licenses:
519
+ if lic.category == LicenseCategory.WEAK_COPYLEFT:
520
+ return lic
521
+
522
+ # CC with restrictions
523
+ if has_share_alike and has_non_commercial:
524
+ return SPDX_LICENSES["CC-BY-NC-SA-4.0"]
525
+ elif has_share_alike:
526
+ return SPDX_LICENSES["CC-BY-SA-4.0"]
527
+ elif has_non_commercial:
528
+ return SPDX_LICENSES["CC-BY-NC-4.0"]
529
+
530
+ # Most permissive with attribution
531
+ if LicenseRestriction.ATTRIBUTION in all_restrictions:
532
+ # Check if any source requires specific license
533
+ for _, lic in licenses:
534
+ if lic.category == LicenseCategory.CREATIVE_COMMONS:
535
+ return lic
536
+ return SPDX_LICENSES["CC-BY-4.0"]
537
+
538
+ # Public domain
539
+ return SPDX_LICENSES["CC0-1.0"]
540
+
541
+ def _can_relicense(self, source: SPDXLicense, target: SPDXLicense) -> bool:
542
+ """Check if source license allows relicensing to target."""
543
+ # Same license is always OK
544
+ if source.id == target.id:
545
+ return True
546
+
547
+ # No relicensing from copyleft to non-copyleft
548
+ if LicenseRestriction.COPYLEFT in source.restrictions:
549
+ if LicenseRestriction.COPYLEFT not in target.restrictions:
550
+ return False
551
+
552
+ # No relicensing from share-alike to non-share-alike
553
+ if LicenseRestriction.SHARE_ALIKE in source.restrictions:
554
+ if LicenseRestriction.SHARE_ALIKE not in target.restrictions:
555
+ return False
556
+
557
+ # Non-commercial must propagate
558
+ if LicenseRestriction.NON_COMMERCIAL in source.restrictions:
559
+ if LicenseRestriction.NON_COMMERCIAL not in target.restrictions:
560
+ return False
561
+
562
+ return True
563
+
564
+ def generate_attribution(
565
+ self,
566
+ sources: List[Tuple[str, str, str]], # (entity_id, license_id, name)
567
+ ) -> str:
568
+ """
569
+ Generate attribution text for derived work.
570
+
571
+ Args:
572
+ sources: List of (entity_id, license_id, name) tuples
573
+
574
+ Returns:
575
+ Attribution text
576
+ """
577
+ lines = [
578
+ "## Attribution",
579
+ "",
580
+ "This dataset is derived from the following sources:",
581
+ "",
582
+ ]
583
+
584
+ for entity_id, license_id, name in sources:
585
+ lic = get_license(license_id)
586
+ if lic.requires_attribution():
587
+ line = f"- **{name}** (`{entity_id}`)"
588
+ if lic.url:
589
+ line += f" - Licensed under [{lic.id}]({lic.url})"
590
+ else:
591
+ line += f" - Licensed under {lic.id}"
592
+ lines.append(line)
593
+
594
+ if len(lines) == 4: # No attributions needed
595
+ return ""
596
+
597
+ lines.append("")
598
+ return "\n".join(lines)
599
+
600
+
601
+ # Singleton analyzer
602
+ _analyzer = LicenseAnalyzer()
603
+
604
+
605
+ def check_license_compatibility(
606
+ sources: List[Tuple[str, str]],
607
+ target: Optional[str] = None,
608
+ ) -> LicenseCompatibility:
609
+ """
610
+ Convenience function to check license compatibility.
611
+
612
+ Args:
613
+ sources: List of (entity_id, license_id) tuples
614
+ target: Intended license for derived work
615
+
616
+ Returns:
617
+ LicenseCompatibility result
618
+ """
619
+ return _analyzer.check_compatibility(sources, target)
620
+
621
+
622
+ def get_derived_license(sources: List[str]) -> SPDXLicense:
623
+ """
624
+ Get the appropriate license for a work derived from given sources.
625
+
626
+ Args:
627
+ sources: List of SPDX license identifiers
628
+
629
+ Returns:
630
+ SPDXLicense for the derived work
631
+ """
632
+ result = _analyzer.check_compatibility([
633
+ (f"source_{i}", lic) for i, lic in enumerate(sources)
634
+ ])
635
+ return result.derived_license or SPDX_LICENSES["unknown"]
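
A short compatibility-check sketch using the module-level helpers above; the entity ids are placeholders.

from cascade.data.license import check_license_compatibility, get_derived_license

result = check_license_compatibility(
    sources=[("dataset:a", "MIT"), ("dataset:b", "CC-BY-SA-4.0")],
    target="Apache-2.0",
)
print(result.compatible)            # False: ShareAlike blocks relicensing to Apache-2.0
print(result.derived_license.id)    # CC-BY-SA-4.0 (most restrictive common denominator)
print(result.attribution_required)  # ['dataset:a', 'dataset:b']

print(get_derived_license(["MIT", "CC0-1.0"]).id)  # CC-BY-4.0
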
cascade/data/live.py ADDED
@@ -0,0 +1,844 @@
1
+ """
2
+ Live Document Tracer
3
+
4
+ Real-time streaming of document-centric provenance events.
5
+ This is the LIVE version of what the export system freezes.
6
+
7
+ Instead of: Model runs → Process → Export frozen provenance
8
+ We do: Model runs → STREAM events → View live document highlights
9
+
10
+ Same data model as the observer/exporter, just streamed in real-time
11
+ with document snippet context attached.
12
+
13
+ Usage:
14
+ # Create observer with live streaming
15
+ observer = DatasetObserver("my_pipeline")
16
+ tracer = LiveDocumentTracer(observer)
17
+
18
+ # Subscribe to events
19
+ tracer.on_event(my_handler)
20
+
21
+ # Or stream to async consumer
22
+ async for event in tracer.stream():
23
+ render_highlight(event)
24
+ """
25
+
26
+ import asyncio
27
+ import json
28
+ import time
29
+ from dataclasses import dataclass, field
30
+ from enum import Enum
31
+ from typing import Any, AsyncGenerator, Callable, Dict, Generator, List, Optional, Set, Tuple
32
+ from queue import Queue
33
+ from threading import Lock
34
+ from pathlib import Path
35
+
36
+
37
+ class TraceEventType(Enum):
38
+ """Types of document trace events."""
39
+ # Data flow events
40
+ DOCUMENT_TOUCHED = "document_touched" # Model accessed this document/record
41
+ SPAN_HIGHLIGHTED = "span_highlighted" # Specific text span being processed
42
+ ASSOCIATION_CREATED = "association_created" # Link between two spans/documents
43
+
44
+ # Activity events
45
+ ACTIVITY_STARTED = "activity_started"
46
+ ACTIVITY_PROGRESS = "activity_progress"
47
+ ACTIVITY_COMPLETED = "activity_completed"
48
+
49
+ # Entity events
50
+ ENTITY_CREATED = "entity_created"
51
+ ENTITY_DERIVED = "entity_derived"
52
+
53
+ # Relationship events
54
+ LINK_CREATED = "link_created"
55
+
56
+
57
+ @dataclass
58
+ class DocumentSpan:
59
+ """
60
+ A span within a document being traced.
61
+
62
+ This is the atomic unit of live visualization -
63
+ the specific text/content the model is touching.
64
+ """
65
+ document_id: str # Entity or record ID
66
+ document_name: str # Human-readable name
67
+ field_name: str = "" # Column/field if applicable
68
+ row_index: int = -1 # Row if applicable
69
+
70
+ # The actual content span
71
+ text: str = "" # The snippet text
72
+ start_char: int = -1 # Start position in full text
73
+ end_char: int = -1 # End position in full text
74
+
75
+ # Visual hints
76
+ highlight_type: str = "default" # "source", "target", "match", "attention"
77
+ confidence: float = 1.0 # For attention/relevance visualization
78
+
79
+ # Metadata
80
+ metadata: Dict[str, Any] = field(default_factory=dict)
81
+
82
+ def to_dict(self) -> Dict[str, Any]:
83
+ return {
84
+ "document_id": self.document_id,
85
+ "document_name": self.document_name,
86
+ "field_name": self.field_name,
87
+ "row_index": self.row_index,
88
+ "text": self.text,
89
+ "start_char": self.start_char,
90
+ "end_char": self.end_char,
91
+ "highlight_type": self.highlight_type,
92
+ "confidence": self.confidence,
93
+ "metadata": self.metadata,
94
+ }
95
+
96
+
97
+ @dataclass
98
+ class DocumentAssociation:
99
+ """
100
+ An association between two document spans.
101
+
102
+ Represents the model saying "this connects to that".
103
+ """
104
+ source: DocumentSpan
105
+ target: DocumentSpan
106
+ association_type: str = "related" # "match", "derived", "similar", "references"
107
+ confidence: float = 1.0
108
+
109
+ # Why this association was made
110
+ reason: str = ""
111
+
112
+ def to_dict(self) -> Dict[str, Any]:
113
+ return {
114
+ "source": self.source.to_dict(),
115
+ "target": self.target.to_dict(),
116
+ "association_type": self.association_type,
117
+ "confidence": self.confidence,
118
+ "reason": self.reason,
119
+ }
120
+
121
+
122
+ @dataclass
123
+ class TraceEvent:
124
+ """
125
+ A single trace event for live document visualization.
126
+
127
+ This is what gets streamed to the UI in real-time.
128
+ """
129
+ event_type: TraceEventType
130
+ timestamp: float = field(default_factory=time.time)
131
+
132
+ # Activity context
133
+ activity_id: Optional[str] = None
134
+ activity_name: Optional[str] = None
135
+ activity_type: Optional[str] = None
136
+
137
+ # Document spans involved
138
+ spans: List[DocumentSpan] = field(default_factory=list)
139
+
140
+ # Association if this event creates one
141
+ association: Optional[DocumentAssociation] = None
142
+
143
+ # Progress for long operations
144
+ progress: Optional[float] = None # 0.0 to 1.0
145
+ progress_message: Optional[str] = None
146
+
147
+ # Raw provenance data (for export compatibility)
148
+ entity_id: Optional[str] = None
149
+ relationship_type: Optional[str] = None
150
+
151
+ # Metadata
152
+ metadata: Dict[str, Any] = field(default_factory=dict)
153
+
154
+ def to_dict(self) -> Dict[str, Any]:
155
+ return {
156
+ "event_type": self.event_type.value,
157
+ "timestamp": self.timestamp,
158
+ "activity_id": self.activity_id,
159
+ "activity_name": self.activity_name,
160
+ "activity_type": self.activity_type,
161
+ "spans": [s.to_dict() for s in self.spans],
162
+ "association": self.association.to_dict() if self.association else None,
163
+ "progress": self.progress,
164
+ "progress_message": self.progress_message,
165
+ "entity_id": self.entity_id,
166
+ "metadata": self.metadata,
167
+ }
168
+
169
+ def to_json(self) -> str:
170
+ return json.dumps(self.to_dict(), default=str)
171
+
172
+
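
A small illustrative sketch of the event payloads defined above, built by hand and serialized the way the tracer streams them; ids and text are placeholders.

# Sketch only: constructs the dataclasses defined above and inspects the JSON payload.
span_a = DocumentSpan(document_id="ent:a:0", document_name="dataset_a", field_name="text",
                      row_index=0, text="quarterly revenue rose", highlight_type="source")
span_b = DocumentSpan(document_id="ent:b:7", document_name="dataset_b", field_name="text",
                      row_index=7, text="Q3 revenue increased", highlight_type="match")

assoc = DocumentAssociation(source=span_a, target=span_b,
                            association_type="match", confidence=0.91,
                            reason="high embedding similarity")

event = TraceEvent(event_type=TraceEventType.ASSOCIATION_CREATED,
                   activity_name="dedupe", activity_type="compare",
                   spans=[span_a, span_b], association=assoc)
print(event.to_json())   # the same payload that emit() wraps into a tape record
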
173
+ class LiveDocumentTracer:
174
+ """
175
+ Real-time document tracing for live visualization.
176
+
177
+ Hooks into DatasetObserver to stream events as they happen,
178
+ enriched with document snippet context for visualization.
179
+
180
+ This is the LIVE version of what CroissantExporter freezes.
181
+
182
+ NEW: Now writes all events to a tape file (JSONL) for buffered playback!
183
+ """
184
+
185
+ def __init__(self, observer=None, buffer_size: int = 1000, log_dir: str = "./logs"):
186
+ """
187
+ Initialize tracer.
188
+
189
+ Args:
190
+ observer: DatasetObserver to hook into (optional)
191
+ buffer_size: Max events to buffer for replay
192
+ log_dir: Directory for tape files (JSONL logs)
193
+ """
194
+ self.observer = observer
195
+ self.buffer_size = buffer_size
196
+
197
+ # Event subscribers
198
+ self._handlers: List[Callable[[TraceEvent], None]] = []
199
+ self._async_handlers: List[Callable[[TraceEvent], Any]] = []
200
+
201
+ # Event buffer for replay/late subscribers
202
+ self._buffer: List[TraceEvent] = []
203
+ self._buffer_lock = Lock()
204
+
205
+ # Async queue for streaming
206
+ self._async_queue: Optional[asyncio.Queue] = None
207
+
208
+ # Current activity context
209
+ self._current_activity_id: Optional[str] = None
210
+ self._current_activity_name: Optional[str] = None
211
+ self._current_activity_type: Optional[str] = None
212
+
213
+ # Document context cache
214
+ self._document_cache: Dict[str, Dict[str, Any]] = {}
215
+
216
+ # === TAPE FILE FOR PLAYBACK ===
217
+ self._log_dir = Path(log_dir)
218
+ self._log_dir.mkdir(parents=True, exist_ok=True)
219
+ self._session_id = int(time.time())
220
+ self._tape_path = self._log_dir / f"unity_tape_{self._session_id}.jsonl"
221
+ self._tape_file = None
222
+ self._tape_lock = Lock()
223
+ self._event_count = 0
224
+
225
+ # ═══════════════════════════════════════════════════════════════════════════
226
+ # SUBSCRIPTION
227
+ # ═══════════════════════════════════════════════════════════════════════════
228
+
229
+ def on_event(self, handler: Callable[[TraceEvent], None]):
230
+ """Subscribe to trace events (sync handler)."""
231
+ self._handlers.append(handler)
232
+ return self # Allow chaining
233
+
234
+ def on_event_async(self, handler: Callable[[TraceEvent], Any]):
235
+ """Subscribe to trace events (async handler)."""
236
+ self._async_handlers.append(handler)
237
+ return self
238
+
239
+ def remove_handler(self, handler):
240
+ """Unsubscribe a handler."""
241
+ if handler in self._handlers:
242
+ self._handlers.remove(handler)
243
+ if handler in self._async_handlers:
244
+ self._async_handlers.remove(handler)
245
+
246
+ # ═══════════════════════════════════════════════════════════════════════════
247
+ # EVENT EMISSION
248
+ # ═══════════════════════════════════════════════════════════════════════════
249
+
250
+ def emit(self, event: TraceEvent):
251
+ """
252
+ Emit a trace event to all subscribers.
253
+
254
+ Called internally when provenance events occur.
255
+ Also writes to tape file for buffered playback!
256
+ """
257
+ self._event_count += 1
258
+
259
+ # Add to buffer
260
+ with self._buffer_lock:
261
+ self._buffer.append(event)
262
+ if len(self._buffer) > self.buffer_size:
263
+ self._buffer.pop(0)
264
+
265
+ # === WRITE TO TAPE (JSONL) ===
266
+ self._write_to_tape(event)
267
+
268
+ # Call sync handlers
269
+ for handler in self._handlers:
270
+ try:
271
+ handler(event)
272
+ except Exception as e:
273
+ print(f"Handler error: {e}")
274
+
275
+ # Queue for async handlers
276
+ if self._async_queue:
277
+ try:
278
+ self._async_queue.put_nowait(event)
279
+ except asyncio.QueueFull:
280
+ pass # Drop if queue full
281
+
282
+ def _write_to_tape(self, event: TraceEvent):
283
+ """Write event to tape file for later playback."""
284
+ try:
285
+ with self._tape_lock:
286
+ # Lazy open the file
287
+ if self._tape_file is None:
288
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
289
+ print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
290
+
291
+ # Build tape record with full context
292
+ record = {
293
+ "seq": self._event_count,
294
+ "event": event.to_dict(),
295
+ "session_id": self._session_id,
296
+ }
297
+
298
+ json_line = json.dumps(record, default=str) + "\n"
299
+ self._tape_file.write(json_line)
300
+ self._tape_file.flush()
301
+
302
+ # Debug: Log first few events
303
+ if self._event_count <= 3:
304
+ print(f"[CASCADE] 📝 Wrote event {self._event_count} to tape: {event.event_type}")
305
+ except Exception as e:
306
+ # Don't let tape errors break the main flow
307
+ print(f"[CASCADE] ⚠️ Tape write error: {e}")
308
+ pass
309
+
310
+ def _write_raw_to_tape(self, record: Dict[str, Any]):
311
+ """Write a raw record to tape file (for docspace events)."""
312
+ try:
313
+ with self._tape_lock:
314
+ # Lazy open the file
315
+ if self._tape_file is None:
316
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
317
+ print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
318
+
319
+ self._tape_file.write(json.dumps(record, default=str) + "\n")
320
+ self._tape_file.flush()
321
+ except Exception:
322
+ pass
323
+
324
+ # ═══════════════════════════════════════════════════════════════════════════
325
+ # DOCUMENT SPACE EVENTS (for polling iframe)
326
+ # ═══════════════════════════════════════════════════════════════════════════
327
+
328
+ def emit_entity(self, entity_id: str, source: str, text: str, index: int, side: str = "a"):
329
+ """
330
+ Emit an entity for Document Space visualization.
331
+
332
+ Args:
333
+ entity_id: Unique ID for the entity
334
+ source: Source dataset name
335
+ text: Preview text (truncated)
336
+ index: Row index in dataset
337
+ side: "a" or "b" to indicate which dataset
338
+ """
339
+ self._event_count += 1
340
+ record = {
341
+ "seq": self._event_count,
342
+ "type": "docspace_entity",
343
+ "side": side,
344
+ "data": {
345
+ "id": entity_id,
346
+ "source": source,
347
+ "text": text[:200],
348
+ "index": index,
349
+ },
350
+ "session_id": self._session_id,
351
+ }
352
+ self._write_raw_to_tape(record)
353
+
354
+ def emit_match(self, doc_a_id: str, doc_b_id: str, score: float):
355
+ """
356
+ Emit a match for Document Space visualization.
357
+
358
+ Args:
359
+ doc_a_id: ID of entity from dataset A
360
+ doc_b_id: ID of entity from dataset B
361
+ score: Similarity score (0-1)
362
+ """
363
+ self._event_count += 1
364
+ record = {
365
+ "seq": self._event_count,
366
+ "type": "docspace_match",
367
+ "data": {
368
+ "docA": doc_a_id,
369
+ "docB": doc_b_id,
370
+ "score": float(score),
371
+ },
372
+ "session_id": self._session_id,
373
+ }
374
+ self._write_raw_to_tape(record)
375
+
376
+ def emit_phase(self, phase: str, progress: float, message: str = ""):
377
+ """
378
+ Emit a phase update for Document Space.
379
+
380
+ Args:
381
+ phase: Current phase (embedding_a, embedding_b, comparing, complete)
382
+ progress: Progress 0-1
383
+ message: Status message
384
+ """
385
+ self._event_count += 1
386
+ record = {
387
+ "seq": self._event_count,
388
+ "type": "docspace_phase",
389
+ "data": {
390
+ "phase": phase,
391
+ "progress": float(progress),
392
+ "message": message,
393
+ },
394
+ "session_id": self._session_id,
395
+ }
396
+ self._write_raw_to_tape(record)
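A minimal sketch of driving the Document Space emitters above from a matching loop; the dataset names, entity IDs, and scores are invented for illustration, and the tracer comes from the module's own create_live_tracer helper defined further down in this file.

    # Illustrative driver for the docspace emitters (all values are made up).
    tracer = create_live_tracer()

    # Register one preview entity from each side of the comparison.
    tracer.emit_entity("a-0", source="patients", text="Jane Doe, 1984-03-02, Springfield", index=0, side="a")
    tracer.emit_entity("b-0", source="claims", text="J. Doe, claim #4411, Springfield", index=0, side="b")

    # Report progress while scoring, then record a confident match.
    tracer.emit_phase("comparing", progress=0.5, message="scoring candidate pairs")
    tracer.emit_match("a-0", "b-0", score=0.93)
    tracer.emit_phase("complete", progress=1.0)

    tracer.close_tape()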
397
+
398
+ def close_tape(self):
399
+ """Close the tape file (call when session ends)."""
400
+ with self._tape_lock:
401
+ if self._tape_file:
402
+ self._tape_file.close()
403
+ self._tape_file = None
404
+ print(f"[CASCADE] 📼 Unity tape closed: {self._event_count} events → {self._tape_path}")
405
+
406
+ def get_tape_path(self) -> Optional[Path]:
407
+ """Get the path to the current tape file (whether open or not)."""
408
+ return self._tape_path
409
+
410
+ @staticmethod
411
+ def load_tape(tape_path: str) -> List[Dict[str, Any]]:
412
+ """
413
+ Load events from a tape file for playback.
414
+
415
+ Args:
416
+ tape_path: Path to the .jsonl tape file
417
+
418
+ Returns:
419
+ List of event records in chronological order
420
+ """
421
+ events = []
422
+ with open(tape_path, "r", encoding="utf-8") as f:
423
+ for line in f:
424
+ line = line.strip()
425
+ if line:
426
+ try:
427
+ events.append(json.loads(line))
428
+ except json.JSONDecodeError:
429
+ pass # Skip malformed lines
430
+ return events
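A small sketch of reading a tape back for offline inspection; the file name is an example, and the record fields follow the seq/event/session_id and type/data shapes written by the methods above.

    # Hypothetical playback of a recorded tape.
    records = LiveDocumentTracer.load_tape("unity_session.jsonl")

    # Tally records by kind, falling back across the two record shapes.
    counts = {}
    for rec in records:
        kind = rec.get("type") or rec.get("event", {}).get("event_type", "unknown")
        counts[kind] = counts.get(kind, 0) + 1

    for kind, n in sorted(counts.items()):
        print(f"{kind}: {n}")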
431
+
432
+ async def stream(self) -> Generator[TraceEvent, None, None]:
433
+ """
434
+ Async generator for streaming events.
435
+
436
+ Usage:
437
+ async for event in tracer.stream():
438
+ await render(event)
439
+ """
440
+ self._async_queue = asyncio.Queue(maxsize=self.buffer_size)
441
+
442
+ # Replay buffer first
443
+ with self._buffer_lock:
444
+ for event in self._buffer:
445
+ yield event
446
+
447
+ # Then stream new events
448
+ while True:
449
+ event = await self._async_queue.get()
450
+ yield event
451
+
452
+ def get_buffer(self) -> List[TraceEvent]:
453
+ """Get buffered events for replay."""
454
+ with self._buffer_lock:
455
+ return list(self._buffer)
456
+
457
+ def clear_buffer(self):
458
+ """Clear the event buffer."""
459
+ with self._buffer_lock:
460
+ self._buffer.clear()
461
+
462
+ # ═══════════════════════════════════════════════════════════════════════════
463
+ # TRACING API - Call these to emit events
464
+ # ═══════════════════════════════════════════════════════════════════════════
465
+
466
+ def start_activity(
467
+ self,
468
+ activity_id: str,
469
+ activity_name: str,
470
+ activity_type: str = "transform",
471
+ ):
472
+ """Signal start of an activity (for context)."""
473
+ self._current_activity_id = activity_id
474
+ self._current_activity_name = activity_name
475
+ self._current_activity_type = activity_type
476
+
477
+ self.emit(TraceEvent(
478
+ event_type=TraceEventType.ACTIVITY_STARTED,
479
+ activity_id=activity_id,
480
+ activity_name=activity_name,
481
+ activity_type=activity_type,
482
+ ))
483
+
484
+ def end_activity(self, activity_id: str = None):
485
+ """Signal end of an activity."""
486
+ self.emit(TraceEvent(
487
+ event_type=TraceEventType.ACTIVITY_COMPLETED,
488
+ activity_id=activity_id or self._current_activity_id,
489
+ activity_name=self._current_activity_name,
490
+ activity_type=self._current_activity_type,
491
+ ))
492
+ self._current_activity_id = None
493
+ self._current_activity_name = None
494
+ self._current_activity_type = None
495
+
496
+ def report_progress(
497
+ self,
498
+ progress: float,
499
+ message: str = "",
500
+ activity_id: str = None,
501
+ ):
502
+ """Report progress on current activity."""
503
+ self.emit(TraceEvent(
504
+ event_type=TraceEventType.ACTIVITY_PROGRESS,
505
+ activity_id=activity_id or self._current_activity_id,
506
+ activity_name=self._current_activity_name,
507
+ progress=progress,
508
+ progress_message=message,
509
+ ))
510
+
511
+ def touch_document(
512
+ self,
513
+ document_id: str,
514
+ document_name: str,
515
+ snippet: str = "",
516
+ field_name: str = "",
517
+ row_index: int = -1,
518
+ highlight_type: str = "default",
519
+ confidence: float = 1.0,
520
+ **metadata,
521
+ ):
522
+ """
523
+ Signal that the model touched a document/record.
524
+
525
+ This creates a highlight in the live view.
526
+ """
527
+ span = DocumentSpan(
528
+ document_id=document_id,
529
+ document_name=document_name,
530
+ field_name=field_name,
531
+ row_index=row_index,
532
+ text=snippet,
533
+ highlight_type=highlight_type,
534
+ confidence=confidence,
535
+ metadata=metadata,
536
+ )
537
+
538
+ self.emit(TraceEvent(
539
+ event_type=TraceEventType.DOCUMENT_TOUCHED,
540
+ activity_id=self._current_activity_id,
541
+ activity_name=self._current_activity_name,
542
+ activity_type=self._current_activity_type,
543
+ spans=[span],
544
+ entity_id=document_id,
545
+ metadata=metadata,
546
+ ))
547
+
548
+ return span
549
+
550
+ def highlight_span(
551
+ self,
552
+ document_id: str,
553
+ document_name: str,
554
+ text: str,
555
+ start_char: int = -1,
556
+ end_char: int = -1,
557
+ field_name: str = "",
558
+ row_index: int = -1,
559
+ highlight_type: str = "attention",
560
+ confidence: float = 1.0,
561
+ **metadata,
562
+ ):
563
+ """
564
+ Highlight a specific span within a document.
565
+
566
+ For showing exactly where in the text the model is focusing.
567
+ """
568
+ span = DocumentSpan(
569
+ document_id=document_id,
570
+ document_name=document_name,
571
+ field_name=field_name,
572
+ row_index=row_index,
573
+ text=text,
574
+ start_char=start_char,
575
+ end_char=end_char,
576
+ highlight_type=highlight_type,
577
+ confidence=confidence,
578
+ metadata=metadata,
579
+ )
580
+
581
+ self.emit(TraceEvent(
582
+ event_type=TraceEventType.SPAN_HIGHLIGHTED,
583
+ activity_id=self._current_activity_id,
584
+ activity_name=self._current_activity_name,
585
+ activity_type=self._current_activity_type,
586
+ spans=[span],
587
+ metadata=metadata,
588
+ ))
589
+
590
+ return span
591
+
592
+ def create_association(
593
+ self,
594
+ source_doc_id: str,
595
+ source_doc_name: str,
596
+ source_text: str,
597
+ target_doc_id: str,
598
+ target_doc_name: str,
599
+ target_text: str,
600
+ association_type: str = "related",
601
+ confidence: float = 1.0,
602
+ reason: str = "",
603
+ **metadata,
604
+ ):
605
+ """
606
+ Create an association between two document spans.
607
+
608
+ This is the "A connects to B" visualization.
609
+ """
610
+ source = DocumentSpan(
611
+ document_id=source_doc_id,
612
+ document_name=source_doc_name,
613
+ text=source_text,
614
+ highlight_type="source",
615
+ confidence=confidence,
616
+ )
617
+
618
+ target = DocumentSpan(
619
+ document_id=target_doc_id,
620
+ document_name=target_doc_name,
621
+ text=target_text,
622
+ highlight_type="target",
623
+ confidence=confidence,
624
+ )
625
+
626
+ association = DocumentAssociation(
627
+ source=source,
628
+ target=target,
629
+ association_type=association_type,
630
+ confidence=confidence,
631
+ reason=reason,
632
+ )
633
+
634
+ self.emit(TraceEvent(
635
+ event_type=TraceEventType.ASSOCIATION_CREATED,
636
+ activity_id=self._current_activity_id,
637
+ activity_name=self._current_activity_name,
638
+ activity_type=self._current_activity_type,
639
+ spans=[source, target],
640
+ association=association,
641
+ metadata=metadata,
642
+ ))
643
+
644
+ return association
645
+
646
+ def entity_created(
647
+ self,
648
+ entity_id: str,
649
+ entity_name: str,
650
+ record_count: int = None,
651
+ **metadata,
652
+ ):
653
+ """Signal that a new entity was created in provenance."""
654
+ self.emit(TraceEvent(
655
+ event_type=TraceEventType.ENTITY_CREATED,
656
+ activity_id=self._current_activity_id,
657
+ activity_name=self._current_activity_name,
658
+ entity_id=entity_id,
659
+ metadata={"name": entity_name, "record_count": record_count, **metadata},
660
+ ))
661
+
662
+ def entity_derived(
663
+ self,
664
+ derived_id: str,
665
+ derived_name: str,
666
+ source_ids: List[str],
667
+ **metadata,
668
+ ):
669
+ """Signal that an entity was derived from others."""
670
+ self.emit(TraceEvent(
671
+ event_type=TraceEventType.ENTITY_DERIVED,
672
+ activity_id=self._current_activity_id,
673
+ activity_name=self._current_activity_name,
674
+ entity_id=derived_id,
675
+ metadata={"name": derived_name, "sources": source_ids, **metadata},
676
+ ))
677
+
678
+ def link_created(
679
+ self,
680
+ source_id: str,
681
+ target_id: str,
682
+ relationship_type: str,
683
+ **metadata,
684
+ ):
685
+ """Signal that a provenance link was created."""
686
+ self.emit(TraceEvent(
687
+ event_type=TraceEventType.LINK_CREATED,
688
+ activity_id=self._current_activity_id,
689
+ activity_name=self._current_activity_name,
690
+ relationship_type=relationship_type,
691
+ metadata={"source": source_id, "target": target_id, **metadata},
692
+ ))
693
+
694
+ # ═══════════════════════════════════════════════════════════════════════════
695
+ # EXPORT (Freeze the live state)
696
+ # ═══════════════════════════════════════════════════════════════════════════
697
+
698
+ def export_session(self) -> Dict[str, Any]:
699
+ """
700
+ Export the trace session as frozen data.
701
+
702
+ This is the bridge between live and export -
703
+ same data, just frozen at a point in time.
704
+ """
705
+ with self._buffer_lock:
706
+ return {
707
+ "events": [e.to_dict() for e in self._buffer],
708
+ "event_count": len(self._buffer),
709
+ "exported_at": time.time(),
710
+ }
711
+
712
+ def export_associations(self) -> List[Dict[str, Any]]:
713
+ """Export just the associations for visualization."""
714
+ associations = []
715
+ with self._buffer_lock:
716
+ for event in self._buffer:
717
+ if event.association:
718
+ associations.append(event.association.to_dict())
719
+ return associations
720
+
721
+ def export_timeline(self) -> List[Dict[str, Any]]:
722
+ """Export events as a timeline."""
723
+ timeline = []
724
+ with self._buffer_lock:
725
+ for event in self._buffer:
726
+ timeline.append({
727
+ "timestamp": event.timestamp,
728
+ "type": event.event_type.value,
729
+ "activity": event.activity_name,
730
+ "spans": len(event.spans),
731
+ "has_association": event.association is not None,
732
+ })
733
+ return timeline
734
+
735
+
736
+ # ═══════════════════════════════════════════════════════════════════════════════
737
+ # CONSOLE RENDERER - Simple text-based live view
738
+ # ═══════════════════════════════════════════════════════════════════════════════
739
+
740
+ class ConsoleTraceRenderer:
741
+ """
742
+ Simple console renderer for live document traces.
743
+
744
+ Good for debugging and terminal-based workflows.
745
+ """
746
+
747
+ def __init__(self, show_snippets: bool = True, max_snippet_len: int = 80):
748
+ self.show_snippets = show_snippets
749
+ self.max_snippet_len = max_snippet_len
750
+
751
+ def render(self, event: TraceEvent):
752
+ """Render event to console."""
753
+ timestamp = time.strftime("%H:%M:%S", time.localtime(event.timestamp))
754
+
755
+ if event.event_type == TraceEventType.ACTIVITY_STARTED:
756
+ print(f"\n[{timestamp}] ▶ {event.activity_name} ({event.activity_type})")
757
+ print("─" * 60)
758
+
759
+ elif event.event_type == TraceEventType.ACTIVITY_COMPLETED:
760
+ print("─" * 60)
761
+ print(f"[{timestamp}] ✓ {event.activity_name} completed")
762
+
763
+ elif event.event_type == TraceEventType.ACTIVITY_PROGRESS:
764
+ pct = int((event.progress or 0) * 100)
765
+ bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
766
+ msg = event.progress_message or ""
767
+ print(f"\r[{timestamp}] [{bar}] {pct}% {msg}", end="", flush=True)
768
+ if pct >= 100:
769
+ print()
770
+
771
+ elif event.event_type == TraceEventType.DOCUMENT_TOUCHED:
772
+ for span in event.spans:
773
+ snippet = self._truncate(span.text)
774
+ print(f"[{timestamp}] 📄 {span.document_name}", end="")
775
+ if span.field_name:
776
+ print(f"[{span.field_name}]", end="")
777
+ if span.row_index >= 0:
778
+ print(f" row={span.row_index}", end="")
779
+ if self.show_snippets and snippet:
780
+ print(f"\n └─ \"{snippet}\"")
781
+ else:
782
+ print()
783
+
784
+ elif event.event_type == TraceEventType.SPAN_HIGHLIGHTED:
785
+ for span in event.spans:
786
+ snippet = self._truncate(span.text)
787
+ conf = f"{span.confidence:.0%}" if span.confidence < 1.0 else ""
788
+ print(f"[{timestamp}] 🔍 [{span.highlight_type}] {conf}")
789
+ if self.show_snippets and snippet:
790
+ print(f" └─ \"{snippet}\"")
791
+
792
+ elif event.event_type == TraceEventType.ASSOCIATION_CREATED:
793
+ assoc = event.association
794
+ if assoc:
795
+ src = self._truncate(assoc.source.text, 40)
796
+ tgt = self._truncate(assoc.target.text, 40)
797
+ print(f"[{timestamp}] 🔗 {assoc.association_type} ({assoc.confidence:.0%})")
798
+ print(f" ├─ \"{src}\"")
799
+ print(f" └─ \"{tgt}\"")
800
+ if assoc.reason:
801
+ print(f" ({assoc.reason})")
802
+
803
+ elif event.event_type == TraceEventType.ENTITY_CREATED:
804
+ name = event.metadata.get("name", event.entity_id)
805
+ count = event.metadata.get("record_count", "?")
806
+ print(f"[{timestamp}] ✦ Entity created: {name} ({count} records)")
807
+
808
+ elif event.event_type == TraceEventType.ENTITY_DERIVED:
809
+ name = event.metadata.get("name", event.entity_id)
810
+ sources = event.metadata.get("sources", [])
811
+ print(f"[{timestamp}] ⤵ Entity derived: {name} ← {len(sources)} sources")
812
+
813
+ def _truncate(self, text: str, max_len: int = None) -> str:
814
+ max_len = max_len or self.max_snippet_len
815
+ if not text:
816
+ return ""
817
+ text = text.replace("\n", " ").strip()
818
+ if len(text) > max_len:
819
+ return text[:max_len-3] + "..."
820
+ return text
821
+
822
+
823
+ # ═══════════════════════════════════════════════════════════════════════════════
824
+ # CONVENIENCE
825
+ # ═══════════════════════════════════════════════════════════════════════════════
826
+
827
+ def create_live_tracer(observer=None, console: bool = False) -> LiveDocumentTracer:
828
+ """
829
+ Create a live document tracer.
830
+
831
+ Args:
832
+ observer: DatasetObserver to hook into
833
+ console: If True, attach console renderer
834
+
835
+ Returns:
836
+ Configured LiveDocumentTracer
837
+ """
838
+ tracer = LiveDocumentTracer(observer)
839
+
840
+ if console:
841
+ renderer = ConsoleTraceRenderer()
842
+ tracer.on_event(renderer.render)
843
+
844
+ return tracer
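A compact end-to-end sketch of the tracing API above, assuming the classes are importable from this module (shown here as cascade.data.live; adjust the path if it differs); the activity, documents, and scores are placeholders.

    # Illustrative live-tracing session with the console renderer attached.
    from cascade.data.live import create_live_tracer  # assumed import path

    tracer = create_live_tracer(console=True)

    tracer.start_activity("act-1", "match_patients_claims", activity_type="entity_resolution")
    tracer.report_progress(0.25, "embedding records")

    tracer.touch_document("patients:0", "patients", snippet="Jane Doe, Springfield", row_index=0)
    tracer.create_association(
        source_doc_id="patients:0", source_doc_name="patients", source_text="Jane Doe",
        target_doc_id="claims:7", target_doc_name="claims", target_text="J. Doe",
        association_type="same_entity", confidence=0.93, reason="name and city overlap",
    )

    tracer.report_progress(1.0, "done")
    tracer.end_activity()

    snapshot = tracer.export_session()       # frozen copy of the buffered events
    links = tracer.export_associations()     # just the association records
    tracer.close_tape()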
cascade/data/observer.py ADDED
@@ -0,0 +1,666 @@
1
+ """
2
+ Dataset Observer
3
+
4
+ The main interface for observing datasets.
5
+ Provides context managers for tracking ingest, transform, and consume operations.
6
+ """
7
+
8
+ import hashlib
9
+ import time
10
+ from contextlib import contextmanager
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Callable, Dict, Generator, List, Optional, Union
13
+
14
+ from .entities import (
15
+ DatasetEntity, Activity, Agent, Relationship, RelationType,
16
+ ActivityType, AgentType, create_system_agent, create_model_agent, create_user_agent
17
+ )
18
+ from .provenance import ProvenanceGraph
19
+ from .schema import SchemaObserver, DatasetSchema, hash_content
20
+
21
+
22
+ @dataclass
23
+ class ObservationContext:
24
+ """
25
+ Context for an ongoing observation.
26
+
27
+ Used within context managers to track inputs/outputs.
28
+ """
29
+ activity: Activity
30
+ observer: "DatasetObserver"
31
+
32
+ _inputs: List[DatasetEntity] = field(default_factory=list)
33
+ _outputs: List[DatasetEntity] = field(default_factory=list)
34
+
35
+ def input(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
36
+ """
37
+ Register an input dataset.
38
+
39
+ Args:
40
+ dataset: HuggingFace Dataset, DatasetDict, or entity ID
41
+ name: Optional name override
42
+ **kwargs: Additional entity attributes
43
+
44
+ Returns:
45
+ The created or retrieved DatasetEntity
46
+ """
47
+ # If string, assume it's an existing entity ID
48
+ if isinstance(dataset, str):
49
+ entity = self.observer.graph.get_entity(dataset)
50
+ if entity:
51
+ self._inputs.append(entity)
52
+ self.activity.add_input(entity.id)
53
+ self.observer.graph.link_usage(self.activity.id, entity.id)
54
+ return entity
55
+ else:
56
+ raise ValueError(f"Entity not found: {dataset}")
57
+
58
+ # Otherwise, observe the dataset
59
+ entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
60
+ self._inputs.append(entity)
61
+ self.activity.add_input(entity.id)
62
+ self.observer.graph.link_usage(self.activity.id, entity.id)
63
+
64
+ return entity
65
+
66
+ def output(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
67
+ """
68
+ Register an output dataset.
69
+
70
+ Args:
71
+ dataset: HuggingFace Dataset, DatasetDict, or dict
72
+ name: Optional name override
73
+ **kwargs: Additional entity attributes
74
+
75
+ Returns:
76
+ The created DatasetEntity
77
+ """
78
+ entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
79
+ self._outputs.append(entity)
80
+ self.activity.add_output(entity.id)
81
+
82
+ # Link generation
83
+ self.observer.graph.link_generation(entity.id, self.activity.id)
84
+
85
+ # Link derivation from all inputs
86
+ for input_entity in self._inputs:
87
+ self.observer.graph.link_derivation(entity.id, input_entity.id)
88
+
89
+ return entity
90
+
91
+ @property
92
+ def inputs(self) -> List[DatasetEntity]:
93
+ return self._inputs
94
+
95
+ @property
96
+ def outputs(self) -> List[DatasetEntity]:
97
+ return self._outputs
98
+
99
+
100
+ class DatasetObserver:
101
+ """
102
+ Observer for dataset operations.
103
+
104
+ Tracks:
105
+ - Dataset loading (ingest)
106
+ - Transformations (filter, map, join, etc.)
107
+ - Consumption (training, inference)
108
+
109
+ Example:
110
+ observer = DatasetObserver()
111
+
112
+ with observer.observe_ingest("squad") as ctx:
113
+ ds = load_dataset("squad")
114
+ ctx.output(ds)
115
+
116
+ with observer.observe_transform("filter_english") as ctx:
117
+ ctx.input(ds)
118
+ filtered = ds.filter(lambda x: x["lang"] == "en")
119
+ ctx.output(filtered)
120
+
121
+ chain = observer.export_provenance()
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ name: str = "default",
127
+ agent: Agent = None,
128
+ ):
129
+ """
130
+ Initialize observer.
131
+
132
+ Args:
133
+ name: Name for the provenance graph
134
+ agent: Default agent for activities (defaults to graph's system agent)
135
+ """
136
+ self.graph = ProvenanceGraph(name=name)
137
+ self.schema_observer = SchemaObserver()
138
+
139
+ # Use provided agent or the graph's default system agent
140
+ if agent:
141
+ self._default_agent = agent
142
+ self.graph.add_agent(agent)
143
+ else:
144
+ # Use the graph's already-created system agent
145
+ self._default_agent = self.graph._system_agent
146
+
147
+ # Entity counter for unique IDs
148
+ self._counter = 0
149
+
150
+ def _next_id(self, prefix: str) -> str:
151
+ """Generate unique ID."""
152
+ self._counter += 1
153
+ return f"{prefix}:{int(time.time() * 1000)}:{self._counter:04d}"
154
+
155
+ # ═══════════════════════════════════════════════════════════════════════════
156
+ # DATASET OBSERVATION
157
+ # ═══════════════════════════════════════════════════════════════════════════
158
+
159
+ def observe_dataset(
160
+ self,
161
+ dataset,
162
+ name: str = None,
163
+ source_type: str = None,
164
+ source_uri: str = None,
165
+ version: str = None,
166
+ license_id: str = None,
167
+ license_url: str = None,
168
+ **kwargs,
169
+ ) -> DatasetEntity:
170
+ """
171
+ Observe a dataset and create an entity.
172
+
173
+ Args:
174
+ dataset: HuggingFace Dataset, DatasetDict, DataFrame, or dict
175
+ name: Name for the entity
176
+ source_type: Type of source (hf_hub, local, etc.)
177
+ source_uri: URI of the source
178
+ version: Version string
179
+ license_id: SPDX license identifier (e.g., "MIT", "CC-BY-4.0")
180
+ license_url: URL to the license text
181
+ **kwargs: Additional attributes
182
+
183
+ Returns:
184
+ DatasetEntity representing the dataset
185
+ """
186
+ # Infer name if not provided
187
+ if name is None:
188
+ if hasattr(dataset, 'info') and hasattr(dataset.info, 'dataset_name'):
189
+ name = dataset.info.dataset_name
190
+ elif hasattr(dataset, 'config_name'):
191
+ name = dataset.config_name
192
+ else:
193
+ name = f"dataset_{self._counter + 1}"
194
+
195
+ # Try to extract license from HuggingFace dataset info
196
+ if license_id is None and hasattr(dataset, 'info'):
197
+ info = dataset.info
198
+ if hasattr(info, 'license') and info.license:
199
+ license_id = info.license
200
+
201
+ # Observe schema
202
+ schema = self._observe_schema(dataset)
203
+
204
+ # Compute content hash
205
+ content_hash = self._compute_content_hash(dataset)
206
+
207
+ # Get record count and splits
208
+ record_count, splits = self._get_counts(dataset)
209
+
210
+ # Infer source
211
+ if source_type is None:
212
+ source_type = self._infer_source_type(dataset)
213
+
214
+ # Create entity
215
+ entity = DatasetEntity(
216
+ id=self._next_id("entity"),
217
+ name=name,
218
+ content_hash=content_hash,
219
+ schema_hash=schema.hash() if schema else None,
220
+ version=version,
221
+ source_type=source_type,
222
+ source_uri=source_uri,
223
+ license_id=license_id,
224
+ license_url=license_url,
225
+ record_count=record_count,
226
+ splits=splits,
227
+ attributes={
228
+ "schema": schema.to_dict() if schema else None,
229
+ **kwargs,
230
+ },
231
+ )
232
+
233
+ # Add to graph
234
+ self.graph.add_entity(entity)
235
+
236
+ return entity
237
+
238
+ def register_agent(self, name: str, agent_type: str = "software", version: str = None) -> Agent:
239
+ """
240
+ Register a new agent in the provenance graph.
241
+
242
+ Args:
243
+ name: Name of the agent
244
+ agent_type: Type of agent (software, model, person, etc.)
245
+ version: Optional version string
246
+
247
+ Returns:
248
+ The created Agent
249
+ """
250
+ if agent_type == "model":
251
+ agent = create_model_agent(name, version=version)
252
+ elif agent_type == "system":
253
+ agent = create_system_agent(name, version=version)
254
+ elif agent_type == "person":
255
+ agent = create_user_agent(name)
256
+ else:
257
+ # Default to software agent or generic
258
+ try:
259
+ type_enum = AgentType(agent_type)
260
+ except ValueError:
261
+ type_enum = AgentType.SOFTWARE
262
+
263
+ agent = Agent(
264
+ id=f"agent:{type_enum.value}:{name.replace(' ', '_').lower()}",
265
+ agent_type=type_enum,
266
+ name=name,
267
+ version=version
268
+ )
269
+
270
+ self.graph.add_agent(agent)
271
+ return agent
272
+
273
+ def _observe_schema(self, dataset) -> Optional[DatasetSchema]:
274
+ """Extract schema from dataset."""
275
+ try:
276
+ # HuggingFace Dataset
277
+ if hasattr(dataset, 'features'):
278
+ return self.schema_observer.observe_hf_dataset(dataset)
279
+
280
+ # Pandas DataFrame
281
+ if hasattr(dataset, 'dtypes') and hasattr(dataset, 'columns'):
282
+ return self.schema_observer.observe_pandas(dataset)
283
+
284
+ # Dict
285
+ if isinstance(dataset, dict):
286
+ # Check if it's columnar (dict of lists)
287
+ if all(isinstance(v, list) for v in dataset.values()):
288
+ return self.schema_observer.observe_dict(dataset)
289
+
290
+ return None
291
+ except Exception as e:
292
+ # Don't fail observation if schema extraction fails
293
+ print(f"Warning: Could not extract schema: {e}")
294
+ return None
295
+
296
+ def _compute_content_hash(self, dataset) -> str:
297
+ """Compute content hash of dataset."""
298
+ try:
299
+ return hash_content(dataset)
300
+ except Exception:
301
+ # Fallback to timestamp-based hash
302
+ return hashlib.sha256(str(time.time()).encode()).hexdigest()
303
+
304
+ def _get_counts(self, dataset) -> tuple:
305
+ """Get record count and split counts."""
306
+ record_count = None
307
+ splits = {}
308
+
309
+ try:
310
+ # HuggingFace DatasetDict
311
+ if hasattr(dataset, 'keys') and hasattr(dataset, '__getitem__'):
312
+ for split_name in dataset.keys():
313
+ split_ds = dataset[split_name]
314
+ if hasattr(split_ds, '__len__'):
315
+ splits[split_name] = len(split_ds)
316
+ record_count = sum(splits.values()) if splits else None
317
+
318
+ # Single dataset
319
+ elif hasattr(dataset, '__len__'):
320
+ record_count = len(dataset)
321
+
322
+ except Exception:
323
+ pass
324
+
325
+ return record_count, splits
326
+
327
+ def _infer_source_type(self, dataset) -> str:
328
+ """Infer source type from dataset."""
329
+ # HuggingFace Dataset
330
+ if hasattr(dataset, '_info'):
331
+ return "hf_dataset"
332
+
333
+ # Pandas
334
+ if hasattr(dataset, 'dtypes'):
335
+ return "pandas"
336
+
337
+ # Dict
338
+ if isinstance(dataset, dict):
339
+ return "dict"
340
+
341
+ return "unknown"
342
+
343
+ # ═══════════════════════════════════════════════════════════════════════════
344
+ # CONTEXT MANAGERS
345
+ # ═══════════════════════════════════════════════════════════════════════════
346
+
347
+ @contextmanager
348
+ def observe_ingest(
349
+ self,
350
+ name: str,
351
+ source_uri: str = None,
352
+ agent: Agent = None,
353
+ **kwargs,
354
+ ) -> Generator[ObservationContext, None, None]:
355
+ """
356
+ Observe a dataset ingest operation.
357
+
358
+ Args:
359
+ name: Name of the ingest operation
360
+ source_uri: URI of the data source
361
+ agent: Agent performing the ingest
362
+ **kwargs: Additional activity parameters
363
+
364
+ Yields:
365
+ ObservationContext for registering inputs/outputs
366
+
367
+ Example:
368
+ with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
369
+ ds = load_dataset("squad")
370
+ ctx.output(ds, name="squad")
371
+ """
372
+ activity = Activity(
373
+ id=self._next_id("activity"),
374
+ activity_type=ActivityType.INGEST,
375
+ name=name,
376
+ agent_id=(agent or self._default_agent).id,
377
+ parameters={"source_uri": source_uri, **kwargs},
378
+ )
379
+ activity.start()
380
+
381
+ ctx = ObservationContext(activity=activity, observer=self)
382
+
383
+ try:
384
+ yield ctx
385
+ finally:
386
+ activity.end()
387
+ self.graph.add_activity(activity)
388
+ self.graph.link_association(activity.id, activity.agent_id)
389
+
390
+ @contextmanager
391
+ def observe_transform(
392
+ self,
393
+ name: str,
394
+ transform_type: str = None,
395
+ agent: Agent = None,
396
+ **kwargs,
397
+ ) -> Generator[ObservationContext, None, None]:
398
+ """
399
+ Observe a dataset transformation.
400
+
401
+ Args:
402
+ name: Name of the transform
403
+ transform_type: Type of transform (filter, map, join, etc.)
404
+ agent: Agent performing the transform
405
+ **kwargs: Additional activity parameters
406
+
407
+ Yields:
408
+ ObservationContext for registering inputs/outputs
409
+
410
+ Example:
411
+ with observer.observe_transform("filter_english") as ctx:
412
+ ctx.input(ds)
413
+ filtered = ds.filter(lambda x: x["lang"] == "en")
414
+ ctx.output(filtered)
415
+ """
416
+ activity = Activity(
417
+ id=self._next_id("activity"),
418
+ activity_type=ActivityType.TRANSFORM,
419
+ name=name,
420
+ agent_id=(agent or self._default_agent).id,
421
+ parameters={"transform_type": transform_type, **kwargs},
422
+ )
423
+ activity.start()
424
+
425
+ ctx = ObservationContext(activity=activity, observer=self)
426
+
427
+ try:
428
+ yield ctx
429
+ finally:
430
+ activity.end()
431
+ self.graph.add_activity(activity)
432
+ self.graph.link_association(activity.id, activity.agent_id)
433
+
434
+ @contextmanager
435
+ def observe_consume(
436
+ self,
437
+ name: str,
438
+ model_id: str = None,
439
+ consume_type: str = "train",
440
+ agent: Agent = None,
441
+ **kwargs,
442
+ ) -> Generator[ObservationContext, None, None]:
443
+ """
444
+ Observe dataset consumption (training, inference).
445
+
446
+ Args:
447
+ name: Name of the consumption operation
448
+ model_id: ID of the model consuming the data
449
+ consume_type: Type of consumption (train, evaluate, inference)
450
+ agent: Agent performing the consumption
451
+ **kwargs: Additional activity parameters
452
+
453
+ Yields:
454
+ ObservationContext for registering inputs/outputs
455
+
456
+ Example:
457
+ with observer.observe_consume("train_qa_model", model_id="bert-base") as ctx:
458
+ ctx.input(train_ds)
459
+ model = train(train_ds)
460
+ # Model provenance now links to data provenance!
461
+ """
462
+ # Create model agent if model_id provided
463
+ if model_id and agent is None:
464
+ agent = create_model_agent(model_id)
465
+ self.graph.add_agent(agent)
466
+
467
+ activity_type = {
468
+ "train": ActivityType.TRAIN,
469
+ "evaluate": ActivityType.EVALUATE,
470
+ "inference": ActivityType.INFERENCE,
471
+ }.get(consume_type, ActivityType.TRAIN)
472
+
473
+ activity = Activity(
474
+ id=self._next_id("activity"),
475
+ activity_type=activity_type,
476
+ name=name,
477
+ agent_id=(agent or self._default_agent).id,
478
+ parameters={"model_id": model_id, "consume_type": consume_type, **kwargs},
479
+ )
480
+ activity.start()
481
+
482
+ ctx = ObservationContext(activity=activity, observer=self)
483
+
484
+ try:
485
+ yield ctx
486
+ finally:
487
+ activity.end()
488
+ self.graph.add_activity(activity)
489
+ self.graph.link_association(activity.id, activity.agent_id)
490
+
491
+ @contextmanager
492
+ def observe_entity_resolution(
493
+ self,
494
+ name: str,
495
+ model_id: str = None,
496
+ threshold: float = None,
497
+ agent: Agent = None,
498
+ **kwargs,
499
+ ) -> Generator[ObservationContext, None, None]:
500
+ """
501
+ Observe entity resolution / data unity operation.
502
+
503
+ Args:
504
+ name: Name of the operation
505
+ model_id: Embedding model used
506
+ threshold: Similarity threshold
507
+ agent: Agent performing the operation
508
+ **kwargs: Additional parameters
509
+
510
+ Example:
511
+ with observer.observe_entity_resolution("match_patients_claims") as ctx:
512
+ ctx.input(patients_ds)
513
+ ctx.input(claims_ds)
514
+ unified = run_unity(patients_ds, claims_ds)
515
+ ctx.output(unified)
516
+ """
517
+ if model_id and agent is None:
518
+ agent = create_model_agent(model_id)
519
+ self.graph.add_agent(agent)
520
+
521
+ activity = Activity(
522
+ id=self._next_id("activity"),
523
+ activity_type=ActivityType.ENTITY_RESOLUTION,
524
+ name=name,
525
+ agent_id=(agent or self._default_agent).id,
526
+ parameters={
527
+ "model_id": model_id,
528
+ "threshold": threshold,
529
+ **kwargs,
530
+ },
531
+ )
532
+ activity.start()
533
+
534
+ ctx = ObservationContext(activity=activity, observer=self)
535
+
536
+ try:
537
+ yield ctx
538
+ finally:
539
+ activity.end()
540
+ self.graph.add_activity(activity)
541
+ self.graph.link_association(activity.id, activity.agent_id)
542
+
543
+ # ═══════════════════════════════════════════════════════════════════════════
544
+ # EXPORT
545
+ # ═══════════════════════════════════════════════════════════════════════════
546
+
547
+ def export_provenance(self) -> ProvenanceGraph:
548
+ """Export the provenance graph."""
549
+ return self.graph
550
+
551
+ def to_dict(self) -> Dict[str, Any]:
552
+ """Export observation state to dictionary."""
553
+ return {
554
+ "graph": self.graph.to_dict(),
555
+ "counter": self._counter,
556
+ }
557
+
558
+ @classmethod
559
+ def from_dict(cls, data: Dict[str, Any]) -> "DatasetObserver":
560
+ """Load observer from dictionary."""
561
+ observer = cls()
562
+ observer.graph = ProvenanceGraph.from_dict(data["graph"])
563
+ observer._counter = data.get("counter", 0)
564
+ return observer
565
+
566
+ # ═══════════════════════════════════════════════════════════════════════════
567
+ # STATISTICS
568
+ # ═══════════════════════════════════════════════════════════════════════════
569
+
570
+ @property
571
+ def stats(self) -> Dict[str, Any]:
572
+ """Get observer statistics."""
573
+ return {
574
+ "graph": self.graph.stats,
575
+ "root_hash": self.graph.root_hash,
576
+ }
577
+
578
+ # ═══════════════════════════════════════════════════════════════════════════
579
+ # LICENSE TRACKING
580
+ # ═══════════════════════════════════════════════════════════════════════════
581
+
582
+ def check_license_compatibility(
583
+ self,
584
+ entity_ids: List[str],
585
+ target_license: str = None,
586
+ ):
587
+ """
588
+ Check license compatibility for deriving from entities.
589
+
590
+ Args:
591
+ entity_ids: List of source entity IDs
592
+ target_license: Intended SPDX license for derived work
593
+
594
+ Returns:
595
+ LicenseCompatibility result
596
+
597
+ Example:
598
+ result = observer.check_license_compatibility(
599
+ ["entity:123", "entity:456"],
600
+ target_license="MIT"
601
+ )
602
+ if not result.compatible:
603
+ print(f"Issues: {result.issues}")
604
+ """
605
+ from .license import check_license_compatibility
606
+
607
+ sources = []
608
+ for entity_id in entity_ids:
609
+ entity = self.graph.get_entity(entity_id)
610
+ if entity:
611
+ license_id = entity.license_id or "unknown"
612
+ sources.append((entity_id, license_id))
613
+
614
+ return check_license_compatibility(sources, target_license)
615
+
616
+ def get_derived_license(self, entity_ids: List[str]):
617
+ """
618
+ Get the appropriate license for a work derived from entities.
619
+
620
+ Args:
621
+ entity_ids: List of source entity IDs
622
+
623
+ Returns:
624
+ SPDXLicense for the derived work
625
+ """
626
+ from .license import get_derived_license
627
+
628
+ licenses = []
629
+ for entity_id in entity_ids:
630
+ entity = self.graph.get_entity(entity_id)
631
+ if entity and entity.license_id:
632
+ licenses.append(entity.license_id)
633
+
634
+ return get_derived_license(licenses) if licenses else None
635
+
636
+ def generate_attribution(self, entity_ids: List[str] = None) -> str:
637
+ """
638
+ Generate attribution text for entities.
639
+
640
+ Args:
641
+ entity_ids: List of entity IDs (defaults to all entities)
642
+
643
+ Returns:
644
+ Markdown attribution text
645
+ """
646
+ from .license import LicenseAnalyzer
647
+
648
+ analyzer = LicenseAnalyzer()
649
+
650
+ if entity_ids is None:
651
+ entities = self.graph.list_entities()
652
+ else:
653
+ entities = [
654
+ self.graph.get_entity(eid) for eid in entity_ids
655
+ if self.graph.get_entity(eid)
656
+ ]
657
+
658
+ sources = [
659
+ (e.id, e.license_id or "unknown", e.name)
660
+ for e in entities
661
+ ]
662
+
663
+ return analyzer.generate_attribution(sources)
664
+
665
+ def __repr__(self) -> str:
666
+ return f"DatasetObserver({self.graph})"
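A compact sketch tying the observer's context managers and license helpers together; load_dataset stands in for any loader, and the dataset fields and license IDs are placeholders rather than verified metadata.

    # Illustrative pipeline: ingest -> transform, then check source licenses.
    observer = DatasetObserver(name="demo")

    with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
        ds = load_dataset("squad")                       # placeholder loader
        squad = ctx.output(ds, name="squad", license_id="CC-BY-SA-4.0")

    with observer.observe_transform("keep_short_answers", transform_type="filter") as ctx:
        ctx.input(squad.id)                              # reuse the recorded entity by ID
        short = ds.filter(lambda x: len(x["answers"]["text"][0]) < 20)
        derived = ctx.output(short, name="squad_short")

    compat = observer.check_license_compatibility([squad.id], target_license="MIT")
    print(compat)                                        # see .compatible / .issues
    print(observer.generate_attribution([squad.id]))
    print(observer.stats)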
cascade/data/pii.py ADDED
@@ -0,0 +1,748 @@
1
+ """
2
+ PII Detection for CASCADE
3
+
4
+ Industry standard PII (Personally Identifiable Information) detection
5
+ based on Microsoft Presidio patterns and common PII taxonomies.
6
+
7
+ References:
8
+ - Microsoft Presidio: https://github.com/microsoft/presidio
9
+ - NIST PII Guide: https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-122.pdf
10
+ - GDPR Article 4 (personal data definition)
11
+
12
+ PII Categories:
13
+ 1. Direct Identifiers: Name, SSN, passport, driver's license
14
+ 2. Quasi-Identifiers: Age, ZIP code, gender, dates
15
+ 3. Sensitive Data: Health, financial, biometric
16
+
17
+ Detection Methods:
18
+ - Regex patterns (fast, high precision for structured PII)
19
+ - Context-aware detection (surrounding words improve accuracy)
20
+ - Checksum validation (SSN, credit cards, etc.)
21
+ """
22
+
23
+ import re
24
+ from dataclasses import dataclass, field
25
+ from enum import Enum
26
+ from typing import Any, Callable, Dict, List, Optional, Pattern, Set, Tuple
27
+
28
+
29
+ class PIIType(Enum):
30
+ """Types of PII that can be detected."""
31
+ # Direct Identifiers
32
+ PERSON_NAME = "PERSON_NAME"
33
+ EMAIL = "EMAIL"
34
+ PHONE_NUMBER = "PHONE_NUMBER"
35
+ SSN = "SSN" # Social Security Number
36
+ CREDIT_CARD = "CREDIT_CARD"
37
+ IBAN = "IBAN" # International Bank Account Number
38
+ IP_ADDRESS = "IP_ADDRESS"
39
+ MAC_ADDRESS = "MAC_ADDRESS"
40
+ PASSPORT = "PASSPORT"
41
+ DRIVERS_LICENSE = "DRIVERS_LICENSE"
42
+
43
+ # Quasi-Identifiers
44
+ DATE_OF_BIRTH = "DATE_OF_BIRTH"
45
+ AGE = "AGE"
46
+ ZIPCODE = "ZIPCODE"
47
+ ADDRESS = "ADDRESS"
48
+
49
+ # Sensitive Data
50
+ MEDICAL_RECORD = "MEDICAL_RECORD"
51
+ API_KEY = "API_KEY"
52
+ AWS_KEY = "AWS_KEY"
53
+ PASSWORD = "PASSWORD"
54
+ CRYPTO_WALLET = "CRYPTO_WALLET"
55
+
56
+ # Location
57
+ GPS_COORDINATES = "GPS_COORDINATES"
58
+
59
+ # URLs and IDs
60
+ URL = "URL"
61
+ USERNAME = "USERNAME"
62
+
63
+
64
+ class PIISeverity(Enum):
65
+ """Severity levels for PII findings."""
66
+ CRITICAL = "critical" # Direct identifier, immediate re-identification risk
67
+ HIGH = "high" # Sensitive data, significant privacy risk
68
+ MEDIUM = "medium" # Quasi-identifier, re-identification when combined
69
+ LOW = "low" # Minimal risk, contextual sensitivity
70
+
71
+
72
+ @dataclass
73
+ class PIIMatch:
74
+ """A detected PII instance."""
75
+ pii_type: PIIType
76
+ severity: PIISeverity
77
+ value: str # The matched text (may be redacted for display)
78
+ start: int # Start position in text
79
+ end: int # End position in text
80
+ confidence: float # 0.0 to 1.0
81
+ context: str = "" # Surrounding text for context
82
+ field_name: str = "" # Column/field where found
83
+ row_index: int = -1 # Row index if applicable
84
+
85
+ def to_dict(self) -> Dict[str, Any]:
86
+ return {
87
+ "type": self.pii_type.value,
88
+ "severity": self.severity.value,
89
+ "value_preview": self._redact(self.value),
90
+ "start": self.start,
91
+ "end": self.end,
92
+ "confidence": self.confidence,
93
+ "field_name": self.field_name,
94
+ "row_index": self.row_index,
95
+ }
96
+
97
+ def _redact(self, value: str, show_chars: int = 4) -> str:
98
+ """Partially redact the value for display."""
99
+ if len(value) <= show_chars:
100
+ return "*" * len(value)
101
+ return value[:show_chars] + "*" * (len(value) - show_chars)
102
+
103
+
104
+ @dataclass
105
+ class PIIPattern:
106
+ """A pattern for detecting PII."""
107
+ pii_type: PIIType
108
+ severity: PIISeverity
109
+ pattern: Pattern
110
+ confidence: float = 0.85
111
+ validator: Optional[Callable[[str], bool]] = None # Additional validation
112
+ context_patterns: List[str] = field(default_factory=list) # Boost confidence if context matches
113
+
114
+
115
+ @dataclass
116
+ class PIIScanResult:
117
+ """Result of scanning content for PII."""
118
+ total_matches: int = 0
119
+ matches_by_type: Dict[str, int] = field(default_factory=dict)
120
+ matches_by_severity: Dict[str, int] = field(default_factory=dict)
121
+ matches_by_field: Dict[str, int] = field(default_factory=dict)
122
+ sample_matches: List[PIIMatch] = field(default_factory=list) # First N matches
123
+ fields_with_pii: Set[str] = field(default_factory=set)
124
+ high_risk_fields: Set[str] = field(default_factory=set) # Fields with CRITICAL/HIGH PII
125
+
126
+ def to_dict(self) -> Dict[str, Any]:
127
+ return {
128
+ "total_matches": self.total_matches,
129
+ "matches_by_type": self.matches_by_type,
130
+ "matches_by_severity": self.matches_by_severity,
131
+ "matches_by_field": self.matches_by_field,
132
+ "fields_with_pii": list(self.fields_with_pii),
133
+ "high_risk_fields": list(self.high_risk_fields),
134
+ "sample_matches": [m.to_dict() for m in self.sample_matches[:10]],
135
+ }
136
+
137
+ def has_critical_pii(self) -> bool:
138
+ """Check if any critical PII was found."""
139
+ return self.matches_by_severity.get("critical", 0) > 0
140
+
141
+ def has_high_risk_pii(self) -> bool:
142
+ """Check if any high-risk PII was found."""
143
+ return (
144
+ self.matches_by_severity.get("critical", 0) > 0 or
145
+ self.matches_by_severity.get("high", 0) > 0
146
+ )
147
+
148
+ @property
149
+ def summary(self) -> str:
150
+ """Human-readable summary."""
151
+ if self.total_matches == 0:
152
+ return "No PII detected"
153
+
154
+ lines = [f"Found {self.total_matches} PII instance(s):"]
155
+ for sev in ["critical", "high", "medium", "low"]:
156
+ count = self.matches_by_severity.get(sev, 0)
157
+ if count > 0:
158
+ lines.append(f" • {sev.upper()}: {count}")
159
+
160
+ if self.high_risk_fields:
161
+ lines.append(f" ⚠ High-risk fields: {', '.join(self.high_risk_fields)}")
162
+
163
+ return "\n".join(lines)
164
+
165
+
166
+ # ═══════════════════════════════════════════════════════════════════════════════
167
+ # VALIDATION FUNCTIONS
168
+ # ═══════════════════════════════════════════════════════════════════════════════
169
+
170
+ def validate_luhn(card_number: str) -> bool:
171
+ """
172
+ Validate credit card using Luhn algorithm.
173
+
174
+ Used by Visa, MasterCard, American Express, etc.
175
+ """
176
+ digits = [int(d) for d in re.sub(r'\D', '', card_number)]
177
+ if len(digits) < 13 or len(digits) > 19:
178
+ return False
179
+
180
+ # Luhn checksum
181
+ checksum = 0
182
+ for i, digit in enumerate(reversed(digits)):
183
+ if i % 2 == 1:
184
+ digit *= 2
185
+ if digit > 9:
186
+ digit -= 9
187
+ checksum += digit
188
+
189
+ return checksum % 10 == 0
190
+
191
+
192
+ def validate_ssn(ssn: str) -> bool:
193
+ """
194
+ Validate US Social Security Number format.
195
+
196
+ SSN format: AAA-BB-CCCC
197
+ - AAA: Area number (001-899, excluding 666)
198
+ - BB: Group number (01-99)
199
+ - CCCC: Serial number (0001-9999)
200
+ """
201
+ clean = re.sub(r'\D', '', ssn)
202
+ if len(clean) != 9:
203
+ return False
204
+
205
+ area = int(clean[:3])
206
+ group = int(clean[3:5])
207
+ serial = int(clean[5:])
208
+
209
+ # Invalid patterns
210
+ if area == 0 or area == 666 or area >= 900:
211
+ return False
212
+ if group == 0:
213
+ return False
214
+ if serial == 0:
215
+ return False
216
+
217
+ # Known invalid SSNs (advertising, testing)
218
+ invalid_ssns = {
219
+ "078051120", # Woolworth promotional
220
+ "219099999", # Advertising
221
+ }
222
+ if clean in invalid_ssns:
223
+ return False
224
+
225
+ return True
226
+
227
+
228
+ def validate_iban(iban: str) -> bool:
229
+ """
230
+ Validate IBAN using MOD-97 checksum.
231
+ """
232
+ clean = re.sub(r'\s', '', iban).upper()
233
+ if len(clean) < 15 or len(clean) > 34:
234
+ return False
235
+
236
+ # Move country code and check digits to end
237
+ rearranged = clean[4:] + clean[:4]
238
+
239
+ # Convert letters to numbers (A=10, B=11, etc.)
240
+ numeric = ""
241
+ for char in rearranged:
242
+ if char.isdigit():
243
+ numeric += char
244
+ else:
245
+ numeric += str(ord(char) - ord('A') + 10)
246
+
247
+ # MOD 97 check
248
+ return int(numeric) % 97 == 1
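A few worked checks for the validators above; the values are well-known public test numbers (the Visa test card and the standard IBAN example), not real accounts.

    # Quick sanity checks for the checksum validators (test values only).
    assert validate_luhn("4111 1111 1111 1111") is True       # classic Visa test number
    assert validate_luhn("4111 1111 1111 1112") is False      # last digit breaks the checksum

    assert validate_ssn("078-05-1120") is False               # known promotional SSN, rejected
    assert validate_ssn("000-12-3456") is False               # area 000 is never issued

    assert validate_iban("GB82 WEST 1234 5698 7654 32") is True   # widely cited example IBAN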
249
+
250
+
251
+ # ═══════════════════════════════════════════════════════════════════════════════
252
+ # PII PATTERNS (Based on Microsoft Presidio)
253
+ # ═══════════════════════════════════════════════════════════════════════════════
254
+
255
+ PII_PATTERNS: List[PIIPattern] = [
256
+ # Email - RFC 5322 simplified
257
+ PIIPattern(
258
+ pii_type=PIIType.EMAIL,
259
+ severity=PIISeverity.HIGH,
260
+ pattern=re.compile(
261
+ r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
262
+ re.IGNORECASE
263
+ ),
264
+ confidence=0.95,
265
+ context_patterns=["email", "e-mail", "contact", "mail"],
266
+ ),
267
+
268
+ # Phone Number - International formats
269
+ PIIPattern(
270
+ pii_type=PIIType.PHONE_NUMBER,
271
+ severity=PIISeverity.MEDIUM,
272
+ pattern=re.compile(
273
+ r'''
274
+ (?:
275
+ \+?1?[-.\s]? # Country code
276
+ \(?[2-9]\d{2}\)?[-.\s]? # Area code
277
+ [2-9]\d{2}[-.\s]? # Exchange
278
+ \d{4} # Subscriber
279
+ |
280
+ \+?\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]? # International
281
+ \d{1,4}[-.\s]?\d{1,9}
282
+ )
283
+ ''',
284
+ re.VERBOSE
285
+ ),
286
+ confidence=0.75,
287
+ context_patterns=["phone", "tel", "mobile", "cell", "call", "fax"],
288
+ ),
289
+
290
+ # SSN - US Social Security Number
291
+ PIIPattern(
292
+ pii_type=PIIType.SSN,
293
+ severity=PIISeverity.CRITICAL,
294
+ pattern=re.compile(
295
+ r'\b(?!000|666|9\d{2})\d{3}[-\s]?(?!00)\d{2}[-\s]?(?!0000)\d{4}\b'
296
+ ),
297
+ confidence=0.85,
298
+ validator=validate_ssn,
299
+ context_patterns=["ssn", "social security", "tax id", "taxpayer"],
300
+ ),
301
+
302
+ # Credit Card - Major card formats
303
+ PIIPattern(
304
+ pii_type=PIIType.CREDIT_CARD,
305
+ severity=PIISeverity.CRITICAL,
306
+ pattern=re.compile(
307
+ r'''
308
+ \b(?:
309
+ 4[0-9]{12}(?:[0-9]{3})? # Visa
310
+ |
311
+ 5[1-5][0-9]{14} # MasterCard
312
+ |
313
+ 3[47][0-9]{13} # American Express
314
+ |
315
+ 6(?:011|5[0-9]{2})[0-9]{12} # Discover
316
+ |
317
+ (?:2131|1800|35\d{3})\d{11} # JCB
318
+ )\b
319
+ |
320
+ \b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b # Spaced format
321
+ ''',
322
+ re.VERBOSE
323
+ ),
324
+ confidence=0.90,
325
+ validator=validate_luhn,
326
+ context_patterns=["card", "credit", "visa", "mastercard", "amex", "payment"],
327
+ ),
328
+
329
+ # IP Address - IPv4
330
+ PIIPattern(
331
+ pii_type=PIIType.IP_ADDRESS,
332
+ severity=PIISeverity.MEDIUM,
333
+ pattern=re.compile(
334
+ r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
335
+ ),
336
+ confidence=0.90,
337
+ context_patterns=["ip", "address", "server", "host", "client"],
338
+ ),
339
+
340
+ # IP Address - IPv6
341
+ PIIPattern(
342
+ pii_type=PIIType.IP_ADDRESS,
343
+ severity=PIISeverity.MEDIUM,
344
+ pattern=re.compile(
345
+ r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
346
+ ),
347
+ confidence=0.90,
348
+ ),
349
+
350
+ # MAC Address
351
+ PIIPattern(
352
+ pii_type=PIIType.MAC_ADDRESS,
353
+ severity=PIISeverity.LOW,
354
+ pattern=re.compile(
355
+ r'\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b'
356
+ ),
357
+ confidence=0.95,
358
+ ),
359
+
360
+ # IBAN - International Bank Account Number
361
+ PIIPattern(
362
+ pii_type=PIIType.IBAN,
363
+ severity=PIISeverity.CRITICAL,
364
+ pattern=re.compile(
365
+ r'\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}(?:[A-Z0-9]?){0,16}\b',
366
+ re.IGNORECASE
367
+ ),
368
+ confidence=0.85,
369
+ validator=validate_iban,
370
+ context_patterns=["iban", "bank", "account", "transfer"],
371
+ ),
372
+
373
+ # API Key patterns
374
+ PIIPattern(
375
+ pii_type=PIIType.API_KEY,
376
+ severity=PIISeverity.CRITICAL,
377
+ pattern=re.compile(
378
+ r'''
379
+ (?:
380
+ sk[-_]live[-_][a-zA-Z0-9]{24,} # Stripe
381
+ |
382
+ sk[-_]test[-_][a-zA-Z0-9]{24,} # Stripe test
383
+ |
384
+ pk[-_]live[-_][a-zA-Z0-9]{24,} # Stripe public
385
+ |
386
+ ghp_[a-zA-Z0-9]{36} # GitHub PAT
387
+ |
388
+ gho_[a-zA-Z0-9]{36} # GitHub OAuth
389
+ |
390
+ github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59} # GitHub fine-grained
391
+ |
392
+ xox[baprs]-[a-zA-Z0-9-]{10,} # Slack
393
+ |
394
+ ya29\.[a-zA-Z0-9_-]+ # Google OAuth
395
+ )
396
+ ''',
397
+ re.VERBOSE
398
+ ),
399
+ confidence=0.95,
400
+ context_patterns=["api", "key", "token", "secret", "auth"],
401
+ ),
402
+
403
+ # AWS Access Key
404
+ PIIPattern(
405
+ pii_type=PIIType.AWS_KEY,
406
+ severity=PIISeverity.CRITICAL,
407
+ pattern=re.compile(
408
+ r'\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b'
409
+ ),
410
+ confidence=0.95,
411
+ context_patterns=["aws", "amazon", "key", "access"],
412
+ ),
413
+
414
+ # Crypto Wallet - Bitcoin
415
+ PIIPattern(
416
+ pii_type=PIIType.CRYPTO_WALLET,
417
+ severity=PIISeverity.HIGH,
418
+ pattern=re.compile(
419
+ r'\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b'
420
+ ),
421
+ confidence=0.80,
422
+ context_patterns=["bitcoin", "btc", "wallet", "crypto"],
423
+ ),
424
+
425
+ # Crypto Wallet - Ethereum
426
+ PIIPattern(
427
+ pii_type=PIIType.CRYPTO_WALLET,
428
+ severity=PIISeverity.HIGH,
429
+ pattern=re.compile(
430
+ r'\b0x[a-fA-F0-9]{40}\b'
431
+ ),
432
+ confidence=0.80,
433
+ context_patterns=["ethereum", "eth", "wallet", "crypto"],
434
+ ),
435
+
436
+ # GPS Coordinates
437
+ PIIPattern(
438
+ pii_type=PIIType.GPS_COORDINATES,
439
+ severity=PIISeverity.MEDIUM,
440
+ pattern=re.compile(
441
+ r'[-+]?(?:[1-8]?\d(?:\.\d+)?|90(?:\.0+)?)\s*,\s*[-+]?(?:180(?:\.0+)?|(?:(?:1[0-7]\d)|(?:[1-9]?\d))(?:\.\d+)?)'
442
+ ),
443
+ confidence=0.70,
444
+ context_patterns=["location", "coordinates", "lat", "lng", "gps"],
445
+ ),
446
+
447
+ # Date of Birth patterns
448
+ PIIPattern(
449
+ pii_type=PIIType.DATE_OF_BIRTH,
450
+ severity=PIISeverity.MEDIUM,
451
+ pattern=re.compile(
452
+ r'\b(?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])[/\-.](?:19|20)\d{2}\b'
453
+ ),
454
+ confidence=0.60, # Low base - needs context
455
+ context_patterns=["birth", "dob", "born", "birthday", "date of birth"],
456
+ ),
457
+
458
+ # US ZIP Code
459
+ PIIPattern(
460
+ pii_type=PIIType.ZIPCODE,
461
+ severity=PIISeverity.LOW,
462
+ pattern=re.compile(
463
+ r'\b\d{5}(?:-\d{4})?\b'
464
+ ),
465
+ confidence=0.50, # Low - needs context
466
+ context_patterns=["zip", "postal", "address", "code"],
467
+ ),
468
+
469
+ # URL (can contain sensitive info in path/query)
470
+ PIIPattern(
471
+ pii_type=PIIType.URL,
472
+ severity=PIISeverity.LOW,
473
+ pattern=re.compile(
474
+ r'https?://[^\s<>"{}|\\^`\[\]]+',
475
+ re.IGNORECASE
476
+ ),
477
+ confidence=0.70,
478
+ ),
479
+ ]
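A sketch of extending the default pattern set with a custom recognizer; the employee-ID format is invented purely to show the PIIPattern fields in use, and the PIIScanner it feeds is defined just below.

    # Hypothetical custom pattern: internal employee IDs of the form EMP-123456.
    EMPLOYEE_ID_PATTERN = PIIPattern(
        pii_type=PIIType.USERNAME,              # reusing an existing type for the sketch
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(r"\bEMP-\d{6}\b"),
        confidence=0.8,
        context_patterns=["employee", "staff", "badge"],
    )

    scanner = PIIScanner(patterns=PII_PATTERNS + [EMPLOYEE_ID_PATTERN])
    for m in scanner.scan_text("Badge holder EMP-204981 reported the incident."):
        print(m.pii_type.value, m.severity.value, round(m.confidence, 2))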
480
+
481
+
482
+ class PIIScanner:
483
+ """
484
+ Scanner for detecting PII in text and datasets.
485
+
486
+ Uses regex patterns with optional validation and context boosting.
487
+ """
488
+
489
+ def __init__(
490
+ self,
491
+ patterns: List[PIIPattern] = None,
492
+ min_confidence: float = 0.5,
493
+ context_boost: float = 0.1,
494
+ ):
495
+ """
496
+ Initialize scanner.
497
+
498
+ Args:
499
+ patterns: Custom patterns (defaults to PII_PATTERNS)
500
+ min_confidence: Minimum confidence to report (0.0-1.0)
501
+ context_boost: Confidence boost when context matches
502
+ """
503
+ self.patterns = patterns or PII_PATTERNS
504
+ self.min_confidence = min_confidence
505
+ self.context_boost = context_boost
506
+
507
+ def scan_text(
508
+ self,
509
+ text: str,
510
+ field_name: str = "",
511
+ row_index: int = -1,
512
+ ) -> List[PIIMatch]:
513
+ """
514
+ Scan text for PII.
515
+
516
+ Args:
517
+ text: Text to scan
518
+ field_name: Optional field name for tracking
519
+ row_index: Optional row index for tracking
520
+
521
+ Returns:
522
+ List of PIIMatch objects
523
+ """
524
+ if not text or not isinstance(text, str):
525
+ return []
526
+
527
+ matches = []
528
+ text_lower = text.lower()
529
+
530
+ for pattern in self.patterns:
531
+ for match in pattern.pattern.finditer(text):
532
+ value = match.group()
533
+ confidence = pattern.confidence
534
+
535
+ # Validate if validator provided
536
+ if pattern.validator:
537
+ if not pattern.validator(value):
538
+ continue
539
+
540
+ # Context boost
541
+ if pattern.context_patterns:
542
+ for ctx in pattern.context_patterns:
543
+ if ctx in text_lower:
544
+ confidence = min(1.0, confidence + self.context_boost)
545
+ break
546
+
547
+ # Apply minimum confidence filter
548
+ if confidence >= self.min_confidence:
549
+ # Get surrounding context (50 chars each side)
550
+ start = max(0, match.start() - 50)
551
+ end = min(len(text), match.end() + 50)
552
+ context = text[start:end]
553
+
554
+ matches.append(PIIMatch(
555
+ pii_type=pattern.pii_type,
556
+ severity=pattern.severity,
557
+ value=value,
558
+ start=match.start(),
559
+ end=match.end(),
560
+ confidence=confidence,
561
+ context=context,
562
+ field_name=field_name,
563
+ row_index=row_index,
564
+ ))
565
+
566
+ return matches
567
+
568
+ def scan_dict(
569
+ self,
570
+ data: Dict[str, List[Any]],
571
+ sample_size: int = 1000,
572
+ ) -> PIIScanResult:
573
+ """
574
+ Scan a columnar dict for PII.
575
+
576
+ Args:
577
+ data: Dict of column_name -> values
578
+ sample_size: Max rows to scan per column
579
+
580
+ Returns:
581
+ PIIScanResult with aggregated findings
582
+ """
583
+ result = PIIScanResult()
584
+
585
+ for field_name, values in data.items():
586
+ if not values:
587
+ continue
588
+
589
+ # Sample values
590
+ sample = values[:sample_size]
591
+
592
+ for row_idx, value in enumerate(sample):
593
+ if not isinstance(value, str):
594
+ value = str(value) if value is not None else ""
595
+
596
+ matches = self.scan_text(value, field_name, row_idx)
597
+
598
+ for match in matches:
599
+ result.total_matches += 1
600
+
601
+ # Count by type
602
+ type_name = match.pii_type.value
603
+ result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1
604
+
605
+ # Count by severity
606
+ sev = match.severity.value
607
+ result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1
608
+
609
+ # Count by field
610
+ result.matches_by_field[field_name] = result.matches_by_field.get(field_name, 0) + 1
611
+
612
+ # Track fields
613
+ result.fields_with_pii.add(field_name)
614
+ if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
615
+ result.high_risk_fields.add(field_name)
616
+
617
+ # Keep samples
618
+ if len(result.sample_matches) < 100:
619
+ result.sample_matches.append(match)
620
+
621
+ return result
622
+
623
+ def scan_dataset(
624
+ self,
625
+ dataset,
626
+ sample_size: int = 1000,
627
+ ) -> PIIScanResult:
628
+ """
629
+ Scan a HuggingFace Dataset or DatasetDict for PII.
630
+
631
+ Args:
632
+ dataset: HuggingFace Dataset or DatasetDict
633
+ sample_size: Max rows to scan
634
+
635
+ Returns:
636
+ PIIScanResult with aggregated findings
637
+ """
638
+ # Handle DatasetDict (multiple splits)
639
+ if hasattr(dataset, 'keys') and callable(dataset.keys):
640
+ combined = PIIScanResult()
641
+ for split_name in dataset.keys():
642
+ split_result = self.scan_dataset(dataset[split_name], sample_size)
643
+ # Merge results
644
+ combined.total_matches += split_result.total_matches
645
+ for k, v in split_result.matches_by_type.items():
646
+ combined.matches_by_type[k] = combined.matches_by_type.get(k, 0) + v
647
+ for k, v in split_result.matches_by_severity.items():
648
+ combined.matches_by_severity[k] = combined.matches_by_severity.get(k, 0) + v
649
+ for k, v in split_result.matches_by_field.items():
650
+ combined.matches_by_field[k] = combined.matches_by_field.get(k, 0) + v
651
+ combined.fields_with_pii.update(split_result.fields_with_pii)
652
+ combined.high_risk_fields.update(split_result.high_risk_fields)
653
+ combined.sample_matches.extend(split_result.sample_matches[:20])
654
+ return combined
655
+
656
+ # Single Dataset
657
+ result = PIIScanResult()
658
+
659
+ # Get column names
660
+ if hasattr(dataset, 'features'):
661
+ columns = list(dataset.features.keys())
662
+ elif hasattr(dataset, 'column_names'):
663
+ columns = dataset.column_names
664
+ else:
665
+ return result
666
+
667
+ # Sample rows
668
+ num_rows = len(dataset) if hasattr(dataset, '__len__') else sample_size
669
+ sample_indices = range(min(sample_size, num_rows))
670
+
671
+ for idx in sample_indices:
672
+ row = dataset[idx]
673
+ for col in columns:
674
+ value = row.get(col) if isinstance(row, dict) else getattr(row, col, None)
675
+ if not isinstance(value, str):
676
+ value = str(value) if value is not None else ""
677
+
678
+ matches = self.scan_text(value, col, idx)
679
+
680
+ for match in matches:
681
+ result.total_matches += 1
682
+
683
+ type_name = match.pii_type.value
684
+ result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1
685
+
686
+ sev = match.severity.value
687
+ result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1
688
+
689
+ result.matches_by_field[col] = result.matches_by_field.get(col, 0) + 1
690
+
691
+ result.fields_with_pii.add(col)
692
+ if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
693
+ result.high_risk_fields.add(col)
694
+
695
+ if len(result.sample_matches) < 100:
696
+ result.sample_matches.append(match)
697
+
698
+ return result
699
+
700
+
701
+ # Singleton scanner
702
+ _scanner = PIIScanner()
703
+
704
+
705
+ def scan_for_pii(
706
+ data,
707
+ sample_size: int = 1000,
708
+ min_confidence: float = 0.5,
709
+ ) -> PIIScanResult:
710
+ """
711
+ Convenience function to scan data for PII.
712
+
713
+ Args:
714
+ data: Text, dict, or HuggingFace Dataset
715
+ sample_size: Max rows to scan
716
+ min_confidence: Minimum confidence threshold
717
+
718
+ Returns:
719
+ PIIScanResult with findings
720
+ """
721
+ scanner = PIIScanner(min_confidence=min_confidence)
722
+
723
+ if isinstance(data, str):
724
+ matches = scanner.scan_text(data)
725
+ result = PIIScanResult(
726
+ total_matches=len(matches),
727
+ sample_matches=matches,
728
+ )
729
+ for m in matches:
730
+ result.matches_by_type[m.pii_type.value] = result.matches_by_type.get(m.pii_type.value, 0) + 1
731
+ result.matches_by_severity[m.severity.value] = result.matches_by_severity.get(m.severity.value, 0) + 1
732
+ return result
733
+
734
+ if isinstance(data, dict):
735
+ return scanner.scan_dict(data, sample_size)
736
+
737
+ # Assume HuggingFace Dataset
738
+ return scanner.scan_dataset(data, sample_size)
739
+
740
+
741
+ def quick_pii_check(data, sample_size: int = 100) -> bool:
742
+ """
743
+ Quick check if data contains any PII.
744
+
745
+ Returns True if PII is found, False otherwise.
746
+ """
747
+ result = scan_for_pii(data, sample_size=sample_size, min_confidence=0.7)
748
+ return result.total_matches > 0
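A minimal usage sketch of the scanner above (assuming the module is importable as cascade.data.pii; whether a given value actually matches depends on the PIIPattern regexes defined earlier in this file):

    from cascade.data.pii import scan_for_pii, quick_pii_check

    columns = {
        "name": ["Alice Smith", "Bob Jones"],
        "contact": ["alice@example.com", "555-867-5309"],
    }

    # Columnar dict -> aggregated PIIScanResult
    result = scan_for_pii(columns, sample_size=100, min_confidence=0.5)
    print(result.total_matches)             # total hits across sampled rows
    print(result.matches_by_field)          # per-column counts, e.g. {"contact": ...}
    print(sorted(result.high_risk_fields))  # columns with CRITICAL/HIGH hits

    # Fast boolean gate before publishing a dataset (stricter confidence)
    if quick_pii_check(columns, sample_size=100):
        print("PII detected - review before sharing")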
cascade/data/provenance.py ADDED
@@ -0,0 +1,503 @@
1
+ """
2
+ Provenance Graph
3
+
4
+ Tracks entities, activities, agents, and their relationships.
5
+ Supports Merkle tree hashing for tamper-evident lineage.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Dict, List, Optional, Set, Tuple, Iterator
13
+
14
+ from .entities import (
15
+ DatasetEntity, Activity, Agent, Relationship, RelationType,
16
+ ActivityType, AgentType, create_system_agent
17
+ )
18
+
19
+
20
+ @dataclass
21
+ class ProvenanceNode:
22
+ """A node in the provenance graph with hash chain."""
23
+ node_id: str
24
+ node_type: str # entity, activity, agent
25
+ data: Dict[str, Any]
26
+
27
+ # Hash chain
28
+ node_hash: str = ""
29
+ parent_hashes: List[str] = field(default_factory=list)
30
+
31
+ def __post_init__(self):
32
+ if not self.node_hash:
33
+ self.node_hash = self._compute_hash()
34
+
35
+ def _compute_hash(self) -> str:
36
+ """Compute hash including parent hashes (Merkle-style)."""
37
+ content = json.dumps({
38
+ "id": self.node_id,
39
+ "type": self.node_type,
40
+ "data": self.data,
41
+ "parents": sorted(self.parent_hashes),
42
+ }, sort_keys=True, default=str)
43
+ return hashlib.sha256(content.encode()).hexdigest()
44
+
45
+
46
+ class ProvenanceGraph:
47
+ """
48
+ A graph of provenance relationships.
49
+
50
+ Tracks:
51
+ - Entities (datasets, versions, splits)
52
+ - Activities (transforms, training, inference)
53
+ - Agents (users, models, pipelines)
54
+ - Relationships between them
55
+
56
+ Provides:
57
+ - Lineage queries (what produced this? what did this produce?)
58
+ - Hash chain for integrity verification
59
+ - Export to PROV-O and Croissant formats
60
+ """
61
+
62
+ def __init__(self, name: str = "default"):
63
+ self.name = name
64
+ self.created_at = time.time()
65
+
66
+ # Storage
67
+ self._entities: Dict[str, DatasetEntity] = {}
68
+ self._activities: Dict[str, Activity] = {}
69
+ self._agents: Dict[str, Agent] = {}
70
+ self._relationships: List[Relationship] = []
71
+
72
+ # Hash chain
73
+ self._nodes: Dict[str, ProvenanceNode] = {}
74
+ self._root_hash: Optional[str] = None
75
+
76
+ # Default system agent
77
+ self._system_agent = create_system_agent("cascade-data-observatory")
78
+ self.add_agent(self._system_agent)
79
+
80
+ # ═══════════════════════════════════════════════════════════════════════════
81
+ # ENTITY MANAGEMENT
82
+ # ═══════════════════════════════════════════════════════════════════════════
83
+
84
+ def add_entity(self, entity: DatasetEntity) -> str:
85
+ """Add a dataset entity to the graph."""
86
+ self._entities[entity.id] = entity
87
+
88
+ # Create provenance node
89
+ node = ProvenanceNode(
90
+ node_id=entity.id,
91
+ node_type="entity",
92
+ data=entity.to_dict(),
93
+ )
94
+ self._nodes[entity.id] = node
95
+ self._update_root_hash()
96
+
97
+ return entity.id
98
+
99
+ def get_entity(self, entity_id: str) -> Optional[DatasetEntity]:
100
+ """Get entity by ID."""
101
+ return self._entities.get(entity_id)
102
+
103
+ def list_entities(self) -> List[DatasetEntity]:
104
+ """List all entities."""
105
+ return list(self._entities.values())
106
+
107
+ # ═══════════════════════════════════════════════════════════════════════════
108
+ # ACTIVITY MANAGEMENT
109
+ # ═══════════════════════════════════════════════════════════════════════════
110
+
111
+ def add_activity(self, activity: Activity) -> str:
112
+ """Add an activity to the graph."""
113
+ self._activities[activity.id] = activity
114
+
115
+ # Link to agent
116
+ if not activity.agent_id:
117
+ activity.agent_id = self._system_agent.id
118
+
119
+ # Create provenance node with parent hashes from inputs
120
+ parent_hashes = []
121
+ for input_id in activity.inputs:
122
+ if input_id in self._nodes:
123
+ parent_hashes.append(self._nodes[input_id].node_hash)
124
+
125
+ node = ProvenanceNode(
126
+ node_id=activity.id,
127
+ node_type="activity",
128
+ data=activity.to_dict(),
129
+ parent_hashes=parent_hashes,
130
+ )
131
+ self._nodes[activity.id] = node
132
+ self._update_root_hash()
133
+
134
+ return activity.id
135
+
136
+ def get_activity(self, activity_id: str) -> Optional[Activity]:
137
+ """Get activity by ID."""
138
+ return self._activities.get(activity_id)
139
+
140
+ def list_activities(self) -> List[Activity]:
141
+ """List all activities."""
142
+ return list(self._activities.values())
143
+
144
+ # ═══════════════════════════════════════════════════════════════════════════
145
+ # AGENT MANAGEMENT
146
+ # ═══════════════════════════════════════════════════════════════════════════
147
+
148
+ def add_agent(self, agent: Agent) -> str:
149
+ """Add an agent to the graph."""
150
+ self._agents[agent.id] = agent
151
+
152
+ node = ProvenanceNode(
153
+ node_id=agent.id,
154
+ node_type="agent",
155
+ data=agent.to_dict(),
156
+ )
157
+ self._nodes[agent.id] = node
158
+
159
+ return agent.id
160
+
161
+ def get_agent(self, agent_id: str) -> Optional[Agent]:
162
+ """Get agent by ID."""
163
+ return self._agents.get(agent_id)
164
+
165
+ def list_agents(self) -> List[Agent]:
166
+ """List all agents."""
167
+ return list(self._agents.values())
168
+
169
+ def list_relationships(self) -> List[Relationship]:
170
+ """List all relationships."""
171
+ return list(self._relationships)
172
+
173
+ # ═══════════════════════════════════════════════════════════════════════════
174
+ # RELATIONSHIP MANAGEMENT
175
+ # ═══════════════════════════════════════════════════════════════════════════
176
+
177
+ def add_relationship(
178
+ self,
179
+ relation_type: RelationType,
180
+ source_id: str,
181
+ target_id: str,
182
+ attributes: Dict[str, Any] = None,
183
+ timestamp: float = None,
184
+ ) -> Relationship:
185
+ """Add a relationship between nodes."""
186
+ rel = Relationship(
187
+ relation_type=relation_type,
188
+ source_id=source_id,
189
+ target_id=target_id,
190
+ timestamp=timestamp if timestamp is not None else time.time(),
191
+ attributes=attributes or {},
192
+ )
193
+ self._relationships.append(rel)
194
+ return rel
195
+
196
+ def link_derivation(self, derived_id: str, source_id: str) -> Relationship:
197
+ """Record that derived entity came from source entity."""
198
+ return self.add_relationship(
199
+ RelationType.WAS_DERIVED_FROM,
200
+ source_id=derived_id,
201
+ target_id=source_id,
202
+ )
203
+
204
+ def link_generation(self, entity_id: str, activity_id: str) -> Relationship:
205
+ """Record that entity was generated by activity."""
206
+ return self.add_relationship(
207
+ RelationType.WAS_GENERATED_BY,
208
+ source_id=entity_id,
209
+ target_id=activity_id,
210
+ )
211
+
212
+ def link_usage(self, activity_id: str, entity_id: str) -> Relationship:
213
+ """Record that activity used entity as input."""
214
+ return self.add_relationship(
215
+ RelationType.USED,
216
+ source_id=activity_id,
217
+ target_id=entity_id,
218
+ )
219
+
220
+ def link_attribution(self, entity_id: str, agent_id: str) -> Relationship:
221
+ """Record that entity was attributed to agent."""
222
+ return self.add_relationship(
223
+ RelationType.WAS_ATTRIBUTED_TO,
224
+ source_id=entity_id,
225
+ target_id=agent_id,
226
+ )
227
+
228
+ def link_association(self, activity_id: str, agent_id: str) -> Relationship:
229
+ """Record that activity was associated with agent."""
230
+ return self.add_relationship(
231
+ RelationType.WAS_ASSOCIATED_WITH,
232
+ source_id=activity_id,
233
+ target_id=agent_id,
234
+ )
235
+
236
+ # ═══════════════════════════════════════════════════════════════════════════
237
+ # LINEAGE QUERIES
238
+ # ═══════════════════════════════════════════════════════════════════════════
239
+
240
+ def get_lineage(self, entity_id: str, direction: str = "upstream") -> List[str]:
241
+ """
242
+ Get lineage for an entity.
243
+
244
+ Args:
245
+ entity_id: The entity to trace
246
+ direction: "upstream" (what produced this) or "downstream" (what this produced)
247
+
248
+ Returns:
249
+ List of entity IDs in lineage order
250
+ """
251
+ visited: Set[str] = set()
252
+ lineage: List[str] = []
253
+
254
+ def trace(current_id: str):
255
+ if current_id in visited:
256
+ return
257
+ visited.add(current_id)
258
+
259
+ for rel in self._relationships:
260
+ if direction == "upstream":
261
+ # Follow wasDerivedFrom backwards
262
+ if rel.relation_type == RelationType.WAS_DERIVED_FROM:
263
+ if rel.source_id == current_id:
264
+ lineage.append(rel.target_id)
265
+ trace(rel.target_id)
266
+ else:
267
+ # Follow wasDerivedFrom forwards
268
+ if rel.relation_type == RelationType.WAS_DERIVED_FROM:
269
+ if rel.target_id == current_id:
270
+ lineage.append(rel.source_id)
271
+ trace(rel.source_id)
272
+
273
+ trace(entity_id)
274
+ return lineage
275
+
276
+ def get_activities_for_entity(self, entity_id: str) -> List[Activity]:
277
+ """Get activities that generated or used this entity."""
278
+ activity_ids = set()
279
+
280
+ for rel in self._relationships:
281
+ if rel.relation_type == RelationType.WAS_GENERATED_BY:
282
+ if rel.source_id == entity_id:
283
+ activity_ids.add(rel.target_id)
284
+ elif rel.relation_type == RelationType.USED:
285
+ if rel.target_id == entity_id:
286
+ activity_ids.add(rel.source_id)
287
+
288
+ return [self._activities[aid] for aid in activity_ids if aid in self._activities]
289
+
290
+ def get_inputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
291
+ """Get entities that were inputs to an activity."""
292
+ entity_ids = set()
293
+
294
+ for rel in self._relationships:
295
+ if rel.relation_type == RelationType.USED:
296
+ if rel.source_id == activity_id:
297
+ entity_ids.add(rel.target_id)
298
+
299
+ return [self._entities[eid] for eid in entity_ids if eid in self._entities]
300
+
301
+ def get_outputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
302
+ """Get entities that were outputs of an activity."""
303
+ entity_ids = set()
304
+
305
+ for rel in self._relationships:
306
+ if rel.relation_type == RelationType.WAS_GENERATED_BY:
307
+ if rel.target_id == activity_id:
308
+ entity_ids.add(rel.source_id)
309
+
310
+ return [self._entities[eid] for eid in entity_ids if eid in self._entities]
311
+
312
+ # ═══════════════════════════════════════════════════════════════════════════
313
+ # HASH CHAIN
314
+ # ═══════════════════════════════════════════════════════════════════════════
315
+
316
+ def _update_root_hash(self):
317
+ """Update the Merkle root hash."""
318
+ if not self._nodes:
319
+ self._root_hash = None
320
+ return
321
+
322
+ # Compute root from all node hashes
323
+ all_hashes = sorted([n.node_hash for n in self._nodes.values()])
324
+ combined = "".join(all_hashes)
325
+ self._root_hash = hashlib.sha256(combined.encode()).hexdigest()
326
+
327
+ @property
328
+ def root_hash(self) -> Optional[str]:
329
+ """Get the current Merkle root hash."""
330
+ return self._root_hash
331
+
332
+ def verify_integrity(self) -> Tuple[bool, List[str]]:
333
+ """
334
+ Verify integrity of the provenance graph.
335
+
336
+ Returns:
337
+ (is_valid, list of invalid node IDs)
338
+ """
339
+ invalid = []
340
+
341
+ for node_id, node in self._nodes.items():
342
+ expected_hash = node._compute_hash()
343
+ if expected_hash != node.node_hash:
344
+ invalid.append(node_id)
345
+
346
+ return len(invalid) == 0, invalid
347
+
348
+ # ═══════════════════════════════════════════════════════════════════════════
349
+ # EXPORT
350
+ # ═══════════════════════════════════════════════════════════════════════════
351
+
352
+ def to_dict(self) -> Dict[str, Any]:
353
+ """Export graph to dictionary."""
354
+ return {
355
+ "name": self.name,
356
+ "created_at": self.created_at,
357
+ "root_hash": self._root_hash,
358
+ "entities": {k: v.to_dict() for k, v in self._entities.items()},
359
+ "activities": {k: v.to_dict() for k, v in self._activities.items()},
360
+ "agents": {k: v.to_dict() for k, v in self._agents.items()},
361
+ "relationships": [r.to_dict() for r in self._relationships],
362
+ }
363
+
364
+ def to_prov_n(self) -> str:
365
+ """Export as PROV-N notation."""
366
+ lines = [
367
+ f"document",
368
+ f" prefix cascade <https://cascade.ai/ns/>",
369
+ f" prefix prov <http://www.w3.org/ns/prov#>",
370
+ f"",
371
+ ]
372
+
373
+ # Entities
374
+ for entity in self._entities.values():
375
+ lines.append(f" {entity.to_prov_n()}")
376
+
377
+ lines.append("")
378
+
379
+ # Activities
380
+ for activity in self._activities.values():
381
+ lines.append(f" {activity.to_prov_n()}")
382
+
383
+ lines.append("")
384
+
385
+ # Agents
386
+ for agent in self._agents.values():
387
+ lines.append(f" {agent.to_prov_n()}")
388
+
389
+ lines.append("")
390
+
391
+ # Relationships
392
+ for rel in self._relationships:
393
+ lines.append(f" {rel.to_prov_n()}")
394
+
395
+ lines.append("")
396
+ lines.append("endDocument")
397
+
398
+ return "\n".join(lines)
399
+
400
+ def to_prov_jsonld(self) -> Dict[str, Any]:
401
+ """Export as PROV-O JSON-LD."""
402
+ return {
403
+ "@context": {
404
+ "prov": "http://www.w3.org/ns/prov#",
405
+ "cascade": "https://cascade.ai/ns/",
406
+ "xsd": "http://www.w3.org/2001/XMLSchema#",
407
+ },
408
+ "@graph": [
409
+ *[e.to_dict() for e in self._entities.values()],
410
+ *[a.to_dict() for a in self._activities.values()],
411
+ *[a.to_dict() for a in self._agents.values()],
412
+ ],
413
+ }
414
+
415
+ @classmethod
416
+ def from_dict(cls, data: Dict[str, Any]) -> "ProvenanceGraph":
417
+ """Load graph from dictionary."""
418
+ graph = cls(name=data.get("name", "default"))
419
+ graph.created_at = data.get("created_at", time.time())
420
+
421
+ # Load entities
422
+ for entity_data in data.get("entities", {}).values():
423
+ entity = DatasetEntity(
424
+ id=entity_data["@id"],
425
+ name=entity_data["name"],
426
+ content_hash=entity_data.get("content_hash"),
427
+ schema_hash=entity_data.get("schema_hash"),
428
+ version=entity_data.get("version"),
429
+ previous_version=entity_data.get("previous_version"),
430
+ source_type=entity_data.get("source_type", "unknown"),
431
+ source_uri=entity_data.get("source_uri"),
432
+ record_count=entity_data.get("record_count"),
433
+ size_bytes=entity_data.get("size_bytes"),
434
+ splits=entity_data.get("splits", {}),
435
+ attributes=entity_data.get("attributes", {}),
436
+ created_at=entity_data.get("created_at", time.time()),
437
+ )
438
+ graph.add_entity(entity)
439
+
440
+ # Load activities
441
+ for activity_data in data.get("activities", {}).values():
442
+ activity = Activity(
443
+ id=activity_data["@id"],
444
+ activity_type=ActivityType(activity_data["activity_type"]),
445
+ name=activity_data["name"],
446
+ started_at=activity_data.get("started_at"),
447
+ ended_at=activity_data.get("ended_at"),
448
+ inputs=activity_data.get("inputs", []),
449
+ outputs=activity_data.get("outputs", []),
450
+ agent_id=activity_data.get("agent_id"),
451
+ parameters=activity_data.get("parameters", {}),
452
+ attributes=activity_data.get("attributes", {}),
453
+ )
454
+ graph.add_activity(activity)
455
+
456
+ # Load agents
457
+ for agent_data in data.get("agents", {}).values():
458
+ agent = Agent(
459
+ id=agent_data["@id"],
460
+ agent_type=AgentType(agent_data["agent_type"]),
461
+ name=agent_data["name"],
462
+ version=agent_data.get("version"),
463
+ parent_agent_id=agent_data.get("parent_agent_id"),
464
+ identifier=agent_data.get("identifier"),
465
+ attributes=agent_data.get("attributes", {}),
466
+ created_at=agent_data.get("created_at", time.time()),
467
+ )
468
+ graph.add_agent(agent)
469
+
470
+ # Load relationships
471
+ for rel_data in data.get("relationships", []):
472
+ graph.add_relationship(
473
+ relation_type=RelationType(rel_data["type"]),
474
+ source_id=rel_data["source"],
475
+ target_id=rel_data["target"],
476
+ attributes=rel_data.get("attributes", {}),
477
+ timestamp=rel_data.get("timestamp"),
478
+ )
479
+
480
+ return graph
481
+
482
+ # ═══════════════════════════════════════════════════════════════════════════
483
+ # STATISTICS
484
+ # ═══════════════════════════════════════════════════════════════════════════
485
+
486
+ @property
487
+ def stats(self) -> Dict[str, int]:
488
+ """Get graph statistics."""
489
+ return {
490
+ "entities": len(self._entities),
491
+ "activities": len(self._activities),
492
+ "agents": len(self._agents),
493
+ "relationships": len(self._relationships),
494
+ }
495
+
496
+ def __repr__(self) -> str:
497
+ stats = self.stats
498
+ return (
499
+ f"ProvenanceGraph(name='{self.name}', "
500
+ f"entities={stats['entities']}, "
501
+ f"activities={stats['activities']}, "
502
+ f"relationships={stats['relationships']})"
503
+ )
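An illustrative sketch of the ProvenanceGraph API above. The DatasetEntity/Activity constructors live in cascade/data/entities.py (not shown here), so the keyword arguments and the ActivityType.TRANSFORM member name are assumptions inferred from from_dict() above:

    from cascade.data.provenance import ProvenanceGraph
    from cascade.data.entities import DatasetEntity, Activity, ActivityType

    graph = ProvenanceGraph(name="imdb-pipeline")

    # Two dataset versions (remaining DatasetEntity fields assumed to default)
    raw = DatasetEntity(id="ds:imdb:raw", name="imdb-raw")
    clean = DatasetEntity(id="ds:imdb:clean", name="imdb-clean")
    graph.add_entity(raw)
    graph.add_entity(clean)

    # The activity that produced the clean version
    dedup = Activity(
        id="act:dedup-001",
        activity_type=ActivityType.TRANSFORM,  # assumed enum member name
        name="deduplicate",
        inputs=["ds:imdb:raw"],
        outputs=["ds:imdb:clean"],
    )
    graph.add_activity(dedup)

    # PROV-O style links
    graph.link_usage("act:dedup-001", "ds:imdb:raw")
    graph.link_generation("ds:imdb:clean", "act:dedup-001")
    graph.link_derivation("ds:imdb:clean", "ds:imdb:raw")

    print(graph.get_lineage("ds:imdb:clean"))   # -> ["ds:imdb:raw"]
    print(graph.root_hash)                      # Merkle root over all nodes
    is_valid, bad_nodes = graph.verify_integrity()
    print(graph.to_prov_n())                    # PROV-N document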
cascade/data/schema.py ADDED
@@ -0,0 +1,417 @@
1
+ """
2
+ Schema Observer
3
+
4
+ Observes and hashes dataset schemas/features.
5
+ Works with HuggingFace datasets Features, Pandas DataFrames, and raw dicts.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+
14
+ @dataclass
15
+ class FieldSchema:
16
+ """Schema for a single field/column."""
17
+ name: str
18
+ dtype: str # Normalized type name
19
+
20
+ # Type details
21
+ nullable: bool = True
22
+ is_list: bool = False
23
+ list_inner_type: Optional[str] = None
24
+
25
+ # For ClassLabel
26
+ is_categorical: bool = False
27
+ categories: Optional[List[str]] = None
28
+ num_categories: Optional[int] = None
29
+
30
+ # For nested structures
31
+ nested_fields: Optional[Dict[str, "FieldSchema"]] = None
32
+
33
+ # For arrays/tensors
34
+ shape: Optional[tuple] = None
35
+
36
+ # Constraints
37
+ min_value: Optional[float] = None
38
+ max_value: Optional[float] = None
39
+ pattern: Optional[str] = None # Regex for strings
40
+
41
+ # Metadata
42
+ description: Optional[str] = None
43
+
44
+ def to_dict(self) -> Dict[str, Any]:
45
+ result = {
46
+ "name": self.name,
47
+ "dtype": self.dtype,
48
+ "nullable": self.nullable,
49
+ }
50
+ if self.is_list:
51
+ result["is_list"] = True
52
+ result["list_inner_type"] = self.list_inner_type
53
+ if self.is_categorical:
54
+ result["is_categorical"] = True
55
+ result["categories"] = self.categories
56
+ result["num_categories"] = self.num_categories
57
+ if self.nested_fields:
58
+ result["nested_fields"] = {
59
+ k: v.to_dict() for k, v in self.nested_fields.items()
60
+ }
61
+ if self.shape:
62
+ result["shape"] = self.shape
63
+ if self.description:
64
+ result["description"] = self.description
65
+ return result
66
+
67
+ def hash(self) -> str:
68
+ """Hash this field's structure."""
69
+ content = json.dumps(self.to_dict(), sort_keys=True)
70
+ return hashlib.sha256(content.encode()).hexdigest()[:16]
71
+
72
+
73
+ @dataclass
74
+ class DatasetSchema:
75
+ """Complete schema for a dataset."""
76
+ fields: Dict[str, FieldSchema] = field(default_factory=dict)
77
+
78
+ # Dataset-level metadata
79
+ primary_key: Optional[List[str]] = None
80
+ foreign_keys: Dict[str, str] = field(default_factory=dict) # field → target
81
+
82
+ # Source info
83
+ source_format: Optional[str] = None # arrow, parquet, csv, etc.
84
+
85
+ def add_field(self, field_schema: FieldSchema):
86
+ """Add a field to the schema."""
87
+ self.fields[field_schema.name] = field_schema
88
+
89
+ @property
90
+ def field_names(self) -> List[str]:
91
+ return list(self.fields.keys())
92
+
93
+ @property
94
+ def num_fields(self) -> int:
95
+ return len(self.fields)
96
+
97
+ def to_dict(self) -> Dict[str, Any]:
98
+ return {
99
+ "fields": {k: v.to_dict() for k, v in self.fields.items()},
100
+ "primary_key": self.primary_key,
101
+ "foreign_keys": self.foreign_keys,
102
+ "source_format": self.source_format,
103
+ }
104
+
105
+ def hash(self) -> str:
106
+ """Compute schema hash - identifies structure regardless of content."""
107
+ # Sort fields for deterministic hashing
108
+ ordered_fields = sorted(self.fields.keys())
109
+ content = json.dumps({
110
+ "fields": [self.fields[k].to_dict() for k in ordered_fields],
111
+ "primary_key": self.primary_key,
112
+ }, sort_keys=True)
113
+ return hashlib.sha256(content.encode()).hexdigest()
114
+
115
+ def diff(self, other: "DatasetSchema") -> Dict[str, Any]:
116
+ """Compare two schemas and return differences."""
117
+ added = set(other.field_names) - set(self.field_names)
118
+ removed = set(self.field_names) - set(other.field_names)
119
+
120
+ modified = {}
121
+ for name in set(self.field_names) & set(other.field_names):
122
+ if self.fields[name].hash() != other.fields[name].hash():
123
+ modified[name] = {
124
+ "old": self.fields[name].to_dict(),
125
+ "new": other.fields[name].to_dict(),
126
+ }
127
+
128
+ return {
129
+ "added": list(added),
130
+ "removed": list(removed),
131
+ "modified": modified,
132
+ "compatible": len(removed) == 0 and len(modified) == 0,
133
+ }
134
+
135
+
136
+ class SchemaObserver:
137
+ """
138
+ Observes and extracts schemas from various data sources.
139
+ """
140
+
141
+ # Type mapping from various sources to normalized types
142
+ TYPE_MAP = {
143
+ # Python types
144
+ "str": "string",
145
+ "int": "int64",
146
+ "float": "float64",
147
+ "bool": "bool",
148
+ "bytes": "binary",
149
+
150
+ # NumPy types
151
+ "int8": "int8",
152
+ "int16": "int16",
153
+ "int32": "int32",
154
+ "int64": "int64",
155
+ "uint8": "uint8",
156
+ "uint16": "uint16",
157
+ "uint32": "uint32",
158
+ "uint64": "uint64",
159
+ "float16": "float16",
160
+ "float32": "float32",
161
+ "float64": "float64",
162
+
163
+ # Arrow types
164
+ "string": "string",
165
+ "large_string": "string",
166
+ "binary": "binary",
167
+ "large_binary": "binary",
168
+
169
+ # HuggingFace special types
170
+ "Image": "image",
171
+ "Audio": "audio",
172
+ "ClassLabel": "categorical",
173
+ }
174
+
175
+ def observe_hf_dataset(self, dataset) -> DatasetSchema:
176
+ """
177
+ Extract schema from HuggingFace Dataset.
178
+
179
+ Args:
180
+ dataset: A HuggingFace datasets.Dataset or DatasetDict
181
+
182
+ Returns:
183
+ DatasetSchema with all fields
184
+ """
185
+ schema = DatasetSchema(source_format="arrow")
186
+
187
+ # Get features (works for both Dataset and DatasetDict)
188
+ if hasattr(dataset, 'features'):
189
+ features = dataset.features
190
+ elif hasattr(dataset, '__iter__'):
191
+ # DatasetDict - get features from first split
192
+ first_split = next(iter(dataset.values()))
193
+ features = first_split.features
194
+ else:
195
+ raise ValueError(f"Cannot extract features from {type(dataset)}")
196
+
197
+ # Parse each feature
198
+ for name, feature in features.items():
199
+ field_schema = self._parse_hf_feature(name, feature)
200
+ schema.add_field(field_schema)
201
+
202
+ return schema
203
+
204
+ def _parse_hf_feature(self, name: str, feature) -> FieldSchema:
205
+ """Parse a HuggingFace Feature into FieldSchema."""
206
+ # Import here to avoid hard dependency
207
+ try:
208
+ from datasets import (
209
+ Value, ClassLabel, Sequence,
210
+ Array2D, Array3D, Array4D, Array5D,
211
+ Image, Audio
212
+ )
213
+ except ImportError:
214
+ # Fallback for when datasets not installed
215
+ return FieldSchema(name=name, dtype="unknown")
216
+
217
+ # Value type (primitives)
218
+ if isinstance(feature, Value):
219
+ return FieldSchema(
220
+ name=name,
221
+ dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
222
+ )
223
+
224
+ # ClassLabel (categorical)
225
+ if isinstance(feature, ClassLabel):
226
+ return FieldSchema(
227
+ name=name,
228
+ dtype="categorical",
229
+ is_categorical=True,
230
+ categories=feature.names,
231
+ num_categories=feature.num_classes,
232
+ )
233
+
234
+ # Sequence (list)
235
+ if isinstance(feature, Sequence):
236
+ inner = self._parse_hf_feature(f"{name}_inner", feature.feature)
237
+ return FieldSchema(
238
+ name=name,
239
+ dtype="list",
240
+ is_list=True,
241
+ list_inner_type=inner.dtype,
242
+ )
243
+
244
+ # Arrays
245
+ if isinstance(feature, (Array2D, Array3D, Array4D, Array5D)):
246
+ return FieldSchema(
247
+ name=name,
248
+ dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
249
+ shape=feature.shape,
250
+ )
251
+
252
+ # Image
253
+ if isinstance(feature, Image):
254
+ return FieldSchema(
255
+ name=name,
256
+ dtype="image",
257
+ )
258
+
259
+ # Audio
260
+ if isinstance(feature, Audio):
261
+ return FieldSchema(
262
+ name=name,
263
+ dtype="audio",
264
+ )
265
+
266
+ # Dict/nested structure
267
+ if isinstance(feature, dict):
268
+ nested = {}
269
+ for k, v in feature.items():
270
+ nested[k] = self._parse_hf_feature(k, v)
271
+ return FieldSchema(
272
+ name=name,
273
+ dtype="struct",
274
+ nested_fields=nested,
275
+ )
276
+
277
+ # Fallback
278
+ return FieldSchema(
279
+ name=name,
280
+ dtype=str(type(feature).__name__),
281
+ )
282
+
283
+ def observe_pandas(self, df) -> DatasetSchema:
284
+ """
285
+ Extract schema from Pandas DataFrame.
286
+
287
+ Args:
288
+ df: A pandas DataFrame
289
+
290
+ Returns:
291
+ DatasetSchema with all fields
292
+ """
293
+ schema = DatasetSchema(source_format="pandas")
294
+
295
+ for col in df.columns:
296
+ dtype = str(df[col].dtype)
297
+ normalized = self.TYPE_MAP.get(dtype, dtype)
298
+
299
+ # Check for categorical
300
+ if dtype == "category":
301
+ schema.add_field(FieldSchema(
302
+ name=col,
303
+ dtype="categorical",
304
+ is_categorical=True,
305
+ categories=list(df[col].cat.categories),
306
+ num_categories=len(df[col].cat.categories),
307
+ ))
308
+ else:
309
+ schema.add_field(FieldSchema(
310
+ name=col,
311
+ dtype=normalized,
312
+ nullable=df[col].isna().any(),
313
+ ))
314
+
315
+ return schema
316
+
317
+ def observe_dict(self, data: Dict[str, Any], sample_size: int = 100) -> DatasetSchema:
318
+ """
319
+ Extract schema from a dict of lists (columnar format).
320
+
321
+ Args:
322
+ data: Dict mapping column names to lists of values
323
+ sample_size: Number of values to sample for type inference
324
+
325
+ Returns:
326
+ DatasetSchema with all fields
327
+ """
328
+ schema = DatasetSchema(source_format="dict")
329
+
330
+ for col, values in data.items():
331
+ if not values:
332
+ schema.add_field(FieldSchema(name=col, dtype="unknown"))
333
+ continue
334
+
335
+ # Sample values for type inference
336
+ sample = values[:sample_size]
337
+ types = set(type(v).__name__ for v in sample if v is not None)
338
+
339
+ # Determine type
340
+ if len(types) == 0:
341
+ dtype = "null"
342
+ elif len(types) == 1:
343
+ dtype = self.TYPE_MAP.get(types.pop(), "unknown")
344
+ else:
345
+ dtype = "mixed"
346
+
347
+ # Check for nulls
348
+ nullable = any(v is None for v in sample)
349
+
350
+ schema.add_field(FieldSchema(
351
+ name=col,
352
+ dtype=dtype,
353
+ nullable=nullable,
354
+ ))
355
+
356
+ return schema
357
+
358
+ def observe_arrow(self, table) -> DatasetSchema:
359
+ """
360
+ Extract schema from PyArrow Table.
361
+
362
+ Args:
363
+ table: A pyarrow.Table
364
+
365
+ Returns:
366
+ DatasetSchema with all fields
367
+ """
368
+ schema = DatasetSchema(source_format="arrow")
369
+
370
+ for arrow_field in table.schema:
371
+ dtype = str(arrow_field.type)
372
+ normalized = self.TYPE_MAP.get(dtype, dtype)
373
+
374
+ schema.add_field(FieldSchema(
375
+ name=arrow_field.name,
376
+ dtype=normalized,
377
+ nullable=arrow_field.nullable,
378
+ ))
379
+
380
+ return schema
381
+
382
+
383
+ def hash_content(data, sample_size: int = 10000) -> str:
384
+ """
385
+ Compute content hash of dataset.
386
+
387
+ For large datasets, samples rows for efficiency.
388
+ """
389
+ hasher = hashlib.sha256()
390
+
391
+ # Handle dict first (dict also has __iter__ and __len__)
392
+ if isinstance(data, dict):
393
+ content = json.dumps(data, sort_keys=True, default=str)
394
+ hasher.update(content.encode())
395
+
396
+ # Handle list
397
+ elif isinstance(data, list):
398
+ for item in data[:sample_size]:
399
+ item_str = json.dumps(item, sort_keys=True, default=str)
400
+ hasher.update(item_str.encode())
401
+
402
+ # Handle HuggingFace Dataset or other iterables with __len__
403
+ elif hasattr(data, '__iter__') and hasattr(data, '__len__'):
404
+ # Sample if large
405
+ n = len(data)
406
+ if n > sample_size:
407
+ import random
408
+ indices = sorted(random.sample(range(n), sample_size))
409
+ sample = [data[i] for i in indices]
410
+ else:
411
+ sample = list(data)
412
+
413
+ for row in sample:
414
+ row_str = json.dumps(row, sort_keys=True, default=str)
415
+ hasher.update(row_str.encode())
416
+
417
+ return hasher.hexdigest()
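A short sketch of the SchemaObserver on a plain columnar dict, the one path above with no optional dependencies (module path assumed to be cascade.data.schema):

    from cascade.data.schema import SchemaObserver, hash_content

    observer = SchemaObserver()

    v1 = observer.observe_dict({"text": ["a", "b"], "label": [0, 1]})
    v2 = observer.observe_dict({"text": ["a", "b"], "label": [0.0, 1.0], "lang": ["en", "en"]})

    print(v1.hash())            # structure-only fingerprint, independent of row content
    diff = v1.diff(v2)
    print(diff["added"])        # ["lang"]
    print(diff["modified"])     # "label" changed: int64 -> float64
    print(diff["compatible"])   # False, because an existing field changed type

    print(hash_content({"text": ["a", "b"]}))   # hash of the data itself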
cascade/demo.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ CASCADE-LATTICE Interactive Demo
3
+
4
+ Launch the LunarLander demo showcasing:
5
+ - cascade.hold: Human-in-the-loop intervention
6
+ - cascade.store: Provenance tracking
7
+ - Merkle-chained decision records
8
+
9
+ Usage:
10
+ cascade-demo # Run the demo
11
+ python -m cascade.demo # Alternative
12
+
13
+ Controls:
14
+ [H] HOLD-FREEZE - Pause time, inspect AI decision
15
+ [T] HOLD-TAKEOVER - Continue time, YOU control with WASD
16
+ [ESC] Release hold, return to AI sovereignty
17
+
18
+ In HOLD modes:
19
+ [W] Main Engine (thrust up)
20
+ [A] Left Engine (rotate)
21
+ [D] Right Engine (rotate)
22
+ [S] No-op / Accept AI decision
23
+ """
24
+
25
+ import sys
26
+ import subprocess
27
+ from pathlib import Path
28
+
29
+
30
+ def check_demo_dependencies():
31
+ """Check if demo dependencies are installed."""
32
+ missing = []
33
+
34
+ try:
35
+ import gymnasium
36
+ except ImportError:
37
+ missing.append("gymnasium")
38
+
39
+ try:
40
+ import pygame
41
+ except ImportError:
42
+ missing.append("pygame")
43
+
44
+ try:
45
+ import stable_baselines3
46
+ except ImportError:
47
+ missing.append("stable-baselines3")
48
+
49
+ try:
50
+ import Box2D  # box2d-py installs the module as "Box2D"
51
+ except ImportError:
52
+ missing.append("box2d-py")
53
+
54
+ return missing
55
+
56
+
57
+ def main():
58
+ """Launch the interactive CASCADE-LATTICE demo."""
59
+ print("""
60
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
61
+ ║ ║
62
+ ║ ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗ ║
63
+ ║ ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝ ║
64
+ ║ ██║ ███████║███████╗██║ ███████║██║ ██║█████╗ ║
65
+ ║ ██║ ██╔══██║╚════██║██║ ██╔══██║██║ ██║██╔══╝ ║
66
+ ║ ╚██████╗██║ ██║███████║╚██████╗██║ ██║██████╔╝███████╗ ║
67
+ ║ ╚═════╝╚═╝ ╚═╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚══════╝ ║
68
+ ║ ║
69
+ ║ LATTICE DEMO - Sovereign Neural Internetwork Control ║
70
+ ║ ║
71
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
72
+ """)
73
+
74
+ # Check dependencies
75
+ missing = check_demo_dependencies()
76
+ if missing:
77
+ print(f"[!] Missing demo dependencies: {', '.join(missing)}")
78
+ print()
79
+ print(" Install with:")
80
+ print(" pip install cascade-lattice[demo]")
81
+ print()
82
+ print(" Or manually:")
83
+ print(f" pip install {' '.join(missing)}")
84
+ sys.exit(1)
85
+
86
+ # Check for rl-zoo3 (needed for model download)
87
+ try:
88
+ import rl_zoo3
89
+ except ImportError:
90
+ print("[!] Missing rl-zoo3 (needed for pretrained model)")
91
+ print(" pip install rl-zoo3")
92
+ sys.exit(1)
93
+
94
+ print("[CASCADE] Starting LunarLander demo...")
95
+ print()
96
+ print("Controls:")
97
+ print(" [H] HOLD-FREEZE - Pause time, inspect AI decision")
98
+ print(" [T] HOLD-TAKEOVER - Continue time, YOU control with WASD")
99
+ print(" [ESC] Release hold / Quit")
100
+ print()
101
+ print("In HOLD modes:")
102
+ print(" [W] Main Engine [A] Left Engine [D] Right Engine")
103
+ print(" [S] Accept AI choice / No-op")
104
+ print()
105
+
106
+ # Run the demo
107
+ demo_path = Path(__file__).parent.parent / "examples" / "sovereign_lattice_eval.py"
108
+
109
+ if not demo_path.exists():
110
+ # Try installed package location
111
+ import cascade
112
+ package_dir = Path(cascade.__file__).parent
113
+ demo_path = package_dir.parent / "examples" / "sovereign_lattice_eval.py"
114
+
115
+ if not demo_path.exists():
116
+ # Fallback: run inline demo
117
+ print("[!] Demo file not found. Running inline version...")
118
+ _run_inline_demo()
119
+ return
120
+
121
+ # Run the demo script
122
+ subprocess.run([sys.executable, str(demo_path)])
123
+
124
+
125
+ def _run_inline_demo():
126
+ """Minimal inline demo if main file not found."""
127
+ import gymnasium as gym
128
+ import numpy as np
129
+
130
+ from cascade import init
131
+ from cascade.hold import Hold
132
+ from cascade.store import observe
133
+
134
+ init(project="cascade_demo")
135
+ hold = Hold.get()
136
+
137
+ print("[CASCADE] Running minimal demo (install full package for GUI)")
138
+ print()
139
+
140
+ env = gym.make("LunarLander-v3")
141
+ obs, _ = env.reset()
142
+
143
+ for step in range(100):
144
+ # Random policy for minimal demo
145
+ action_probs = np.array([0.25, 0.25, 0.25, 0.25])
146
+
147
+ resolution = hold.yield_point(
148
+ action_probs=action_probs,
149
+ value=0.0,
150
+ observation={"state": obs.tolist()[:4]},
151
+ brain_id="random_demo",
152
+ action_labels=["NOOP", "LEFT", "MAIN", "RIGHT"],
153
+ blocking=False
154
+ )
155
+
156
+ obs, reward, term, trunc, _ = env.step(resolution.action)
157
+
158
+ observe("demo", {
159
+ "step": step,
160
+ "action": int(resolution.action),
161
+ "reward": float(reward),
162
+ "merkle": resolution.merkle_root,
163
+ }, sync=False)
164
+
165
+ if term or trunc:
166
+ print(f"[CASCADE] Episode ended at step {step}")
167
+ break
168
+
169
+ env.close()
170
+ print("[CASCADE] Demo complete. Check ~/.cascade/lattice for provenance data.")
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
cascade/demo_sdk.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ CASCADE SDK Demo - Shows automatic observation of calls.
3
+
4
+ Run: python -m cascade.demo_sdk
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ # Add cascade to path if needed
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+
14
+ def demo_manual_observation():
15
+ """Demo manual observation without any provider installed."""
16
+ print("=" * 60)
17
+ print("CASCADE SDK Demo - Manual Observation")
18
+ print("=" * 60)
19
+
20
+ import cascade
21
+ from cascade.sdk import CascadeSDK
22
+
23
+ # Initialize with verbose mode
24
+ sdk = CascadeSDK()
25
+ sdk.init(emit_async=False, verbose=True)
26
+
27
+ print("\n[1] Simulating an OpenAI call...")
28
+ sdk.observe(
29
+ model_id="openai/gpt-4",
30
+ input_data="What is the capital of France?",
31
+ output_data="The capital of France is Paris.",
32
+ metrics={"prompt_tokens": 10, "completion_tokens": 8, "total_tokens": 18},
33
+ context={"provider": "openai", "endpoint": "chat.completions"}
34
+ )
35
+
36
+ print("\n[2] Simulating an Anthropic call...")
37
+ sdk.observe(
38
+ model_id="anthropic/claude-3-opus-20240229",
39
+ input_data="Explain quantum entanglement simply.",
40
+ output_data="Quantum entanglement is when two particles become connected...",
41
+ metrics={"input_tokens": 6, "output_tokens": 45},
42
+ context={"provider": "anthropic", "endpoint": "messages"}
43
+ )
44
+
45
+ print("\n[3] Simulating an Ollama local call...")
46
+ sdk.observe(
47
+ model_id="ollama/llama2:7b",
48
+ input_data="Write a haiku about coding.",
49
+ output_data="Fingers on keyboard\nLogic flows like mountain stream\nBugs become features",
50
+ metrics={"eval_count": 20, "eval_duration": 1.5},
51
+ context={"provider": "ollama", "endpoint": "generate"}
52
+ )
53
+
54
+ print("\n" + "=" * 60)
55
+ print("Observations saved to lattice/observations/")
56
+ print("=" * 60)
57
+
58
+ # Show what was saved
59
+ from cascade.observation import ObservationManager
60
+ manager = ObservationManager()
61
+ stats = manager.get_stats()
62
+ print(f"\nTotal observations: {stats['total_observations']}")
63
+ print(f"Model observations: {stats['model_observations']}")
64
+ print(f"Unique models: {stats['unique_models']}")
65
+
66
+
67
+ def demo_auto_patch():
68
+ """Demo auto-patching (requires providers to be installed)."""
69
+ print("\n" + "=" * 60)
70
+ print("CASCADE Auto-Patch Demo")
71
+ print("=" * 60)
72
+
73
+ import cascade
74
+
75
+ # This patches all installed providers
76
+ cascade.init(verbose=True)
77
+
78
+ print("\nPatched providers. Now any call will emit receipts.")
79
+ print("Example usage:")
80
+ print("""
81
+ import cascade
82
+ cascade.init()
83
+
84
+ # OpenAI (if installed)
85
+ import openai
86
+ client = openai.OpenAI()
87
+ response = client.chat.completions.create(
88
+ model="gpt-4",
89
+ messages=[{"role": "user", "content": "Hello!"}]
90
+ )
91
+ # ^^^ Receipt automatically emitted to lattice
92
+
93
+ # Anthropic (if installed)
94
+ import anthropic
95
+ client = anthropic.Anthropic()
96
+ response = client.messages.create(
97
+ model="claude-3-opus-20240229",
98
+ max_tokens=100,
99
+ messages=[{"role": "user", "content": "Hello!"}]
100
+ )
101
+ # ^^^ Receipt automatically emitted to lattice
102
+
103
+ # Ollama (if installed)
104
+ import ollama
105
+ response = ollama.chat(model="llama2", messages=[
106
+ {"role": "user", "content": "Hello!"}
107
+ ])
108
+ # ^^^ Receipt automatically emitted to lattice
109
+ """)
110
+
111
+
112
+ if __name__ == "__main__":
113
+ demo_manual_observation()
114
+ demo_auto_patch()
cascade/export/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ CASCADE Export Module - Tableau and BI Integration
3
+ """
4
+
5
+ from .tableau_export import (
6
+ export_for_tableau,
7
+ export_events_csv,
8
+ export_chains_csv,
9
+ export_metrics_csv,
10
+ export_hold_events_csv,
11
+ export_causation_graph_csv,
12
+ TableauExporter,
13
+ )
14
+
15
+ __all__ = [
16
+ "export_for_tableau",
17
+ "export_events_csv",
18
+ "export_chains_csv",
19
+ "export_metrics_csv",
20
+ "export_hold_events_csv",
21
+ "export_causation_graph_csv",
22
+ "TableauExporter",
23
+ ]
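A hedged sketch of the re-exported API (TableauExporter itself is defined in tableau_export.py, shown next):

    import time
    from cascade.export import TableauExporter, export_for_tableau

    exporter = TableauExporter()
    exporter.add_metric(
        name="loss",
        value=0.42,
        timestamp=time.time(),
        category="TRAINING_DYNAMICS",
        component="trainer",
    )
    files = exporter.export("./tableau_data")   # writes metrics_timeseries.csv + manifest.json
    print(files)

    # Or the one-liner, which falls back to generated sample data
    # if the local Cascade store cannot be loaded:
    export_for_tableau("./tableau_data")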
cascade/export/tableau_export.py ADDED
@@ -0,0 +1,598 @@
1
+ """
2
+ CASCADE → Tableau Export Pipeline
3
+
4
+ Exports Cascade data in Tableau-friendly formats:
5
+ - CSV files (universal)
6
+ - Hyper files (native Tableau, optional)
7
+
8
+ Usage:
9
+ from cascade.export import export_for_tableau
10
+
11
+ # Export all data to a directory
12
+ export_for_tableau("./tableau_data")
13
+
14
+ # Then in Tableau: Connect → Text File → select CSVs
15
+ """
16
+
17
+ import csv
18
+ import json
19
+ import os
20
+ from pathlib import Path
21
+ from datetime import datetime
22
+ from typing import Dict, List, Any, Optional
23
+ from dataclasses import dataclass, asdict
24
+
25
+ # Try to import Hyper API (optional)
26
+ try:
27
+ from tableauhyperapi import (
28
+ HyperProcess, Telemetry, Connection, CreateMode,
29
+ TableDefinition, SqlType, TableName, Inserter
30
+ )
31
+ HAS_HYPER = True
32
+ except ImportError:
33
+ HAS_HYPER = False
34
+
35
+
36
+ @dataclass
37
+ class EventRow:
38
+ """Flattened event for Tableau."""
39
+ event_id: str
40
+ timestamp: float
41
+ timestamp_iso: str
42
+ component: str
43
+ event_type: str
44
+ data_json: str
45
+ # Extracted common fields
46
+ loss: Optional[float] = None
47
+ accuracy: Optional[float] = None
48
+ learning_rate: Optional[float] = None
49
+ epoch: Optional[int] = None
50
+ step: Optional[int] = None
51
+ tokens: Optional[int] = None
52
+ latency_ms: Optional[float] = None
53
+ error_message: Optional[str] = None
54
+
55
+
56
+ @dataclass
57
+ class ChainRow:
58
+ """Flattened provenance chain for Tableau."""
59
+ session_id: str
60
+ model_id: str
61
+ model_hash: str
62
+ input_hash: str
63
+ output_hash: Optional[str]
64
+ merkle_root: str
65
+ created_at: float
66
+ created_at_iso: str
67
+ record_count: int
68
+ external_links_count: int
69
+ is_verified: bool
70
+
71
+
72
+ @dataclass
73
+ class HoldEventRow:
74
+ """Flattened HOLD event for Tableau."""
75
+ hold_id: str
76
+ timestamp: float
77
+ timestamp_iso: str
78
+ brain_id: str
79
+ state: str # PENDING, ACCEPTED, OVERRIDDEN, TIMEOUT
80
+ ai_choice: int
81
+ ai_confidence: float
82
+ final_action: int
83
+ was_override: bool
84
+ hold_duration_sec: float
85
+ value_estimate: float
86
+ action_count: int
87
+ override_source: Optional[str] = None
88
+
89
+
90
+ @dataclass
91
+ class CausationEdgeRow:
92
+ """Flattened causation link for Tableau."""
93
+ link_id: str
94
+ from_event_id: str
95
+ to_event_id: str
96
+ causation_type: str # temporal, correlation, threshold, direct
97
+ strength: float
98
+ timestamp: float
99
+ timestamp_iso: str
100
+
101
+
102
+ @dataclass
103
+ class MetricRow:
104
+ """Time-series metric for Tableau."""
105
+ timestamp: float
106
+ timestamp_iso: str
107
+ metric_name: str
108
+ metric_value: float
109
+ category: str # TRAINING_DYNAMICS, GRADIENT_HEALTH, etc.
110
+ component: str
111
+ is_anomaly: bool
112
+ anomaly_severity: Optional[str] = None
113
+
114
+
115
+ def _ts_to_iso(ts: float) -> str:
116
+ """Convert Unix timestamp to ISO string."""
117
+ try:
118
+ return datetime.fromtimestamp(ts).isoformat()
119
+ except (OverflowError, OSError, ValueError):
120
+ return ""
121
+
122
+
123
+ def _extract_metric_fields(data: Dict) -> Dict[str, Any]:
124
+ """Extract common metric fields from event data."""
125
+ return {
126
+ "loss": data.get("loss"),
127
+ "accuracy": data.get("accuracy") or data.get("acc"),
128
+ "learning_rate": data.get("learning_rate") or data.get("lr"),
129
+ "epoch": data.get("epoch"),
130
+ "step": data.get("step") or data.get("iter"),
131
+ "tokens": data.get("tokens") or data.get("total_tokens"),
132
+ "latency_ms": data.get("latency_ms") or data.get("latency"),
133
+ "error_message": data.get("error") or data.get("message"),
134
+ }
135
+
136
+
137
+ class TableauExporter:
138
+ """
139
+ Export Cascade data for Tableau visualization.
140
+
141
+ Creates a directory with CSV files ready for Tableau import:
142
+ - events.csv: All observed events
143
+ - chains.csv: Provenance chains
144
+ - hold_events.csv: HOLD protocol events
145
+ - causation_edges.csv: Graph edges for relationship diagrams
146
+ - metrics_timeseries.csv: Metrics over time
147
+
148
+ Example:
149
+ exporter = TableauExporter()
150
+ exporter.add_events(events)
151
+ exporter.add_chains(chains)
152
+ exporter.export("./tableau_data")
153
+ """
154
+
155
+ def __init__(self):
156
+ self.events: List[EventRow] = []
157
+ self.chains: List[ChainRow] = []
158
+ self.hold_events: List[HoldEventRow] = []
159
+ self.causation_edges: List[CausationEdgeRow] = []
160
+ self.metrics: List[MetricRow] = []
161
+
162
+ def add_event(self, event) -> None:
163
+ """Add a Cascade Event."""
164
+ data = event.data if hasattr(event, 'data') else {}
165
+ extracted = _extract_metric_fields(data)
166
+
167
+ row = EventRow(
168
+ event_id=event.event_id,
169
+ timestamp=event.timestamp,
170
+ timestamp_iso=_ts_to_iso(event.timestamp),
171
+ component=event.component,
172
+ event_type=event.event_type,
173
+ data_json=json.dumps(data),
174
+ **extracted
175
+ )
176
+ self.events.append(row)
177
+
178
+ def add_events(self, events) -> None:
179
+ """Add multiple events."""
180
+ for e in events:
181
+ self.add_event(e)
182
+
183
+ def add_chain(self, chain, is_verified: bool = True) -> None:
184
+ """Add a ProvenanceChain."""
185
+ row = ChainRow(
186
+ session_id=chain.session_id,
187
+ model_id=chain.model_id,
188
+ model_hash=chain.model_hash,
189
+ input_hash=chain.input_hash,
190
+ output_hash=chain.output_hash,
191
+ merkle_root=chain.merkle_root or "",
192
+ created_at=chain.created_at,
193
+ created_at_iso=_ts_to_iso(chain.created_at),
194
+ record_count=len(chain.records),
195
+ external_links_count=len(chain.external_roots),
196
+ is_verified=is_verified,
197
+ )
198
+ self.chains.append(row)
199
+
200
+ def add_chains(self, chains) -> None:
201
+ """Add multiple chains."""
202
+ for c in chains:
203
+ self.add_chain(c)
204
+
205
+ def add_hold_event(self, hold_point, resolution) -> None:
206
+ """Add a HOLD event with its resolution."""
207
+ import numpy as np
208
+
209
+ probs = hold_point.action_probs
210
+ if isinstance(probs, np.ndarray):
211
+ ai_choice = int(np.argmax(probs))
212
+ ai_confidence = float(np.max(probs))
213
+ action_count = len(probs)
214
+ else:
215
+ ai_choice = 0
216
+ ai_confidence = 0.0
217
+ action_count = 0
218
+
219
+ row = HoldEventRow(
220
+ hold_id=getattr(hold_point, 'hold_id', f"hold_{hold_point.timestamp}"),
221
+ timestamp=hold_point.timestamp if hasattr(hold_point, 'timestamp') else 0,
222
+ timestamp_iso=_ts_to_iso(hold_point.timestamp) if hasattr(hold_point, 'timestamp') else "",
223
+ brain_id=hold_point.brain_id,
224
+ state=resolution.state.value if hasattr(resolution.state, 'value') else str(resolution.state),
225
+ ai_choice=ai_choice,
226
+ ai_confidence=ai_confidence,
227
+ final_action=resolution.action,
228
+ was_override=resolution.was_override,
229
+ hold_duration_sec=resolution.hold_duration if hasattr(resolution, 'hold_duration') else 0,
230
+ value_estimate=hold_point.value,
231
+ action_count=action_count,
232
+ override_source=resolution.override_source if hasattr(resolution, 'override_source') else None,
233
+ )
234
+ self.hold_events.append(row)
235
+
236
+ def add_causation_link(self, link) -> None:
237
+ """Add a causation graph edge."""
238
+ row = CausationEdgeRow(
239
+ link_id=link.link_id if hasattr(link, 'link_id') else f"{link.from_event}_{link.to_event}",
240
+ from_event_id=link.from_event,
241
+ to_event_id=link.to_event,
242
+ causation_type=link.causation_type,
243
+ strength=link.strength,
244
+ timestamp=link.timestamp if hasattr(link, 'timestamp') else 0,
245
+ timestamp_iso=_ts_to_iso(link.timestamp) if hasattr(link, 'timestamp') else "",
246
+ )
247
+ self.causation_edges.append(row)
248
+
249
+ def add_causation_links(self, links) -> None:
250
+ """Add multiple causation links."""
251
+ for link in links:
252
+ self.add_causation_link(link)
253
+
254
+ def add_metric(self, name: str, value: float, timestamp: float,
255
+ category: str = "OTHER", component: str = "default",
256
+ is_anomaly: bool = False, anomaly_severity: str = None) -> None:
257
+ """Add a time-series metric point."""
258
+ row = MetricRow(
259
+ timestamp=timestamp,
260
+ timestamp_iso=_ts_to_iso(timestamp),
261
+ metric_name=name,
262
+ metric_value=value,
263
+ category=category,
264
+ component=component,
265
+ is_anomaly=is_anomaly,
266
+ anomaly_severity=anomaly_severity,
267
+ )
268
+ self.metrics.append(row)
269
+
270
+ def add_metrics_from_event(self, event, category_map: Dict[str, str] = None) -> None:
271
+ """Extract and add all metrics from an event."""
272
+ if category_map is None:
273
+ category_map = {
274
+ "loss": "TRAINING_DYNAMICS",
275
+ "accuracy": "TRAINING_DYNAMICS",
276
+ "lr": "TRAINING_DYNAMICS",
277
+ "learning_rate": "TRAINING_DYNAMICS",
278
+ "grad_norm": "GRADIENT_HEALTH",
279
+ "weight_norm": "WEIGHT_DYNAMICS",
280
+ "tokens": "MEMORY_COMPUTE",
281
+ "latency": "MEMORY_COMPUTE",
282
+ }
283
+
284
+ data = event.data if hasattr(event, 'data') else {}
285
+ for key, value in data.items():
286
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
287
+ self.add_metric(
288
+ name=key,
289
+ value=float(value),
290
+ timestamp=event.timestamp,
291
+ category=category_map.get(key, "OTHER"),
292
+ component=event.component,
293
+ )
294
+
295
+ def _write_csv(self, path: Path, rows: List, fieldnames: List[str]) -> None:
296
+ """Write rows to CSV."""
297
+ with open(path, 'w', newline='', encoding='utf-8') as f:
298
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
299
+ writer.writeheader()
300
+ for row in rows:
301
+ writer.writerow(asdict(row) if hasattr(row, '__dataclass_fields__') else row)
302
+
303
+ def export(self, output_dir: str) -> Dict[str, str]:
304
+ """
305
+ Export all data to CSV files.
306
+
307
+ Args:
308
+ output_dir: Directory to write CSV files
309
+
310
+ Returns:
311
+ Dict mapping data type to file path
312
+ """
313
+ output_path = Path(output_dir)
314
+ output_path.mkdir(parents=True, exist_ok=True)
315
+
316
+ files = {}
317
+
318
+ # Events
319
+ if self.events:
320
+ events_path = output_path / "events.csv"
321
+ self._write_csv(events_path, self.events, list(EventRow.__dataclass_fields__.keys()))
322
+ files["events"] = str(events_path)
323
+ print(f"✓ Exported {len(self.events)} events to {events_path}")
324
+
325
+ # Chains
326
+ if self.chains:
327
+ chains_path = output_path / "chains.csv"
328
+ self._write_csv(chains_path, self.chains, list(ChainRow.__dataclass_fields__.keys()))
329
+ files["chains"] = str(chains_path)
330
+ print(f"✓ Exported {len(self.chains)} chains to {chains_path}")
331
+
332
+ # HOLD events
333
+ if self.hold_events:
334
+ hold_path = output_path / "hold_events.csv"
335
+ self._write_csv(hold_path, self.hold_events, list(HoldEventRow.__dataclass_fields__.keys()))
336
+ files["hold_events"] = str(hold_path)
337
+ print(f"✓ Exported {len(self.hold_events)} HOLD events to {hold_path}")
338
+
339
+ # Causation edges
340
+ if self.causation_edges:
341
+ edges_path = output_path / "causation_edges.csv"
342
+ self._write_csv(edges_path, self.causation_edges, list(CausationEdgeRow.__dataclass_fields__.keys()))
343
+ files["causation_edges"] = str(edges_path)
344
+ print(f"✓ Exported {len(self.causation_edges)} causation edges to {edges_path}")
345
+
346
+ # Metrics time series
347
+ if self.metrics:
348
+ metrics_path = output_path / "metrics_timeseries.csv"
349
+ self._write_csv(metrics_path, self.metrics, list(MetricRow.__dataclass_fields__.keys()))
350
+ files["metrics"] = str(metrics_path)
351
+ print(f"✓ Exported {len(self.metrics)} metric points to {metrics_path}")
352
+
353
+ # Write a manifest
354
+ manifest_path = output_path / "manifest.json"
355
+ manifest = {
356
+ "exported_at": datetime.now().isoformat(),
357
+ "files": files,
358
+ "counts": {
359
+ "events": len(self.events),
360
+ "chains": len(self.chains),
361
+ "hold_events": len(self.hold_events),
362
+ "causation_edges": len(self.causation_edges),
363
+ "metrics": len(self.metrics),
364
+ }
365
+ }
366
+ with open(manifest_path, 'w') as f:
367
+ json.dump(manifest, f, indent=2)
368
+
369
+ print(f"\n📊 Tableau export complete: {output_path}")
370
+ print(f" Open Tableau → Connect → Text File → Select CSVs")
371
+
372
+ return files
373
+
374
+ def export_hyper(self, output_path: str) -> Optional[str]:
375
+ """
376
+ Export to Tableau Hyper format (native, fastest).
377
+
378
+ Requires: pip install tableauhyperapi
379
+ """
380
+ if not HAS_HYPER:
381
+ print("⚠️ Hyper API not installed. Run: pip install tableauhyperapi")
382
+ return None
383
+
384
+ hyper_path = Path(output_path)
385
+
386
+ with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
387
+ with Connection(hyper.endpoint, str(hyper_path), CreateMode.CREATE_AND_REPLACE) as conn:
388
+
389
+ # Create events table
390
+ if self.events:
391
+ events_table = TableDefinition(
392
+ TableName("events"),
393
+ [
394
+ ("event_id", SqlType.text()),
395
+ ("timestamp", SqlType.double()),
396
+ ("timestamp_iso", SqlType.text()),
397
+ ("component", SqlType.text()),
398
+ ("event_type", SqlType.text()),
399
+ ("loss", SqlType.double()),
400
+ ("accuracy", SqlType.double()),
401
+ ("tokens", SqlType.int()),
402
+ ]
403
+ )
404
+ conn.catalog.create_table(events_table)
405
+
406
+ with Inserter(conn, events_table) as inserter:
407
+ for e in self.events:
408
+ inserter.add_row([
409
+ e.event_id, e.timestamp, e.timestamp_iso,
410
+ e.component, e.event_type,
411
+ e.loss, e.accuracy, e.tokens
412
+ ])
413
+ inserter.execute()
414
+
415
+ print(f"✓ Exported Hyper file: {hyper_path}")
416
+ return str(hyper_path)
417
+
418
+
419
+ # =============================================================================
420
+ # Convenience Functions
421
+ # =============================================================================
422
+
423
+ def export_for_tableau(output_dir: str = "./tableau_export",
424
+ include_sample_data: bool = True) -> Dict[str, str]:
425
+ """
426
+ One-line export of all Cascade data for Tableau.
427
+
428
+ Args:
429
+ output_dir: Where to write CSV files
430
+ include_sample_data: Generate sample data if no real data
431
+
432
+ Returns:
433
+ Dict of exported file paths
434
+ """
435
+ exporter = TableauExporter()
436
+
437
+ # Try to load real data from Cascade store
438
+ try:
439
+ from cascade.store import query, stats
440
+ from cascade.observation import ObservationManager
441
+
442
+ # Get observations
443
+ manager = ObservationManager()
444
+ observations = manager.get_recent(limit=1000)
445
+
446
+ for obs in observations:
447
+ # Create mock event from observation
448
+ class MockEvent:
449
+ def __init__(self, o):
450
+ self.event_id = o.get('cid', '')
451
+ self.timestamp = o.get('timestamp', 0)
452
+ self.component = o.get('model_id', 'unknown')
453
+ self.event_type = 'inference'
454
+ self.data = o.get('data', {})
455
+
456
+ exporter.add_event(MockEvent(obs))
457
+ exporter.add_metrics_from_event(MockEvent(obs))
458
+
459
+ print(f"Loaded {len(observations)} observations from Cascade store")
460
+
461
+ except Exception as e:
462
+ print(f"Note: Could not load Cascade store ({e})")
463
+ if include_sample_data:
464
+ print("Generating sample data for demo...")
465
+ _add_sample_data(exporter)
466
+
467
+ return exporter.export(output_dir)
468
+
469
+
470
+ def _add_sample_data(exporter: TableauExporter) -> None:
471
+ """Add sample data for demonstration."""
472
+ import time
473
+ import random
474
+
475
+ base_time = time.time() - 3600 # 1 hour ago
476
+
477
+ # Sample events
478
+ models = ["gpt-4", "claude-3-opus", "llama-3-8b", "mistral-7b"]
479
+ event_types = ["inference", "training_step", "error", "checkpoint"]
480
+
481
+ for i in range(200):
482
+ class SampleEvent:
483
+ def __init__(self, idx):
484
+ self.event_id = f"evt_{idx:06d}"
485
+ self.timestamp = base_time + (idx * 18) # 18 sec apart
486
+ self.component = random.choice(models)
487
+ self.event_type = random.choice(event_types)
488
+ self.data = {
489
+ "loss": 2.5 - (idx * 0.01) + random.uniform(-0.1, 0.1),
490
+ "accuracy": min(0.95, 0.5 + (idx * 0.002) + random.uniform(-0.02, 0.02)),
491
+ "tokens": random.randint(100, 2000),
492
+ "latency_ms": random.uniform(50, 500),
493
+ "step": idx,
494
+ }
495
+
496
+ event = SampleEvent(i)
497
+ exporter.add_event(event)
498
+ exporter.add_metrics_from_event(event)
499
+
500
+ # Sample HOLD events
501
+ for i in range(20):
502
+ class SampleHoldPoint:
503
+ def __init__(self, idx):
504
+ import numpy as np
505
+ self.hold_id = f"hold_{idx:04d}"
506
+ self.timestamp = base_time + (idx * 180)
507
+ self.brain_id = random.choice(models)
508
+ self.action_probs = np.random.dirichlet([1, 1, 1, 1])
509
+ self.value = random.uniform(0.3, 0.9)
510
+
511
+ class SampleResolution:
512
+ def __init__(self, override=False):
513
+ self.state = type('State', (), {'value': 'OVERRIDDEN' if override else 'ACCEPTED'})()
514
+ self.action = random.randint(0, 3)
515
+ self.was_override = override
516
+ self.hold_duration = random.uniform(0.5, 10.0)
517
+ self.override_source = "human" if override else None
518
+
519
+ hold = SampleHoldPoint(i)
520
+ resolution = SampleResolution(override=random.random() < 0.25)
521
+ exporter.add_hold_event(hold, resolution)
522
+
523
+ # Sample causation edges
524
+ for i in range(50):
525
+ class SampleLink:
526
+ def __init__(self, idx):
527
+ self.link_id = f"link_{idx:04d}"
528
+ self.from_event = f"evt_{idx:06d}"
529
+ self.to_event = f"evt_{idx+1:06d}"
530
+ self.causation_type = random.choice(["temporal", "correlation", "threshold", "direct"])
531
+ self.strength = random.uniform(0.5, 1.0)
532
+ self.timestamp = base_time + (idx * 18)
533
+
534
+ exporter.add_causation_link(SampleLink(i))
535
+
536
+ # Sample chains
537
+ for i in range(10):
538
+ class SampleChain:
539
+ def __init__(self, idx):
540
+ self.session_id = f"session_{idx:04d}"
541
+ self.model_id = random.choice(models)
542
+ self.model_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
543
+ self.input_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
544
+ self.output_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
545
+ self.merkle_root = f"{random.randint(0, 0xFFFFFFFFFFFFFFFF):016x}"
546
+ self.created_at = base_time + (idx * 360)
547
+ self.records = [None] * random.randint(5, 50)
548
+ self.external_roots = [f"root_{j}" for j in range(random.randint(0, 3))]
549
+
550
+ exporter.add_chain(SampleChain(i))
551
+
552
+
553
+ def export_events_csv(events, output_path: str) -> str:
554
+ """Export events to CSV."""
555
+ exporter = TableauExporter()
556
+ exporter.add_events(events)
557
+ files = exporter.export(str(Path(output_path).parent))
558
+ return files.get("events", "")
559
+
560
+
561
+ def export_chains_csv(chains, output_path: str) -> str:
562
+ """Export chains to CSV."""
563
+ exporter = TableauExporter()
564
+ exporter.add_chains(chains)
565
+ files = exporter.export(str(Path(output_path).parent))
566
+ return files.get("chains", "")
567
+
568
+
569
+ def export_metrics_csv(events, output_path: str) -> str:
570
+ """Export metrics time series to CSV."""
571
+ exporter = TableauExporter()
572
+ for e in events:
573
+ exporter.add_metrics_from_event(e)
574
+ files = exporter.export(str(Path(output_path).parent))
575
+ return files.get("metrics", "")
576
+
577
+
578
+ def export_hold_events_csv(hold_pairs, output_path: str) -> str:
579
+ """Export HOLD events to CSV. hold_pairs = [(hold_point, resolution), ...]"""
580
+ exporter = TableauExporter()
581
+ for hold, res in hold_pairs:
582
+ exporter.add_hold_event(hold, res)
583
+ files = exporter.export(str(Path(output_path).parent))
584
+ return files.get("hold_events", "")
585
+
586
+
587
+ def export_causation_graph_csv(links, output_path: str) -> str:
588
+ """Export causation edges to CSV."""
589
+ exporter = TableauExporter()
590
+ exporter.add_causation_links(links)
591
+ files = exporter.export(str(Path(output_path).parent))
592
+ return files.get("causation_edges", "")
593
+
594
+
595
+ if __name__ == "__main__":
596
+ # Quick test
597
+ print("Exporting sample data for Tableau...")
598
+ export_for_tableau("./tableau_export", include_sample_data=True)
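For orientation, a minimal usage sketch of the export helper defined above. The import path cascade.export.tableau_export is assumed from the package layout, and the output directory name is arbitrary; export_for_tableau falls back to generated sample data when no Cascade store can be loaded.

    # Assumed import path; adjust if the package exposes the helper elsewhere.
    from cascade.export.tableau_export import export_for_tableau

    # Writes events.csv, chains.csv, hold_events.csv, causation_edges.csv and
    # metrics_timeseries.csv (whichever have rows) plus manifest.json.
    files = export_for_tableau("./tableau_export", include_sample_data=True)
    print(files)  # e.g. {"events": "tableau_export/events.csv", ...}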
cascade/forensics/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Read the Ghost in the Data
3
+
4
+ Every dataset is a confession. It remembers what happened to it.
5
+ This module reads those memories.
6
+
7
+ GHOST LOG: Inferred processing history from data artifacts
8
+ SKELETON: Probable system architecture
9
+ DNA: Technology fingerprints
10
+ SOUL: Behavioral predictions
11
+
12
+ Usage:
13
+ from cascade.forensics import DataForensics
14
+
15
+ forensics = DataForensics()
16
+ report = forensics.analyze(dataframe)
17
+
18
+ print(report.ghost_log) # Inferred operations
19
+ print(report.skeleton) # System architecture
20
+ print(report.fingerprints) # Technology hints
21
+ """
22
+
23
+ from cascade.forensics.analyzer import (
24
+ DataForensics,
25
+ ForensicsReport,
26
+ GhostLog,
27
+ InferredOperation,
28
+ )
29
+
30
+ from cascade.forensics.artifacts import (
31
+ ArtifactDetector,
32
+ TimestampArtifacts,
33
+ IDPatternArtifacts,
34
+ TextArtifacts,
35
+ NumericArtifacts,
36
+ NullPatternArtifacts,
37
+ SchemaArtifacts,
38
+ )
39
+
40
+ from cascade.forensics.fingerprints import (
41
+ TechFingerprinter,
42
+ Fingerprint,
43
+ )
44
+
45
+ __all__ = [
46
+ "DataForensics",
47
+ "ForensicsReport",
48
+ "GhostLog",
49
+ "InferredOperation",
50
+ "ArtifactDetector",
51
+ "TechFingerprinter",
52
+ "Fingerprint",
53
+ ]
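A short, self-contained sketch of the entry point re-exported above, assuming pandas is installed; the toy DataFrame is purely illustrative.

    import pandas as pd
    from cascade.forensics import DataForensics

    # Gapped IDs, a timestamp column and missing emails: the kinds of
    # patterns the detectors defined later in this diff look for.
    df = pd.DataFrame({
        "user_id": [1, 2, 4, 7, 9],
        "created_at": pd.date_range("2024-01-01", periods=5, freq="h"),
        "email": ["a@x.com", "b@x.com", None, "d@x.com", None],
    })

    report = DataForensics().analyze(df)
    print(report.summary())                 # artifact/fingerprint counts + hashes
    print(report.ghost_log.to_narrative())  # markdown-style list of inferred operations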
cascade/forensics/analyzer.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Main Analyzer
3
+
4
+ The data remembers. This module reads those memories.
5
+
6
+ Generates:
7
+ - GHOST LOG: Inferred sequence of operations
8
+ - SKELETON: Probable system architecture
9
+ - DNA: Technology fingerprints
10
+ - SOUL: Behavioral predictions
11
+ """
12
+
13
+ import hashlib
14
+ import json
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from typing import List, Dict, Any, Optional
18
+ from collections import OrderedDict
19
+
20
+ from cascade.forensics.artifacts import (
21
+ Artifact, ArtifactDetector,
22
+ TimestampArtifacts, IDPatternArtifacts, TextArtifacts,
23
+ NumericArtifacts, NullPatternArtifacts, SchemaArtifacts,
24
+ )
25
+ from cascade.forensics.fingerprints import TechFingerprinter, Fingerprint
26
+
27
+
28
+ @dataclass
29
+ class InferredOperation:
30
+ """A single inferred operation from the ghost log."""
31
+ sequence: int
32
+ operation: str
33
+ description: str
34
+ confidence: float
35
+ evidence: List[str] = field(default_factory=list)
36
+
37
+ def to_dict(self) -> Dict[str, Any]:
38
+ return {
39
+ "seq": self.sequence,
40
+ "op": self.operation,
41
+ "desc": self.description,
42
+ "confidence": self.confidence,
43
+ "evidence": self.evidence,
44
+ }
45
+
46
+
47
+ @dataclass
48
+ class GhostLog:
49
+ """
50
+ Inferred processing history - the ghost of the system.
51
+
52
+ This is a reconstruction of what PROBABLY happened
53
+ based on artifacts left in the data.
54
+ """
55
+ operations: List[InferredOperation] = field(default_factory=list)
56
+
57
+ # Provenance
58
+ analysis_timestamp: float = field(default_factory=time.time)
59
+ data_hash: str = ""
60
+ ghost_hash: str = ""
61
+
62
+ def add_operation(self, op: str, desc: str, confidence: float, evidence: List[str] = None):
63
+ """Add an inferred operation to the ghost log."""
64
+ self.operations.append(InferredOperation(
65
+ sequence=len(self.operations) + 1,
66
+ operation=op,
67
+ description=desc,
68
+ confidence=confidence,
69
+ evidence=evidence or [],
70
+ ))
71
+
72
+ def finalize(self) -> str:
73
+ """Compute hash of the ghost log for provenance."""
74
+ content = json.dumps([op.to_dict() for op in self.operations], sort_keys=True)
75
+ self.ghost_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
76
+ return self.ghost_hash
77
+
78
+ def to_dict(self) -> Dict[str, Any]:
79
+ return {
80
+ "operations": [op.to_dict() for op in self.operations],
81
+ "analysis_timestamp": self.analysis_timestamp,
82
+ "data_hash": self.data_hash,
83
+ "ghost_hash": self.ghost_hash,
84
+ }
85
+
86
+ def to_narrative(self) -> str:
87
+ """Generate human-readable narrative of inferred processing."""
88
+ if not self.operations:
89
+ return "No processing artifacts detected."
90
+
91
+ lines = ["## Ghost Log - Inferred Processing History\n"]
92
+ lines.append("*Based on artifacts left in the data, this is what probably happened:*\n")
93
+
94
+ for op in self.operations:
95
+ conf_str = "●" * int(op.confidence * 5) + "○" * (5 - int(op.confidence * 5))
96
+ lines.append(f"**{op.sequence}. {op.operation}** [{conf_str}]")
97
+ lines.append(f" {op.description}")
98
+ if op.evidence:
99
+ lines.append(f" *Evidence: {', '.join(op.evidence[:3])}*")
100
+ lines.append("")
101
+
102
+ return "\n".join(lines)
103
+
104
+
105
+ @dataclass
106
+ class ForensicsReport:
107
+ """Complete forensics analysis report."""
108
+
109
+ # Artifacts detected
110
+ artifacts: List[Artifact] = field(default_factory=list)
111
+
112
+ # Inferred processing
113
+ ghost_log: GhostLog = field(default_factory=GhostLog)
114
+
115
+ # Technology fingerprints
116
+ fingerprints: List[Fingerprint] = field(default_factory=list)
117
+
118
+ # Synthesized architecture
119
+ likely_stack: Dict[str, Any] = field(default_factory=dict)
120
+
121
+ # Security concerns
122
+ security_concerns: List[Dict[str, Any]] = field(default_factory=list)
123
+
124
+ # Metadata
125
+ analysis_timestamp: float = field(default_factory=time.time)
126
+ row_count: int = 0
127
+ column_count: int = 0
128
+ data_hash: str = ""
129
+
130
+ def to_dict(self) -> Dict[str, Any]:
131
+ return {
132
+ "artifacts": [a.to_dict() for a in self.artifacts],
133
+ "ghost_log": self.ghost_log.to_dict(),
134
+ "fingerprints": [f.to_dict() for f in self.fingerprints],
135
+ "likely_stack": self.likely_stack,
136
+ "security_concerns": self.security_concerns,
137
+ "metadata": {
138
+ "timestamp": self.analysis_timestamp,
139
+ "rows": self.row_count,
140
+ "columns": self.column_count,
141
+ "data_hash": self.data_hash,
142
+ }
143
+ }
144
+
145
+ def summary(self) -> Dict[str, Any]:
146
+ """Generate summary for display."""
147
+ return {
148
+ "artifacts_found": len(self.artifacts),
149
+ "operations_inferred": len(self.ghost_log.operations),
150
+ "technologies_identified": len(self.fingerprints),
151
+ "security_concerns": len(self.security_concerns),
152
+ "top_fingerprints": [f.technology for f in self.fingerprints[:5]],
153
+ "data_hash": self.data_hash,
154
+ "ghost_hash": self.ghost_log.ghost_hash,
155
+ }
156
+
157
+
158
+ class DataForensics:
159
+ """
160
+ Main forensics analyzer.
161
+
162
+ Usage:
163
+ forensics = DataForensics()
164
+ report = forensics.analyze(df)
165
+
166
+ print(report.ghost_log.to_narrative())
167
+ print(report.likely_stack)
168
+ """
169
+
170
+ def __init__(self):
171
+ self.detectors = [
172
+ TimestampArtifacts(),
173
+ IDPatternArtifacts(),
174
+ TextArtifacts(),
175
+ NumericArtifacts(),
176
+ NullPatternArtifacts(),
177
+ SchemaArtifacts(),
178
+ ]
179
+ self.fingerprinter = TechFingerprinter()
180
+
181
+ def analyze(self, df) -> ForensicsReport:
182
+ """
183
+ Analyze a dataframe for processing artifacts.
184
+
185
+ Args:
186
+ df: Pandas DataFrame to analyze
187
+
188
+ Returns:
189
+ ForensicsReport with all findings
190
+ """
191
+ report = ForensicsReport()
192
+ report.row_count = len(df)
193
+ report.column_count = len(df.columns)
194
+
195
+ # Compute data hash
196
+ try:
197
+ # Sample hash for large datasets
198
+ if len(df) > 10000:
199
+ sample = df.sample(10000, random_state=42)
200
+ else:
201
+ sample = df
202
+ content = sample.to_json()
203
+ report.data_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
204
+ except Exception:
205
+ report.data_hash = "unknown"
206
+
207
+ # Run all detectors
208
+ all_artifacts = []
209
+
210
+ for detector in self.detectors:
211
+ try:
212
+ # Run each detector exactly once through detect_all().
213
+ # Whole-frame detectors (null patterns, schema) override it,
214
+ # and for the per-column detectors the base-class detect_all
215
+ # already walks every column, so a second explicit per-column
216
+ # pass here would double-count their artifacts.
217
+ artifacts = detector.detect_all(df)
218
+ all_artifacts.extend(artifacts)
219
+
220
+
221
+ except Exception:
222
+ # Don't let one detector crash the whole analysis
223
+ pass
224
+
225
+ report.artifacts = all_artifacts
226
+
227
+ # Build ghost log from artifacts
228
+ report.ghost_log = self._build_ghost_log(all_artifacts, df)
229
+ report.ghost_log.data_hash = report.data_hash
230
+ report.ghost_log.finalize()
231
+
232
+ # Generate technology fingerprints
233
+ report.fingerprints = self.fingerprinter.analyze(all_artifacts)
234
+ report.likely_stack = self.fingerprinter.get_likely_stack()
235
+ report.security_concerns = self.fingerprinter.get_security_concerns()
236
+
237
+ return report
238
+
239
+ def _build_ghost_log(self, artifacts: List[Artifact], df) -> GhostLog:
240
+ """
241
+ Build inferred processing history from artifacts.
242
+
243
+ This is where we reconstruct the sequence of operations
244
+ that probably created this data.
245
+ """
246
+ ghost = GhostLog()
247
+
248
+ # Group artifacts by type for logical ordering
249
+ by_type = {}
250
+ for a in artifacts:
251
+ if a.artifact_type not in by_type:
252
+ by_type[a.artifact_type] = []
253
+ by_type[a.artifact_type].append(a)
254
+
255
+ # Infer operations in logical order
256
+
257
+ # 1. Data sourcing (schema artifacts come first)
258
+ if "framework_fingerprint" in by_type:
259
+ for a in by_type["framework_fingerprint"]:
260
+ ghost.add_operation(
261
+ "DATA_SOURCE",
262
+ f"Data originated from {a.details.get('framework', 'database')}: {a.evidence}",
263
+ a.confidence,
264
+ [a.evidence]
265
+ )
266
+
267
+ if "naming_convention" in by_type:
268
+ for a in by_type["naming_convention"]:
269
+ ghost.add_operation(
270
+ "SCHEMA_ORIGIN",
271
+ f"Schema follows {a.details.get('convention', 'unknown')} convention",
272
+ a.confidence,
273
+ [a.evidence]
274
+ )
275
+
276
+ # 2. Merging (if multiple sources detected)
277
+ if "mixed_conventions" in by_type or "id_prefix" in by_type:
278
+ ghost.add_operation(
279
+ "DATA_MERGE",
280
+ "Multiple data sources were merged together",
281
+ 0.75,
282
+ [a.evidence for a in by_type.get("mixed_conventions", []) + by_type.get("id_prefix", [])]
283
+ )
284
+
285
+ # 3. ID generation
286
+ if "uuid_version" in by_type:
287
+ for a in by_type["uuid_version"]:
288
+ ghost.add_operation(
289
+ "ID_GENERATION",
290
+ f"IDs generated using {a.details.get('meaning', 'UUID')}",
291
+ a.confidence,
292
+ [a.evidence]
293
+ )
294
+
295
+ if "hash_id" in by_type:
296
+ for a in by_type["hash_id"]:
297
+ ghost.add_operation(
298
+ "ID_GENERATION",
299
+ f"IDs are {a.details.get('probable_algorithm', 'hash')}-based (content-addressed)",
300
+ a.confidence,
301
+ [a.evidence]
302
+ )
303
+
304
+ # 4. Processing / Transformation
305
+ if "case_normalization" in by_type:
306
+ for a in by_type["case_normalization"]:
307
+ ghost.add_operation(
308
+ "TEXT_NORMALIZATION",
309
+ f"Text converted to {a.details.get('case', 'normalized')} case",
310
+ a.confidence,
311
+ [a.evidence]
312
+ )
313
+
314
+ if "whitespace_trimming" in by_type:
315
+ ghost.add_operation(
316
+ "TEXT_CLEANING",
317
+ "Whitespace trimmed from text fields",
318
+ 0.70,
319
+ [a.evidence for a in by_type["whitespace_trimming"]]
320
+ )
321
+
322
+ if "truncation" in by_type:
323
+ for a in by_type["truncation"]:
324
+ ghost.add_operation(
325
+ "FIELD_TRUNCATION",
326
+ f"Text truncated at {a.details.get('max_length', '?')} characters",
327
+ a.confidence,
328
+ [a.evidence]
329
+ )
330
+
331
+ if "numeric_rounding" in by_type:
332
+ for a in by_type["numeric_rounding"]:
333
+ ghost.add_operation(
334
+ "NUMERIC_ROUNDING",
335
+ f"Numbers rounded: {a.evidence}",
336
+ a.confidence,
337
+ [a.evidence]
338
+ )
339
+
340
+ # 5. Filtering / Deletion
341
+ if "sequential_id_gaps" in by_type:
342
+ for a in by_type["sequential_id_gaps"]:
343
+ gap_ratio = a.details.get('gap_ratio', 0)
344
+ ghost.add_operation(
345
+ "RECORD_FILTERING",
346
+ f"~{gap_ratio*100:.0f}% of records were filtered or deleted",
347
+ a.confidence,
348
+ [a.evidence]
349
+ )
350
+
351
+ if "hard_cutoff" in by_type:
352
+ for a in by_type["hard_cutoff"]:
353
+ ghost.add_operation(
354
+ "VALUE_CAPPING",
355
+ f"Values capped at {a.details.get('cutoff', '?')}",
356
+ a.confidence,
357
+ [a.evidence]
358
+ )
359
+
360
+ # 6. Batch processing patterns
361
+ if "timestamp_rounding" in by_type:
362
+ for a in by_type["timestamp_rounding"]:
363
+ ghost.add_operation(
364
+ "BATCH_PROCESSING",
365
+ f"Data processed in batches: {a.evidence}",
366
+ a.confidence,
367
+ [a.evidence]
368
+ )
369
+
370
+ if "regular_intervals" in by_type:
371
+ for a in by_type["regular_intervals"]:
372
+ ghost.add_operation(
373
+ "SCHEDULED_JOB",
374
+ f"Regular processing schedule detected: {a.details.get('interval_desc', 'unknown')}",
375
+ a.confidence,
376
+ [a.evidence]
377
+ )
378
+
379
+ if "temporal_clustering" in by_type:
380
+ ghost.add_operation(
381
+ "BURST_PROCESSING",
382
+ "Event-driven or burst batch processing detected",
383
+ 0.75,
384
+ [a.evidence for a in by_type["temporal_clustering"]]
385
+ )
386
+
387
+ # 7. Data quality issues
388
+ if "encoding_artifact" in by_type:
389
+ for a in by_type["encoding_artifact"]:
390
+ ghost.add_operation(
391
+ "ENCODING_ERROR",
392
+ f"Character encoding conversion failed: {a.evidence}",
393
+ a.confidence,
394
+ [a.evidence]
395
+ )
396
+
397
+ if "sentinel_value" in by_type:
398
+ for a in by_type["sentinel_value"]:
399
+ ghost.add_operation(
400
+ "NULL_HANDLING",
401
+ f"NULLs represented as sentinel value {a.details.get('sentinel', '?')}",
402
+ a.confidence,
403
+ [a.evidence]
404
+ )
405
+
406
+ if "high_null_rate" in by_type:
407
+ for a in by_type["high_null_rate"]:
408
+ ghost.add_operation(
409
+ "OPTIONAL_FIELD",
410
+ f"Column {a.column} is optional or had ETL issues ({a.details.get('null_rate', 0)*100:.0f}% null)",
411
+ a.confidence,
412
+ [a.evidence]
413
+ )
414
+
415
+ # 8. Export (often the last step)
416
+ if any("PANDAS" in a.inferred_operation for a in artifacts):
417
+ ghost.add_operation(
418
+ "DATA_EXPORT",
419
+ "Data exported via Pandas to CSV",
420
+ 0.90,
421
+ ["Unnamed column artifact"]
422
+ )
423
+
424
+ return ghost
425
+
426
+ def analyze_file(self, filepath: str) -> ForensicsReport:
427
+ """
428
+ Analyze a data file.
429
+
430
+ Supports: CSV, JSON, JSONL, Parquet, Excel
431
+ """
432
+ import pandas as pd
433
+ from pathlib import Path
434
+
435
+ path = Path(filepath)
436
+ suffix = path.suffix.lower()
437
+
438
+ if suffix == '.csv':
439
+ df = pd.read_csv(filepath)
440
+ elif suffix == '.json':
441
+ df = pd.read_json(filepath)
442
+ elif suffix == '.jsonl':
443
+ df = pd.read_json(filepath, lines=True)
444
+ elif suffix == '.parquet':
445
+ df = pd.read_parquet(filepath)
446
+ elif suffix in ['.xlsx', '.xls']:
447
+ df = pd.read_excel(filepath)
448
+ else:
449
+ # Try CSV as default
450
+ df = pd.read_csv(filepath)
451
+
452
+ return self.analyze(df)
453
+
454
+
455
+ def analyze_dataframe(df) -> ForensicsReport:
456
+ """Convenience function to analyze a dataframe."""
457
+ forensics = DataForensics()
458
+ return forensics.analyze(df)
459
+
460
+
461
+ def analyze_file(filepath: str) -> ForensicsReport:
462
+ """Convenience function to analyze a file."""
463
+ forensics = DataForensics()
464
+ return forensics.analyze_file(filepath)
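A sketch of the module-level analyze_file convenience defined above; the input file name is hypothetical, and the JSON dump simply shows that the report serializes through to_dict().

    import json
    from cascade.forensics.analyzer import analyze_file

    # Hypothetical input file; CSV, JSON, JSONL, Parquet and Excel are handled.
    report = analyze_file("exported_users.csv")

    print(report.ghost_log.to_narrative())
    print(report.likely_stack)  # synthesized from the technology fingerprints
    with open("forensics_report.json", "w") as f:
        json.dump(report.to_dict(), f, indent=2, default=str)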
cascade/forensics/artifacts.py ADDED
@@ -0,0 +1,1063 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Artifact Detectors
3
+
4
+ Each detector looks for specific patterns in data that reveal
5
+ how it was processed. The data remembers. We read.
6
+ """
7
+
8
+ import re
9
+ import hashlib
10
+ from dataclasses import dataclass, field
11
+ from typing import List, Dict, Any, Optional, Set, Tuple
12
+ from datetime import datetime
13
+ from collections import Counter
14
+ import statistics
15
+
16
+
17
+ @dataclass
18
+ class Artifact:
19
+ """A single detected artifact - evidence of processing."""
20
+ artifact_type: str
21
+ column: str
22
+ evidence: str
23
+ confidence: float # 0.0 to 1.0
24
+ inferred_operation: str
25
+ details: Dict[str, Any] = field(default_factory=dict)
26
+
27
+ def to_dict(self) -> Dict[str, Any]:
28
+ return {
29
+ "type": self.artifact_type,
30
+ "column": self.column,
31
+ "evidence": self.evidence,
32
+ "confidence": self.confidence,
33
+ "inferred_op": self.inferred_operation,
34
+ "details": self.details,
35
+ }
36
+
37
+
38
+ class ArtifactDetector:
39
+ """Base class for artifact detection."""
40
+
41
+ name: str = "base"
42
+
43
+ def detect(self, df, column: str) -> List[Artifact]:
44
+ """Detect artifacts in a column. Override in subclasses."""
45
+ return []
46
+
47
+ def detect_all(self, df) -> List[Artifact]:
48
+ """Detect artifacts across all applicable columns."""
49
+ artifacts = []
50
+ for col in df.columns:
51
+ artifacts.extend(self.detect(df, col))
52
+ return artifacts
53
+
54
+
55
+ class TimestampArtifacts(ArtifactDetector):
56
+ """
57
+ Detect timestamp patterns that reveal processing behavior.
58
+
59
+ Artifacts detected:
60
+ - Rounding to minute/hour/day (batch processing intervals)
61
+ - Regular intervals (scheduled jobs)
62
+ - Temporal clustering (burst processing)
63
+ - Timezone artifacts
64
+ - Future/past anomalies
65
+ """
66
+
67
+ name = "timestamp"
68
+
69
+ def detect(self, df, column: str) -> List[Artifact]:
70
+ artifacts = []
71
+
72
+ # Check if column looks like timestamps
73
+ if not self._is_timestamp_column(df, column):
74
+ return artifacts
75
+
76
+ try:
77
+ timestamps = self._parse_timestamps(df, column)
78
+ if len(timestamps) < 2:
79
+ return artifacts
80
+
81
+ # Check for rounding patterns
82
+ rounding = self._detect_rounding(timestamps)
83
+ if rounding:
84
+ artifacts.append(rounding)
85
+
86
+ # Check for regular intervals
87
+ intervals = self._detect_intervals(timestamps)
88
+ if intervals:
89
+ artifacts.append(intervals)
90
+
91
+ # Check for clustering
92
+ clustering = self._detect_clustering(timestamps)
93
+ if clustering:
94
+ artifacts.append(clustering)
95
+
96
+ # Check for timezone issues
97
+ tz_artifacts = self._detect_timezone_artifacts(timestamps)
98
+ artifacts.extend(tz_artifacts)
99
+
100
+ except Exception:
101
+ pass
102
+
103
+ return artifacts
104
+
105
+ def _is_timestamp_column(self, df, column: str) -> bool:
106
+ """Heuristic to detect timestamp columns."""
107
+ col_lower = column.lower()
108
+ timestamp_hints = ['time', 'date', 'created', 'updated', 'modified', 'timestamp', '_at', '_on']
109
+ if any(hint in col_lower for hint in timestamp_hints):
110
+ return True
111
+
112
+ # Check data type
113
+ dtype = str(df[column].dtype)
114
+ if 'datetime' in dtype or 'time' in dtype:
115
+ return True
116
+
117
+ # Sample and check format
118
+ sample = df[column].dropna().head(5).astype(str).tolist()
119
+ date_patterns = [
120
+ r'\d{4}-\d{2}-\d{2}',
121
+ r'\d{2}/\d{2}/\d{4}',
122
+ r'\d{10,13}', # Unix timestamp
123
+ ]
124
+ for val in sample:
125
+ for pattern in date_patterns:
126
+ if re.search(pattern, val):
127
+ return True
128
+
129
+ return False
130
+
131
+ def _parse_timestamps(self, df, column: str) -> List[datetime]:
132
+ """Parse column to datetime objects."""
133
+ import pandas as pd
134
+
135
+ try:
136
+ # Try pandas datetime conversion
137
+ parsed = pd.to_datetime(df[column], errors='coerce')
138
+ return [ts.to_pydatetime() for ts in parsed.dropna()]
139
+ except Exception:
140
+ return []
141
+
142
+ def _detect_rounding(self, timestamps: List[datetime]) -> Optional[Artifact]:
143
+ """Detect if timestamps are rounded to specific intervals."""
144
+ if len(timestamps) < 10:
145
+ return None
146
+
147
+ # Check seconds
148
+ seconds = [ts.second for ts in timestamps]
149
+ unique_seconds = set(seconds)
150
+
151
+ # All zeros = minute rounding
152
+ if unique_seconds == {0}:
153
+ # Check minutes
154
+ minutes = [ts.minute for ts in timestamps]
155
+ unique_minutes = set(minutes)
156
+
157
+ if unique_minutes == {0}:
158
+ return Artifact(
159
+ artifact_type="timestamp_rounding",
160
+ column="timestamps",
161
+ evidence=f"All timestamps rounded to hour (0 minutes, 0 seconds)",
162
+ confidence=0.95,
163
+ inferred_operation="BATCH_HOURLY",
164
+ details={"interval": "hour", "sample_size": len(timestamps)}
165
+ )
166
+ elif all(m % 15 == 0 for m in minutes):
167
+ return Artifact(
168
+ artifact_type="timestamp_rounding",
169
+ column="timestamps",
170
+ evidence=f"Timestamps rounded to 15-minute intervals",
171
+ confidence=0.90,
172
+ inferred_operation="BATCH_15MIN",
173
+ details={"interval": "15min", "unique_minutes": list(unique_minutes)}
174
+ )
175
+ elif all(m % 5 == 0 for m in minutes):
176
+ return Artifact(
177
+ artifact_type="timestamp_rounding",
178
+ column="timestamps",
179
+ evidence=f"Timestamps rounded to 5-minute intervals",
180
+ confidence=0.85,
181
+ inferred_operation="BATCH_5MIN",
182
+ details={"interval": "5min"}
183
+ )
184
+ else:
185
+ return Artifact(
186
+ artifact_type="timestamp_rounding",
187
+ column="timestamps",
188
+ evidence=f"Timestamps rounded to minute (0 seconds)",
189
+ confidence=0.85,
190
+ inferred_operation="BATCH_MINUTE",
191
+ details={"interval": "minute"}
192
+ )
193
+
194
+ # Check if seconds cluster on specific values
195
+ second_counts = Counter(seconds)
196
+ most_common = second_counts.most_common(1)[0]
197
+ if most_common[1] > len(timestamps) * 0.8:
198
+ return Artifact(
199
+ artifact_type="timestamp_rounding",
200
+ column="timestamps",
201
+ evidence=f"{most_common[1]/len(timestamps)*100:.0f}% of timestamps have second={most_common[0]}",
202
+ confidence=0.70,
203
+ inferred_operation="SYSTEMATIC_TIMESTAMP_ASSIGNMENT",
204
+ details={"dominant_second": most_common[0], "percentage": most_common[1]/len(timestamps)}
205
+ )
206
+
207
+ return None
208
+
209
+ def _detect_intervals(self, timestamps: List[datetime]) -> Optional[Artifact]:
210
+ """Detect regular time intervals suggesting scheduled jobs."""
211
+ if len(timestamps) < 10:
212
+ return None
213
+
214
+ sorted_ts = sorted(timestamps)
215
+ deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]
216
+
217
+ if not deltas:
218
+ return None
219
+
220
+ # Check for consistent intervals
221
+ median_delta = statistics.median(deltas)
222
+ if median_delta == 0:
223
+ return None
224
+
225
+ # Count how many deltas are close to median
226
+ tolerance = median_delta * 0.1 # 10% tolerance
227
+ consistent = sum(1 for d in deltas if abs(d - median_delta) < tolerance)
228
+ consistency_ratio = consistent / len(deltas)
229
+
230
+ if consistency_ratio > 0.7:
231
+ # Describe the interval
232
+ interval_desc = self._describe_interval(median_delta)
233
+ return Artifact(
234
+ artifact_type="regular_intervals",
235
+ column="timestamps",
236
+ evidence=f"{consistency_ratio*100:.0f}% of records have ~{interval_desc} intervals",
237
+ confidence=min(0.95, consistency_ratio),
238
+ inferred_operation=f"SCHEDULED_JOB_{interval_desc.upper().replace(' ', '_')}",
239
+ details={
240
+ "median_seconds": median_delta,
241
+ "interval_desc": interval_desc,
242
+ "consistency": consistency_ratio
243
+ }
244
+ )
245
+
246
+ return None
247
+
248
+ def _describe_interval(self, seconds: float) -> str:
249
+ """Human-readable interval description."""
250
+ if seconds < 60:
251
+ return f"{seconds:.0f}s"
252
+ elif seconds < 3600:
253
+ return f"{seconds/60:.0f}min"
254
+ elif seconds < 86400:
255
+ return f"{seconds/3600:.1f}hr"
256
+ else:
257
+ return f"{seconds/86400:.1f}day"
258
+
259
+ def _detect_clustering(self, timestamps: List[datetime]) -> Optional[Artifact]:
260
+ """Detect temporal clustering (burst processing)."""
261
+ if len(timestamps) < 20:
262
+ return None
263
+
264
+ sorted_ts = sorted(timestamps)
265
+
266
+ # Look for bursts: many records in short time, then gaps
267
+ deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]
268
+
269
+ if not deltas:
270
+ return None
271
+
272
+ median_delta = statistics.median(deltas)
273
+ if median_delta == 0:
274
+ return None
275
+
276
+ # Count "burst" deltas (much smaller than median) vs "gap" deltas (much larger)
277
+ bursts = sum(1 for d in deltas if d < median_delta * 0.1)
278
+ gaps = sum(1 for d in deltas if d > median_delta * 5)
279
+
280
+ if bursts > len(deltas) * 0.3 and gaps > len(deltas) * 0.05:
281
+ return Artifact(
282
+ artifact_type="temporal_clustering",
283
+ column="timestamps",
284
+ evidence=f"Burst pattern: {bursts} rapid records, {gaps} long gaps",
285
+ confidence=0.75,
286
+ inferred_operation="BATCH_BURST_PROCESSING",
287
+ details={
288
+ "burst_count": bursts,
289
+ "gap_count": gaps,
290
+ "median_delta_seconds": median_delta
291
+ }
292
+ )
293
+
294
+ return None
295
+
296
+ def _detect_timezone_artifacts(self, timestamps: List[datetime]) -> List[Artifact]:
297
+ """Detect timezone-related artifacts."""
298
+ artifacts = []
299
+
300
+ # Check for hour distribution anomalies (e.g., no records 0-7 UTC = US business hours)
301
+ hours = [ts.hour for ts in timestamps]
302
+ hour_counts = Counter(hours)
303
+
304
+ # Check for gaps suggesting business hours in a specific timezone
305
+ zero_hours = [h for h in range(24) if hour_counts.get(h, 0) == 0]
306
+
307
+ if len(zero_hours) >= 6 and len(zero_hours) <= 12:
308
+ # Contiguous gap?
309
+ zero_hours_sorted = sorted(zero_hours)
310
+ if zero_hours_sorted[-1] - zero_hours_sorted[0] == len(zero_hours) - 1:
311
+ artifacts.append(Artifact(
312
+ artifact_type="business_hours",
313
+ column="timestamps",
314
+ evidence=f"No records during hours {min(zero_hours)}-{max(zero_hours)} UTC",
315
+ confidence=0.70,
316
+ inferred_operation="BUSINESS_HOURS_ONLY",
317
+ details={"quiet_hours": zero_hours}
318
+ ))
319
+
320
+ return artifacts
321
+
322
+
323
+ class IDPatternArtifacts(ArtifactDetector):
324
+ """
325
+ Detect ID patterns that reveal data lineage.
326
+
327
+ Artifacts detected:
328
+ - Sequential IDs with gaps (deletions/filtering)
329
+ - UUID versions (generation method)
330
+ - Prefixes (source identification)
331
+ - Hash patterns (deterministic generation)
332
+ """
333
+
334
+ name = "id_patterns"
335
+
336
+ def detect(self, df, column: str) -> List[Artifact]:
337
+ artifacts = []
338
+
339
+ if not self._is_id_column(df, column):
340
+ return artifacts
341
+
342
+ try:
343
+ values = df[column].dropna().astype(str).tolist()
344
+ if len(values) < 5:
345
+ return artifacts
346
+
347
+ # Check for sequential integers with gaps
348
+ gaps = self._detect_sequential_gaps(values)
349
+ if gaps:
350
+ artifacts.append(gaps)
351
+
352
+ # Check for UUID patterns
353
+ uuid_artifact = self._detect_uuid_patterns(values)
354
+ if uuid_artifact:
355
+ artifacts.append(uuid_artifact)
356
+
357
+ # Check for prefixes
358
+ prefix = self._detect_prefixes(values)
359
+ if prefix:
360
+ artifacts.append(prefix)
361
+
362
+ # Check for hash patterns
363
+ hash_artifact = self._detect_hash_patterns(values)
364
+ if hash_artifact:
365
+ artifacts.append(hash_artifact)
366
+
367
+ except Exception:
368
+ pass
369
+
370
+ return artifacts
371
+
372
+ def _is_id_column(self, df, column: str) -> bool:
373
+ """Heuristic to detect ID columns."""
374
+ col_lower = column.lower()
375
+ id_hints = ['id', 'key', 'uuid', 'guid', 'pk', '_id', 'identifier']
376
+ return any(hint in col_lower for hint in id_hints)
377
+
378
+ def _detect_sequential_gaps(self, values: List[str]) -> Optional[Artifact]:
379
+ """Detect sequential IDs with gaps indicating deletions."""
380
+ # Try to parse as integers
381
+ try:
382
+ ints = sorted([int(v) for v in values if v.isdigit()])
383
+ if len(ints) < 10:
384
+ return None
385
+
386
+ # Check for gaps
387
+ expected_count = ints[-1] - ints[0] + 1
388
+ actual_count = len(set(ints))
389
+ gap_count = expected_count - actual_count
390
+ gap_ratio = gap_count / expected_count if expected_count > 0 else 0
391
+
392
+ if gap_ratio > 0.05: # More than 5% missing
393
+ return Artifact(
394
+ artifact_type="sequential_id_gaps",
395
+ column=values[0] if values else "id",
396
+ evidence=f"Sequential IDs with {gap_ratio*100:.1f}% gaps ({gap_count} missing)",
397
+ confidence=0.85,
398
+ inferred_operation="FILTERING_OR_DELETION",
399
+ details={
400
+ "min_id": ints[0],
401
+ "max_id": ints[-1],
402
+ "expected": expected_count,
403
+ "actual": actual_count,
404
+ "gap_ratio": gap_ratio
405
+ }
406
+ )
407
+ except Exception:
408
+ pass
409
+
410
+ return None
411
+
412
+ def _detect_uuid_patterns(self, values: List[str]) -> Optional[Artifact]:
413
+ """Detect UUID version from patterns."""
414
+ uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-([0-9a-f])[0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
415
+
416
+ versions = []
417
+ for v in values[:100]: # Sample
418
+ match = uuid_pattern.match(v)
419
+ if match:
420
+ versions.append(match.group(1))
421
+
422
+ if len(versions) < len(values[:100]) * 0.5:
423
+ return None
424
+
425
+ version_counts = Counter(versions)
426
+ dominant = version_counts.most_common(1)[0]
427
+
428
+ version_meanings = {
429
+ '1': 'TIME_BASED_MAC', # Reveals generation time + machine
430
+ '2': 'DCE_SECURITY',
431
+ '3': 'MD5_HASH', # Deterministic from input
432
+ '4': 'RANDOM', # Crypto random
433
+ '5': 'SHA1_HASH', # Deterministic from input
434
+ '6': 'SORTABLE_TIME', # Modern time-sortable
435
+ '7': 'UNIX_TIME_RANDOM', # Time-ordered with randomness
436
+ }
437
+
438
+ return Artifact(
439
+ artifact_type="uuid_version",
440
+ column="id",
441
+ evidence=f"UUIDs are version {dominant[0]} ({version_meanings.get(dominant[0], 'UNKNOWN')})",
442
+ confidence=0.90,
443
+ inferred_operation=f"UUID_GENERATION_V{dominant[0]}",
444
+ details={
445
+ "version": dominant[0],
446
+ "meaning": version_meanings.get(dominant[0], 'unknown'),
447
+ "sample_count": len(versions)
448
+ }
449
+ )
450
+
451
+ def _detect_prefixes(self, values: List[str]) -> Optional[Artifact]:
452
+ """Detect common prefixes indicating source systems."""
453
+ if len(values) < 10:
454
+ return None
455
+
456
+ # Find common prefix
457
+ prefix_len = 0
458
+ for i in range(1, min(20, min(len(v) for v in values[:100]))):
459
+ prefixes = set(v[:i] for v in values[:100])
460
+ if len(prefixes) <= 3: # Allow up to 3 different prefixes
461
+ prefix_len = i
462
+ else:
463
+ break
464
+
465
+ if prefix_len >= 2:
466
+ prefixes = Counter(v[:prefix_len] for v in values)
467
+ top_prefixes = prefixes.most_common(3)
468
+
469
+ return Artifact(
470
+ artifact_type="id_prefix",
471
+ column="id",
472
+ evidence=f"IDs have systematic prefix: {top_prefixes}",
473
+ confidence=0.80,
474
+ inferred_operation="MULTI_SOURCE_MERGE" if len(top_prefixes) > 1 else "SOURCE_IDENTIFICATION",
475
+ details={
476
+ "prefixes": dict(top_prefixes),
477
+ "prefix_length": prefix_len
478
+ }
479
+ )
480
+
481
+ return None
482
+
483
+ def _detect_hash_patterns(self, values: List[str]) -> Optional[Artifact]:
484
+ """Detect if IDs look like hashes."""
485
+ hex_pattern = re.compile(r'^[0-9a-f]+$', re.I)
486
+
487
+ hex_lengths = []
488
+ for v in values[:100]:
489
+ if hex_pattern.match(v):
490
+ hex_lengths.append(len(v))
491
+
492
+ if len(hex_lengths) < len(values[:100]) * 0.8:
493
+ return None
494
+
495
+ # Check for consistent hash lengths
496
+ length_counts = Counter(hex_lengths)
497
+ dominant = length_counts.most_common(1)[0]
498
+
499
+ hash_types = {
500
+ 32: 'MD5',
501
+ 40: 'SHA1',
502
+ 64: 'SHA256',
503
+ 128: 'SHA512',
504
+ 16: 'SHORT_HASH',
505
+ }
506
+
507
+ if dominant[1] > len(hex_lengths) * 0.9:
508
+ hash_type = hash_types.get(dominant[0], f'{dominant[0]}-char hash')
509
+ return Artifact(
510
+ artifact_type="hash_id",
511
+ column="id",
512
+ evidence=f"IDs are {hash_type} hashes ({dominant[0]} hex chars)",
513
+ confidence=0.85,
514
+ inferred_operation=f"DETERMINISTIC_ID_GENERATION_{hash_type}",
515
+ details={
516
+ "hash_length": dominant[0],
517
+ "probable_algorithm": hash_type
518
+ }
519
+ )
520
+
521
+ return None
522
+
523
+
524
+ class TextArtifacts(ArtifactDetector):
525
+ """
526
+ Detect text processing artifacts.
527
+
528
+ Artifacts detected:
529
+ - Truncation (field length limits)
530
+ - Encoding issues (charset conversion)
531
+ - Case normalization
532
+ - Whitespace patterns
533
+ - Sanitization patterns
534
+ """
535
+
536
+ name = "text"
537
+
538
+ def detect(self, df, column: str) -> List[Artifact]:
539
+ artifacts = []
540
+
541
+ dtype = str(df[column].dtype)
542
+ if 'object' not in dtype and 'str' not in dtype:
543
+ return artifacts
544
+
545
+ try:
546
+ values = df[column].dropna().astype(str).tolist()
547
+ if len(values) < 5:
548
+ return artifacts
549
+
550
+ # Truncation
551
+ trunc = self._detect_truncation(values)
552
+ if trunc:
553
+ artifacts.append(trunc)
554
+
555
+ # Encoding issues
556
+ encoding = self._detect_encoding_artifacts(values)
557
+ if encoding:
558
+ artifacts.append(encoding)
559
+
560
+ # Case patterns
561
+ case = self._detect_case_patterns(values, column)
562
+ if case:
563
+ artifacts.append(case)
564
+
565
+ # Whitespace
566
+ ws = self._detect_whitespace_patterns(values)
567
+ if ws:
568
+ artifacts.append(ws)
569
+
570
+ except Exception:
571
+ pass
572
+
573
+ return artifacts
574
+
575
+ def _detect_truncation(self, values: List[str]) -> Optional[Artifact]:
576
+ """Detect truncation at specific lengths."""
577
+ lengths = [len(v) for v in values]
578
+ max_len = max(lengths)
579
+
580
+ # Count values at max length
581
+ at_max = sum(1 for l in lengths if l == max_len)
582
+
583
+ # If many values hit the max, likely truncation
584
+ if at_max > len(values) * 0.1 and max_len > 10:
585
+ # Check if values at max look truncated (end mid-word, etc.)
586
+ max_values = [v for v in values if len(v) == max_len]
587
+ truncated_looking = sum(1 for v in max_values if not v.endswith(('.', '!', '?', ' ')))
588
+
589
+ if truncated_looking > len(max_values) * 0.5:
590
+ return Artifact(
591
+ artifact_type="truncation",
592
+ column=str(values[0])[:20] if values else "text",
593
+ evidence=f"{at_max} values ({at_max/len(values)*100:.1f}%) truncated at {max_len} chars",
594
+ confidence=0.80,
595
+ inferred_operation=f"FIELD_LENGTH_LIMIT_{max_len}",
596
+ details={
597
+ "max_length": max_len,
598
+ "truncated_count": at_max,
599
+ "truncated_ratio": at_max / len(values)
600
+ }
601
+ )
602
+
603
+ return None
604
+
605
+ def _detect_encoding_artifacts(self, values: List[str]) -> Optional[Artifact]:
606
+ """Detect encoding/charset conversion issues."""
607
+ # Common mojibake patterns
608
+ mojibake_patterns = [
609
+ r'Ã©', # é misencoded
610
+ r'Ã¨', # è
611
+ r'Ã ', # à
612
+ r'â€™', # ' smart quote
613
+ r'â€\u201d', # em dash
614
+ r'Ã¶', # ö
615
+ r'Ã¼', # ü
616
+ r'ï»¿', # UTF-8 BOM leaked into text
617
+ r'\\x[0-9a-f]{2}', # Raw hex escapes
618
+ r'&amp;|&lt;|&gt;', # HTML entities
619
+ ]
620
+
621
+ issue_count = 0
622
+ patterns_found = set()
623
+
624
+ for v in values[:500]: # Sample
625
+ for pattern in mojibake_patterns:
626
+ if re.search(pattern, v):
627
+ issue_count += 1
628
+ patterns_found.add(pattern)
629
+ break
630
+
631
+ if issue_count > 5:
632
+ return Artifact(
633
+ artifact_type="encoding_artifact",
634
+ column="text",
635
+ evidence=f"{issue_count} values have encoding issues (patterns: {patterns_found})",
636
+ confidence=0.85,
637
+ inferred_operation="CHARSET_CONVERSION_ERROR",
638
+ details={
639
+ "issue_count": issue_count,
640
+ "patterns": list(patterns_found)
641
+ }
642
+ )
643
+
644
+ return None
645
+
646
+ def _detect_case_patterns(self, values: List[str], column: str) -> Optional[Artifact]:
647
+ """Detect case normalization."""
648
+ # Skip obviously non-text columns
649
+ sample = values[:100]
650
+
651
+ all_lower = all(v == v.lower() for v in sample if v.strip())
652
+ all_upper = all(v == v.upper() for v in sample if v.strip())
653
+
654
+ if all_lower:
655
+ return Artifact(
656
+ artifact_type="case_normalization",
657
+ column=column,
658
+ evidence="All values are lowercase",
659
+ confidence=0.90,
660
+ inferred_operation="LOWERCASE_NORMALIZATION",
661
+ details={"case": "lower"}
662
+ )
663
+ elif all_upper:
664
+ return Artifact(
665
+ artifact_type="case_normalization",
666
+ column=column,
667
+ evidence="All values are UPPERCASE",
668
+ confidence=0.90,
669
+ inferred_operation="UPPERCASE_NORMALIZATION",
670
+ details={"case": "upper"}
671
+ )
672
+
673
+ return None
674
+
675
+ def _detect_whitespace_patterns(self, values: List[str]) -> Optional[Artifact]:
676
+ """Detect whitespace handling patterns."""
677
+ # Check for leading/trailing whitespace
678
+ has_leading = sum(1 for v in values if v and v[0] == ' ')
679
+ has_trailing = sum(1 for v in values if v and v[-1] == ' ')
680
+
681
+ # No whitespace at all = trimmed
682
+ if has_leading == 0 and has_trailing == 0:
683
+ # Verify there's text that COULD have whitespace
684
+ has_spaces = sum(1 for v in values if ' ' in v.strip())
685
+ if has_spaces > len(values) * 0.3:
686
+ return Artifact(
687
+ artifact_type="whitespace_trimming",
688
+ column="text",
689
+ evidence="No leading/trailing whitespace (data was trimmed)",
690
+ confidence=0.70,
691
+ inferred_operation="WHITESPACE_TRIM",
692
+ details={"trimmed": True}
693
+ )
694
+
695
+ return None
696
+
697
+
698
+ class NumericArtifacts(ArtifactDetector):
699
+ """
700
+ Detect numeric processing artifacts.
701
+
702
+ Artifacts detected:
703
+ - Rounding patterns (precision limits)
704
+ - Outlier presence/absence (filtering)
705
+ - Distribution anomalies (sampling)
706
+ - Sentinel values (nulls represented as -1, 0, 9999)
707
+ """
708
+
709
+ name = "numeric"
710
+
711
+ def detect(self, df, column: str) -> List[Artifact]:
712
+ artifacts = []
713
+
714
+ # Check if numeric
715
+ try:
716
+ values = df[column].dropna()
717
+ if len(values) < 10:
718
+ return artifacts
719
+
720
+ # Try to get numeric values
721
+ numeric_values = values.astype(float).tolist()
722
+
723
+ # Rounding
724
+ rounding = self._detect_rounding(numeric_values, column)
725
+ if rounding:
726
+ artifacts.append(rounding)
727
+
728
+ # Sentinel values
729
+ sentinel = self._detect_sentinel_values(numeric_values, column)
730
+ if sentinel:
731
+ artifacts.append(sentinel)
732
+
733
+ # Distribution
734
+ dist = self._detect_distribution_artifacts(numeric_values, column)
735
+ if dist:
736
+ artifacts.append(dist)
737
+
738
+ except (ValueError, TypeError):
739
+ pass
740
+
741
+ return artifacts
742
+
743
+ def _detect_rounding(self, values: List[float], column: str) -> Optional[Artifact]:
744
+ """Detect systematic rounding."""
745
+ # Check decimal places
746
+ decimal_places = []
747
+ for v in values[:500]:
748
+ if v != int(v):
749
+ str_v = f"{v:.10f}".rstrip('0')
750
+ if '.' in str_v:
751
+ decimal_places.append(len(str_v.split('.')[1]))
752
+
753
+ if not decimal_places:
754
+ # All integers - check for rounding to 10, 100, etc.
755
+ int_values = [int(v) for v in values]
756
+
757
+ divisible_by_100 = sum(1 for v in int_values if v % 100 == 0)
758
+ divisible_by_10 = sum(1 for v in int_values if v % 10 == 0)
759
+
760
+ if divisible_by_100 > len(int_values) * 0.9:
761
+ return Artifact(
762
+ artifact_type="numeric_rounding",
763
+ column=column,
764
+ evidence="Values rounded to nearest 100",
765
+ confidence=0.85,
766
+ inferred_operation="ROUND_TO_100",
767
+ details={"rounding": 100}
768
+ )
769
+ elif divisible_by_10 > len(int_values) * 0.9:
770
+ return Artifact(
771
+ artifact_type="numeric_rounding",
772
+ column=column,
773
+ evidence="Values rounded to nearest 10",
774
+ confidence=0.80,
775
+ inferred_operation="ROUND_TO_10",
776
+ details={"rounding": 10}
777
+ )
778
+ else:
779
+ # Check for consistent decimal places
780
+ max_decimals = max(decimal_places)
781
+ at_max = sum(1 for d in decimal_places if d == max_decimals)
782
+
783
+ if at_max < len(decimal_places) * 0.3 and max_decimals <= 2:
784
+ return Artifact(
785
+ artifact_type="numeric_rounding",
786
+ column=column,
787
+ evidence=f"Values appear rounded to {max_decimals} decimal places",
788
+ confidence=0.75,
789
+ inferred_operation=f"ROUND_TO_{max_decimals}_DECIMALS",
790
+ details={"decimal_places": max_decimals}
791
+ )
792
+
793
+ return None
794
+
795
+ def _detect_sentinel_values(self, values: List[float], column: str) -> Optional[Artifact]:
796
+ """Detect sentinel values representing nulls."""
797
+ sentinels = [-1, -999, -9999, 0, 9999, 99999]
798
+
799
+ value_counts = Counter(values)
800
+
801
+ for sentinel in sentinels:
802
+ if sentinel in value_counts:
803
+ count = value_counts[sentinel]
804
+ if count > len(values) * 0.01: # More than 1%
805
+ return Artifact(
806
+ artifact_type="sentinel_value",
807
+ column=column,
808
+ evidence=f"{count} occurrences of {sentinel} (likely NULL sentinel)",
809
+ confidence=0.70,
810
+ inferred_operation=f"NULL_AS_{int(sentinel)}",
811
+ details={
812
+ "sentinel": sentinel,
813
+ "count": count,
814
+ "percentage": count / len(values) * 100
815
+ }
816
+ )
817
+
818
+ return None
819
+
820
+ def _detect_distribution_artifacts(self, values: List[float], column: str) -> Optional[Artifact]:
821
+ """Detect distribution anomalies suggesting filtering/sampling."""
822
+ if len(values) < 100:
823
+ return None
824
+
825
+ # Check for hard cutoffs
826
+ sorted_vals = sorted(values)
827
+ min_val, max_val = sorted_vals[0], sorted_vals[-1]
828
+
829
+ # Round number cutoffs suggest filtering
830
+ if max_val == int(max_val) and max_val % 10 == 0:
831
+ # Check if there's a cluster at the max
832
+ at_max = sum(1 for v in values if v == max_val)
833
+ if at_max > len(values) * 0.05:
834
+ return Artifact(
835
+ artifact_type="hard_cutoff",
836
+ column=column,
837
+ evidence=f"Hard cutoff at {max_val} ({at_max} values at limit)",
838
+ confidence=0.75,
839
+ inferred_operation=f"CAP_AT_{int(max_val)}",
840
+ details={
841
+ "cutoff": max_val,
842
+ "count_at_cutoff": at_max
843
+ }
844
+ )
845
+
846
+ return None
847
+
848
+
849
+ class NullPatternArtifacts(ArtifactDetector):
850
+ """
851
+ Detect null/missing value patterns.
852
+
853
+ Artifacts detected:
854
+ - Systematic nulls (default handling)
855
+ - Null correlations (conditional logic)
856
+ - Null rate anomalies (ETL errors)
857
+ """
858
+
859
+ name = "null_patterns"
860
+
861
+ def detect_all(self, df) -> List[Artifact]:
862
+ """Analyze null patterns across all columns."""
863
+ artifacts = []
864
+
865
+ # Overall null rates per column
866
+ null_rates = {}
867
+ for col in df.columns:
868
+ null_rate = df[col].isna().mean()
869
+ null_rates[col] = null_rate
870
+
871
+ # Detect anomalous null rates
872
+ rates = list(null_rates.values())
873
+ if len(rates) > 3:
874
+ mean_rate = statistics.mean(rates)
875
+
876
+ for col, rate in null_rates.items():
877
+ if rate > 0.5 and rate > mean_rate * 3:
878
+ artifacts.append(Artifact(
879
+ artifact_type="high_null_rate",
880
+ column=col,
881
+ evidence=f"{rate*100:.1f}% null (vs {mean_rate*100:.1f}% average)",
882
+ confidence=0.70,
883
+ inferred_operation="OPTIONAL_FIELD_OR_ETL_ERROR",
884
+ details={
885
+ "null_rate": rate,
886
+ "avg_null_rate": mean_rate
887
+ }
888
+ ))
889
+
890
+ # Detect columns that are null together (conditional logic)
891
+ # This is expensive so we sample
892
+ if len(df) > 100:
893
+ sample = df.sample(min(1000, len(df)))
894
+ else:
895
+ sample = df
896
+
897
+ correlated_nulls = []
898
+ cols = list(df.columns)
899
+ for i, col1 in enumerate(cols):
900
+ for col2 in cols[i+1:]:
901
+ both_null = (sample[col1].isna() & sample[col2].isna()).mean()
902
+ either_null = (sample[col1].isna() | sample[col2].isna()).mean()
903
+
904
+ if either_null > 0.1 and both_null / either_null > 0.8:
905
+ correlated_nulls.append((col1, col2, both_null))
906
+
907
+ if correlated_nulls:
908
+ artifacts.append(Artifact(
909
+ artifact_type="correlated_nulls",
910
+ column="multiple",
911
+ evidence=f"{len(correlated_nulls)} column pairs have correlated nulls",
912
+ confidence=0.75,
913
+ inferred_operation="CONDITIONAL_FIELD_POPULATION",
914
+ details={
915
+ "pairs": [(c1, c2) for c1, c2, _ in correlated_nulls[:5]]
916
+ }
917
+ ))
918
+
919
+ return artifacts
920
+
921
+ def detect(self, df, column: str) -> List[Artifact]:
922
+ """Null patterns are analyzed globally, not per-column."""
923
+ return []
924
+
925
+
926
+ class SchemaArtifacts(ArtifactDetector):
927
+ """
928
+ Detect schema-level artifacts.
929
+
930
+ Artifacts detected:
931
+ - Column naming conventions (framework hints)
932
+ - Data type patterns (database origin)
933
+ - Schema inconsistencies (merged sources)
934
+ """
935
+
936
+ name = "schema"
937
+
938
+ def detect_all(self, df) -> List[Artifact]:
939
+ """Analyze schema patterns."""
940
+ artifacts = []
941
+
942
+ columns = list(df.columns)
943
+
944
+ # Naming convention detection
945
+ conventions = self._detect_naming_conventions(columns)
946
+ if conventions:
947
+ artifacts.append(conventions)
948
+
949
+ # Framework fingerprints
950
+ framework = self._detect_framework_fingerprints(columns)
951
+ if framework:
952
+ artifacts.append(framework)
953
+
954
+ # Mixed conventions (merged sources)
955
+ mixed = self._detect_mixed_conventions(columns)
956
+ if mixed:
957
+ artifacts.append(mixed)
958
+
959
+ return artifacts
960
+
961
+ def detect(self, df, column: str) -> List[Artifact]:
962
+ """Schema patterns are analyzed globally."""
963
+ return []
964
+
965
+ def _detect_naming_conventions(self, columns: List[str]) -> Optional[Artifact]:
966
+ """Detect column naming convention."""
967
+ snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
968
+ camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
969
+ pascal_case = sum(1 for c in columns if re.match(r'^([A-Z][a-z]+)+$', c))
970
+
971
+ total = len(columns)
972
+
973
+ if snake_case > total * 0.7:
974
+ return Artifact(
975
+ artifact_type="naming_convention",
976
+ column="schema",
977
+ evidence=f"snake_case naming ({snake_case}/{total} columns)",
978
+ confidence=0.80,
979
+ inferred_operation="PYTHON_OR_SQL_ORIGIN",
980
+ details={"convention": "snake_case", "ratio": snake_case/total}
981
+ )
982
+ elif camel_case > total * 0.5:
983
+ return Artifact(
984
+ artifact_type="naming_convention",
985
+ column="schema",
986
+ evidence=f"camelCase naming ({camel_case}/{total} columns)",
987
+ confidence=0.80,
988
+ inferred_operation="JAVASCRIPT_OR_JAVA_ORIGIN",
989
+ details={"convention": "camelCase", "ratio": camel_case/total}
990
+ )
991
+ elif pascal_case > total * 0.5:
992
+ return Artifact(
993
+ artifact_type="naming_convention",
994
+ column="schema",
995
+ evidence=f"PascalCase naming ({pascal_case}/{total} columns)",
996
+ confidence=0.80,
997
+ inferred_operation="DOTNET_OR_JAVA_ORIGIN",
998
+ details={"convention": "PascalCase", "ratio": pascal_case/total}
999
+ )
1000
+
1001
+ return None
1002
+
1003
+ def _detect_framework_fingerprints(self, columns: List[str]) -> Optional[Artifact]:
1004
+ """Detect framework-specific column patterns."""
1005
+ col_lower = [c.lower() for c in columns]
1006
+
1007
+ # Django fingerprints
1008
+ if 'id' in col_lower and 'created_at' in col_lower:
1009
+ return Artifact(
1010
+ artifact_type="framework_fingerprint",
1011
+ column="schema",
1012
+ evidence="Django/Rails-style auto columns (id, created_at)",
1013
+ confidence=0.65,
1014
+ inferred_operation="ORM_GENERATED_SCHEMA",
1015
+ details={"framework_hints": ["django", "rails", "sqlalchemy"]}
1016
+ )
1017
+
1018
+ # Pandas export fingerprints
1019
+ if 'unnamed: 0' in col_lower or any('unnamed:' in c for c in col_lower):
1020
+ return Artifact(
1021
+ artifact_type="framework_fingerprint",
1022
+ column="schema",
1023
+ evidence="Pandas index column artifact (Unnamed: 0)",
1024
+ confidence=0.90,
1025
+ inferred_operation="PANDAS_CSV_EXPORT",
1026
+ details={"framework": "pandas"}
1027
+ )
1028
+
1029
+ # MongoDB fingerprints
1030
+ if '_id' in col_lower:
1031
+ return Artifact(
1032
+ artifact_type="framework_fingerprint",
1033
+ column="schema",
1034
+ evidence="MongoDB _id column present",
1035
+ confidence=0.85,
1036
+ inferred_operation="MONGODB_EXPORT",
1037
+ details={"framework": "mongodb"}
1038
+ )
1039
+
1040
+ return None
1041
+
1042
+ def _detect_mixed_conventions(self, columns: List[str]) -> Optional[Artifact]:
1043
+ """Detect mixed naming conventions suggesting merged sources."""
1044
+ snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
1045
+ camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
1046
+
1047
+ total = len(columns)
1048
+
1049
+ # Both conventions present significantly
1050
+ if snake_case > total * 0.2 and camel_case > total * 0.2:
1051
+ return Artifact(
1052
+ artifact_type="mixed_conventions",
1053
+ column="schema",
1054
+ evidence=f"Mixed naming: {snake_case} snake_case, {camel_case} camelCase",
1055
+ confidence=0.75,
1056
+ inferred_operation="MERGED_SOURCES",
1057
+ details={
1058
+ "snake_case_count": snake_case,
1059
+ "camel_case_count": camel_case
1060
+ }
1061
+ )
1062
+
1063
+ return None
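The detectors above share one contract: detect_all(df) for table-wide analysis and detect(df, column) for per-column checks, both returning Artifact records. A minimal sketch of driving them from pandas follows; it assumes the detectors can be instantiated without arguments and that Artifact exposes the attributes passed to its constructor above, and the sample frame is made up:

    import pandas as pd
    from cascade.forensics.artifacts import NullPatterns, SchemaArtifacts

    # Hypothetical frame mixing snake_case and camelCase names plus sparse columns
    df = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "created_at": ["2024-01-01", "2024-01-02", None, None],
        "userName": ["a", None, None, None],
    })

    artifacts = []
    for detector in (NullPatterns(), SchemaArtifacts()):
        artifacts.extend(detector.detect_all(df))

    for a in artifacts:
        print(a.artifact_type, a.column, a.confidence, a.evidence)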
cascade/forensics/fingerprints.py ADDED
@@ -0,0 +1,328 @@
1
+ """
2
+ CASCADE Forensics - Technology Fingerprinting
3
+
4
+ Map detected artifacts to likely technologies and tools.
5
+ The artifacts are evidence. This module is the detective.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import List, Dict, Any, Set
10
+ from collections import defaultdict
11
+
12
+
13
+ @dataclass
14
+ class Fingerprint:
15
+ """A technology fingerprint - evidence pointing to specific tools."""
16
+ technology: str
17
+ category: str # database, framework, language, tool
18
+ confidence: float
19
+ evidence: List[str] = field(default_factory=list)
20
+
21
+ def to_dict(self) -> Dict[str, Any]:
22
+ return {
23
+ "technology": self.technology,
24
+ "category": self.category,
25
+ "confidence": self.confidence,
26
+ "evidence": self.evidence,
27
+ }
28
+
29
+
30
+ class TechFingerprinter:
31
+ """
32
+ Map artifact patterns to likely technologies.
33
+
34
+ This is pattern matching - certain artifact combinations
35
+ are strong indicators of specific tools.
36
+ """
37
+
38
+ # Artifact patterns -> technology mappings
39
+ PATTERNS = {
40
+ # Databases
41
+ "MONGODB_EXPORT": {
42
+ "technology": "MongoDB",
43
+ "category": "database",
44
+ "weight": 0.9,
45
+ },
46
+ "ORM_GENERATED_SCHEMA": {
47
+ "technology": "ORM (Django/Rails/SQLAlchemy)",
48
+ "category": "framework",
49
+ "weight": 0.7,
50
+ },
51
+ "PANDAS_CSV_EXPORT": {
52
+ "technology": "Pandas",
53
+ "category": "tool",
54
+ "weight": 0.95,
55
+ },
56
+
57
+ # Processing tools
58
+ "LOWERCASE_NORMALIZATION": {
59
+ "technology": "Text Preprocessing",
60
+ "category": "processing",
61
+ "weight": 0.6,
62
+ },
63
+ "WHITESPACE_TRIM": {
64
+ "technology": "String Cleaning",
65
+ "category": "processing",
66
+ "weight": 0.5,
67
+ },
68
+
69
+ # Batch processing
70
+ "BATCH_HOURLY": {
71
+ "technology": "Scheduled Batch Job (hourly)",
72
+ "category": "infrastructure",
73
+ "weight": 0.8,
74
+ },
75
+ "BATCH_15MIN": {
76
+ "technology": "Scheduled Batch Job (15min)",
77
+ "category": "infrastructure",
78
+ "weight": 0.8,
79
+ },
80
+ "BATCH_BURST_PROCESSING": {
81
+ "technology": "Event-Driven Batch Processing",
82
+ "category": "infrastructure",
83
+ "weight": 0.7,
84
+ },
85
+ "SCHEDULED_JOB": {
86
+ "technology": "Cron/Scheduler",
87
+ "category": "infrastructure",
88
+ "weight": 0.75,
89
+ },
90
+
91
+ # ID generation
92
+ "UUID_GENERATION_V4": {
93
+ "technology": "Cryptographic UUID Generator",
94
+ "category": "tool",
95
+ "weight": 0.8,
96
+ },
97
+ "UUID_GENERATION_V1": {
98
+ "technology": "Time-based UUID (leaks timestamp + MAC)",
99
+ "category": "tool",
100
+ "weight": 0.85,
101
+ },
102
+ "DETERMINISTIC_ID_GENERATION_SHA256": {
103
+ "technology": "Content-Addressed Storage",
104
+ "category": "architecture",
105
+ "weight": 0.8,
106
+ },
107
+ "DETERMINISTIC_ID_GENERATION_MD5": {
108
+ "technology": "MD5 Hash IDs (legacy system)",
109
+ "category": "architecture",
110
+ "weight": 0.8,
111
+ },
112
+
113
+ # Data quality
114
+ "FILTERING_OR_DELETION": {
115
+ "technology": "Record Filtering/Deletion Pipeline",
116
+ "category": "processing",
117
+ "weight": 0.7,
118
+ },
119
+ "CHARSET_CONVERSION_ERROR": {
120
+ "technology": "Encoding Mismatch (Latin-1 vs UTF-8)",
121
+ "category": "bug",
122
+ "weight": 0.85,
123
+ },
124
+
125
+ # Languages/frameworks
126
+ "PYTHON_OR_SQL_ORIGIN": {
127
+ "technology": "Python or SQL",
128
+ "category": "language",
129
+ "weight": 0.6,
130
+ },
131
+ "JAVASCRIPT_OR_JAVA_ORIGIN": {
132
+ "technology": "JavaScript or Java",
133
+ "category": "language",
134
+ "weight": 0.6,
135
+ },
136
+
137
+ # Source merging
138
+ "MERGED_SOURCES": {
139
+ "technology": "Multi-Source Data Integration",
140
+ "category": "architecture",
141
+ "weight": 0.8,
142
+ },
143
+ "MULTI_SOURCE_MERGE": {
144
+ "technology": "Multi-Source Data Integration",
145
+ "category": "architecture",
146
+ "weight": 0.85,
147
+ },
148
+ }
149
+
150
+ # Compound patterns - combinations that strengthen identification
151
+ COMPOUND_PATTERNS = [
152
+ {
153
+ "requires": ["PANDAS_CSV_EXPORT", "PYTHON_OR_SQL_ORIGIN"],
154
+ "suggests": Fingerprint("Pandas Data Pipeline", "tool", 0.95),
155
+ },
156
+ {
157
+ "requires": ["MONGODB_EXPORT", "JAVASCRIPT_OR_JAVA_ORIGIN"],
158
+ "suggests": Fingerprint("Node.js + MongoDB Stack", "stack", 0.85),
159
+ },
160
+ {
161
+ "requires": ["ORM_GENERATED_SCHEMA", "BATCH_HOURLY"],
162
+ "suggests": Fingerprint("Django/Rails Batch Worker", "stack", 0.80),
163
+ },
164
+ {
165
+ "requires": ["CHARSET_CONVERSION_ERROR", "MERGED_SOURCES"],
166
+ "suggests": Fingerprint("Legacy System Migration", "context", 0.85),
167
+ },
168
+ {
169
+ "requires": ["UUID_GENERATION_V1", "BATCH_BURST_PROCESSING"],
170
+ "suggests": Fingerprint("Distributed System (pre-2015 design)", "architecture", 0.75),
171
+ },
172
+ ]
173
+
174
+ def __init__(self):
175
+ self.fingerprints: List[Fingerprint] = []
176
+
177
+ def analyze(self, artifacts: List['Artifact']) -> List[Fingerprint]:
178
+ """
179
+ Analyze artifacts and return technology fingerprints.
180
+
181
+ Args:
182
+ artifacts: List of detected artifacts
183
+
184
+ Returns:
185
+ List of technology fingerprints sorted by confidence
186
+ """
187
+ self.fingerprints = []
188
+
189
+ # Get all inferred operations
190
+ operations = set(a.inferred_operation for a in artifacts)
191
+
192
+ # Match against patterns
193
+ tech_evidence = defaultdict(list)
194
+ tech_confidence = defaultdict(float)
195
+ tech_category = {}
196
+
197
+ for op in operations:
198
+ # Direct pattern match
199
+ if op in self.PATTERNS:
200
+ pattern = self.PATTERNS[op]
201
+ tech = pattern["technology"]
202
+ tech_evidence[tech].append(op)
203
+ tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"])
204
+ tech_category[tech] = pattern["category"]
205
+
206
+ # Partial match (for patterns with suffixes like SCHEDULED_JOB_24HR)
207
+ for pattern_name, pattern in self.PATTERNS.items():
208
+ if op != pattern_name and op.startswith(pattern_name + '_'):
209
+ tech = pattern["technology"]
210
+ if tech not in tech_evidence or op not in tech_evidence[tech]:
211
+ tech_evidence[tech].append(op)
212
+ tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"] * 0.9)
213
+ tech_category[tech] = pattern["category"]
214
+
215
+ # Check compound patterns
216
+ for compound in self.COMPOUND_PATTERNS:
217
+ required = set(compound["requires"])
218
+ if required.issubset(operations):
219
+ fp = compound["suggests"]
220
+ tech_evidence[fp.technology].extend(list(required))
221
+ tech_confidence[fp.technology] = max(tech_confidence.get(fp.technology, 0), fp.confidence)
222
+ tech_category[fp.technology] = fp.category
223
+
224
+ # Build fingerprint objects
225
+ for tech, evidence in tech_evidence.items():
226
+ self.fingerprints.append(Fingerprint(
227
+ technology=tech,
228
+ category=tech_category.get(tech, "unknown"),
229
+ confidence=tech_confidence[tech],
230
+ evidence=list(set(evidence)),
231
+ ))
232
+
233
+ # Sort by confidence
234
+ self.fingerprints.sort(key=lambda f: f.confidence, reverse=True)
235
+
236
+ return self.fingerprints
237
+
238
+ def get_likely_stack(self) -> Dict[str, Any]:
239
+ """
240
+ Synthesize fingerprints into a likely technology stack.
241
+
242
+ Returns:
243
+ Dict describing the probable system architecture
244
+ """
245
+ if not self.fingerprints:
246
+ return {"stack": "Unknown", "components": []}
247
+
248
+ # Group by category
249
+ by_category = defaultdict(list)
250
+ for fp in self.fingerprints:
251
+ by_category[fp.category].append(fp)
252
+
253
+ stack = {
254
+ "database": None,
255
+ "framework": None,
256
+ "language": None,
257
+ "processing": [],
258
+ "infrastructure": [],
259
+ "architecture_notes": [],
260
+ }
261
+
262
+ # Pick highest confidence for single-value categories
263
+ for cat in ["database", "framework", "language"]:
264
+ if cat in by_category:
265
+ stack[cat] = by_category[cat][0].technology
266
+
267
+ # Aggregate list categories
268
+ for cat in ["processing", "infrastructure"]:
269
+ if cat in by_category:
270
+ stack[cat] = [fp.technology for fp in by_category[cat]]
271
+
272
+ # Architecture notes from high-confidence findings
273
+ if "architecture" in by_category:
274
+ stack["architecture_notes"] = [fp.technology for fp in by_category["architecture"]]
275
+
276
+ # Bugs/issues
277
+ if "bug" in by_category:
278
+ stack["issues"] = [fp.technology for fp in by_category["bug"]]
279
+
280
+ return stack
281
+
282
+ def get_security_concerns(self) -> List[Dict[str, Any]]:
283
+ """
284
+ Identify security-relevant findings.
285
+
286
+ Returns:
287
+ List of security concerns derived from fingerprints
288
+ """
289
+ concerns = []
290
+
291
+ for fp in self.fingerprints:
292
+ # UUID v1 leaks info
293
+ if "Time-based UUID" in fp.technology or "UUID_GENERATION_V1" in fp.evidence:
294
+ concerns.append({
295
+ "severity": "medium",
296
+ "issue": "UUID v1 leaks timestamp and MAC address",
297
+ "evidence": fp.evidence,
298
+ "recommendation": "Use UUID v4 for privacy",
299
+ })
300
+
301
+ # MD5 for IDs
302
+ if "MD5" in fp.technology:
303
+ concerns.append({
304
+ "severity": "low",
305
+ "issue": "MD5 used for ID generation (collision risk)",
306
+ "evidence": fp.evidence,
307
+ "recommendation": "Consider SHA-256 for content addressing",
308
+ })
309
+
310
+ # Encoding errors = data loss
311
+ if "Encoding" in fp.technology or "charset" in fp.technology.lower():
312
+ concerns.append({
313
+ "severity": "medium",
314
+ "issue": "Character encoding errors indicate data corruption",
315
+ "evidence": fp.evidence,
316
+ "recommendation": "Audit data pipeline for charset handling",
317
+ })
318
+
319
+ # Legacy patterns
320
+ if "legacy" in fp.technology.lower() or "pre-2015" in fp.technology.lower():
321
+ concerns.append({
322
+ "severity": "info",
323
+ "issue": "Legacy system patterns detected",
324
+ "evidence": fp.evidence,
325
+ "recommendation": "Review for technical debt",
326
+ })
327
+
328
+ return concerns
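Downstream, TechFingerprinter consumes only the inferred_operation labels carried by those artifacts. A hedged end-to-end sketch, assuming the Artifact constructor accepts the same keyword arguments the detectors use above:

    from cascade.forensics.artifacts import Artifact
    from cascade.forensics.fingerprints import TechFingerprinter

    artifacts = [
        Artifact(artifact_type="framework_fingerprint", column="schema",
                 evidence="Pandas index column artifact (Unnamed: 0)",
                 confidence=0.90, inferred_operation="PANDAS_CSV_EXPORT", details={}),
        Artifact(artifact_type="naming_convention", column="schema",
                 evidence="snake_case naming", confidence=0.80,
                 inferred_operation="PYTHON_OR_SQL_ORIGIN", details={}),
    ]

    fingerprinter = TechFingerprinter()
    for fp in fingerprinter.analyze(artifacts):
        print(f"{fp.technology:<30} {fp.category:<12} {fp.confidence:.2f}")

    print(fingerprinter.get_likely_stack())        # probable stack, grouped by category
    print(fingerprinter.get_security_concerns())   # UUID v1 / MD5 / encoding findings

Because both operations appear together, the PANDAS_CSV_EXPORT + PYTHON_OR_SQL_ORIGIN compound pattern also surfaces "Pandas Data Pipeline" at 0.95 confidence.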
cascade/genesis.py ADDED
@@ -0,0 +1,200 @@
1
+ """
2
+ CASCADE Genesis - The origin node of the neural internetwork.
3
+
4
+ Every chain begins here. Systems link to genesis (or to any
5
+ descendant of genesis) to join the lattice.
6
+
7
+ The chain IS the registry. No separate discovery needed.
8
+
9
+ Usage:
10
+ # Create genesis (done once, published to well-known location)
11
+ genesis = create_genesis()
12
+
13
+ # Any system joins by linking to genesis
14
+ my_chain.link_external(genesis.merkle_root)
15
+
16
+ # Or by linking to any existing node in the lattice
17
+ my_chain.link_external(some_other_chain.merkle_root)
18
+
19
+ # The lattice grows. Discovery = reading the chain.
20
+ """
21
+
22
+ import hashlib
23
+ import json
24
+ import time
25
+ from pathlib import Path
26
+ from typing import Optional, Dict, Any
27
+
28
+ from cascade.core.provenance import ProvenanceChain, ProvenanceRecord
29
+
30
+
31
+ # Well-known genesis identifiers
32
+ GENESIS_SESSION_ID = "genesis_0"
33
+ GENESIS_MODEL_ID = "cascade_genesis"
34
+ GENESIS_INPUT = "In the beginning was the hash, and the hash was with the chain, and the hash was the chain."
35
+
36
+
37
+ def create_genesis() -> ProvenanceChain:
38
+ """
39
+ Create the genesis chain - origin of the neural internetwork.
40
+
41
+ This is deterministic. Anyone running this gets the same genesis.
42
+ That's the point - it's the Schelling point for the lattice.
43
+ """
44
+ # Deterministic input hash
45
+ input_hash = hashlib.sha256(GENESIS_INPUT.encode()).hexdigest()[:16]
46
+
47
+ # Deterministic model hash (hash of the genesis concept itself)
48
+ model_hash = hashlib.sha256(b"cascade_neural_internetwork_v1").hexdigest()[:16]
49
+
50
+ chain = ProvenanceChain(
51
+ session_id=GENESIS_SESSION_ID,
52
+ model_id=GENESIS_MODEL_ID,
53
+ model_hash=model_hash,
54
+ input_hash=input_hash,
55
+ )
56
+
57
+ # The genesis record - the first node
58
+ # Its parent is itself (bootstrap)
59
+ genesis_record = ProvenanceRecord(
60
+ layer_name="genesis",
61
+ layer_idx=0,
62
+ state_hash=input_hash, # Self-referential
63
+ parent_hashes=[input_hash], # Points to itself
64
+ params_hash=model_hash,
65
+ shape=[1],
66
+ dtype="genesis",
67
+ stats={"created": time.time()},  # NOTE: assumes the merkle root ignores stats; a wall-clock value here would otherwise break the determinism promised above
68
+ execution_order=0,
69
+ )
70
+
71
+ chain.add_record(genesis_record)
72
+ chain.finalize()
73
+
74
+ return chain
75
+
76
+
77
+ def get_genesis_root() -> str:
78
+ """
79
+ Get the genesis merkle root.
80
+
81
+ This is a constant - the Schelling point.
82
+ Any system can compute it and know they're linking to the same origin.
83
+ """
84
+ return create_genesis().merkle_root
85
+
86
+
87
+ def save_genesis(path: Path) -> str:
88
+ """
89
+ Save genesis chain to file.
90
+
91
+ This file can be published to a well-known location
92
+ (HuggingFace dataset, IPFS, etc.)
93
+ """
94
+ genesis = create_genesis()
95
+
96
+ with open(path, 'w') as f:
97
+ json.dump(genesis.to_dict(), f, indent=2)
98
+
99
+ return genesis.merkle_root
100
+
101
+
102
+ def load_genesis(path: Path) -> ProvenanceChain:
103
+ """Load genesis from file and verify it's authentic."""
104
+ with open(path, 'r') as f:
105
+ data = json.load(f)
106
+
107
+ chain = ProvenanceChain.from_dict(data)
108
+
109
+ # Verify this is actually genesis
110
+ expected_root = get_genesis_root()
111
+ if chain.merkle_root != expected_root:
112
+ raise ValueError(
113
+ f"Invalid genesis: root {chain.merkle_root} != expected {expected_root}"
114
+ )
115
+
116
+ return chain
117
+
118
+
119
+ def link_to_genesis(chain: ProvenanceChain) -> None:
120
+ """
121
+ Link a chain to genesis, joining the neural internetwork.
122
+
123
+ This is the simplest way to join - link directly to the origin.
124
+ Alternatively, link to any other chain that traces back to genesis.
125
+ """
126
+ chain.link_external(get_genesis_root(), source_id="genesis")
127
+
128
+
129
+ def verify_lineage_to_genesis(chain: ProvenanceChain, known_chains: Dict[str, ProvenanceChain]) -> bool:
130
+ """
131
+ Verify that a chain traces back to genesis through external_roots.
132
+
133
+ Args:
134
+ chain: The chain to verify
135
+ known_chains: Dict mapping merkle_root -> chain for lookup
136
+
137
+ Returns:
138
+ True if chain traces to genesis, False otherwise
139
+ """
140
+ genesis_root = get_genesis_root()
141
+ visited = set()
142
+
143
+ def trace(root: str) -> bool:
144
+ if root in visited:
145
+ return False
146
+ visited.add(root)
147
+
148
+ # Found genesis!
149
+ if root == genesis_root:
150
+ return True
151
+
152
+ # Look up this chain
153
+ if root not in known_chains:
154
+ return False # Can't verify - chain not known
155
+
156
+ c = known_chains[root]
157
+
158
+ # Check if any external root leads to genesis
159
+ for ext_root in c.external_roots:
160
+ if trace(ext_root):
161
+ return True
162
+
163
+ return False
164
+
165
+ # Start from the chain's own root
166
+ return trace(chain.merkle_root) or any(trace(r) for r in chain.external_roots)
167
+
168
+
169
+ # =============================================================================
170
+ # CLI for genesis operations
171
+ # =============================================================================
172
+
173
+ if __name__ == "__main__":
174
+ import sys
175
+
176
+ genesis = create_genesis()
177
+
178
+ print("=" * 60)
179
+ print("CASCADE GENESIS")
180
+ print("=" * 60)
181
+ print(f"Merkle Root: {genesis.merkle_root}")
182
+ print(f"Session ID: {genesis.session_id}")
183
+ print(f"Model ID: {genesis.model_id}")
184
+ print(f"Input Hash: {genesis.input_hash}")
185
+ print("=" * 60)
186
+ print()
187
+ print("This is the origin of the neural internetwork.")
188
+ print("Any system can link to this root to join the lattice.")
189
+ print()
190
+ print("To join:")
191
+ print(" from cascade.genesis import get_genesis_root")
192
+ print(" my_chain.link_external(get_genesis_root())")
193
+ print()
194
+
195
+ # Save if requested
196
+ if len(sys.argv) > 1 and sys.argv[1] == "--save":
197
+ out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("genesis.json")
198
+ root = save_genesis(out_path)
199
+ print(f"Genesis saved to: {out_path}")
200
+ print(f"Root: {root}")
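A short sketch of the join-and-verify flow, using only the helpers defined in this file. The second chain's identifiers are placeholders, its construction mirrors create_genesis above, and it assumes link_external records the root in external_roots, as verify_lineage_to_genesis expects:

    from cascade.genesis import (
        create_genesis, get_genesis_root, link_to_genesis, verify_lineage_to_genesis,
    )
    from cascade.core.provenance import ProvenanceChain

    genesis = create_genesis()

    # Hypothetical chain for some model run
    my_chain = ProvenanceChain(
        session_id="run_42",
        model_id="my_model",
        model_hash="deadbeefdeadbeef",
        input_hash="cafebabecafebabe",
    )
    link_to_genesis(my_chain)   # adds get_genesis_root() to my_chain.external_roots

    known = {genesis.merkle_root: genesis}
    print(verify_lineage_to_genesis(my_chain, known))   # True once the link is in place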
cascade/hold/__init__.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
3
+ ║ ║
4
+ ║ ██╗ ██╗ ██████╗ ██╗ ██████╗ ║
5
+ ║ ██║ ██║██╔═══██╗██║ ██╔══██╗ ║
6
+ ║ ███████║██║ ██║██║ ██║ ██║ ║
7
+ ║ ██╔══██║██║ ██║██║ ██║ ██║ ║
8
+ ║ ██║ ██║╚██████╔╝███████╗██████╔╝ ║
9
+ ║ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝╚═════╝ ║
10
+ ║ ║
11
+ ║ Inference-Level Halt Protocol for CASCADE-LATTICE ║
12
+ ║ ║
13
+ ║ "Pause the machine. See what it sees. Choose what it chooses." ║
14
+ ║ ║
15
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
16
+
17
+ HOLD is MODEL-AGNOSTIC. Works with ANY framework:
18
+ - PyTorch, JAX, TensorFlow, scikit-learn
19
+ - Hugging Face, OpenAI API, Anthropic API
20
+ - Stable Baselines3, RLlib, custom RL
21
+ - Any function that outputs probabilities
22
+
23
+ USAGE:
24
+ >>> from cascade.hold import Hold
25
+ >>>
26
+ >>> # Your model (any framework)
27
+ >>> probs = your_model.predict(obs)
28
+ >>>
29
+ >>> # HOLD at decision point
30
+ >>> hold = Hold.get()
31
+ >>> resolution = hold.yield_point(
32
+ ... action_probs=probs,
33
+ ... value=value_estimate,
34
+ ... observation=obs,
35
+ ... brain_id="my_model",
36
+ ... # Optional informational wealth:
37
+ ... action_labels=["up", "down", "left", "right"],
38
+ ... latent=model.get_latent(),
39
+ ... attention=model.get_attention(),
40
+ ... features=model.get_features(),
41
+ ... imagination=model.imagine_futures(),
42
+ ... )
43
+ >>>
44
+ >>> # Use resolved action
45
+ >>> action = resolution.action
46
+ >>> was_override = resolution.was_override
47
+
48
+ CLI:
49
+ $ cascade hold # Start HOLD interface
50
+ $ cascade hold-status # Show HOLD system status
51
+ """
52
+
53
+ # Primitives - the core API
54
+ from cascade.hold.primitives import (
55
+ HoldState,
56
+ HoldPoint,
57
+ HoldResolution,
58
+ Hold,
59
+ HoldAwareMixin,
60
+ )
61
+
62
+ # Session Layer - arcade-style history and time travel
63
+ from cascade.hold.session import (
64
+ InferenceStep,
65
+ HoldSession,
66
+ ArcadeFeedback,
67
+ CausationHold,
68
+ )
69
+
70
+ __all__ = [
71
+ # Primitives
72
+ "HoldState",
73
+ "HoldPoint",
74
+ "HoldResolution",
75
+ "Hold",
76
+ "HoldAwareMixin",
77
+ # Session
78
+ "InferenceStep",
79
+ "HoldSession",
80
+ "ArcadeFeedback",
81
+ "CausationHold",
82
+ ]
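For non-interactive runs (CI, batch evaluation) the same yield_point call can be made without pausing by enabling auto_accept: hold points are still observed into CASCADE, but inference never blocks. A minimal sketch; the probabilities and labels are placeholders:

    import numpy as np
    from cascade.hold import Hold

    hold = Hold.get()
    hold.auto_accept = True   # record hold points, never wait for input

    probs = np.array([0.1, 0.7, 0.2])
    resolution = hold.yield_point(
        action_probs=probs,
        value=0.42,
        observation={"step": 0},
        brain_id="offline_eval",
        action_labels=["left", "stay", "right"],
    )
    assert resolution.action == int(np.argmax(probs))
    assert resolution.was_override is False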
cascade/hold/primitives.py ADDED
@@ -0,0 +1,673 @@
1
+ """
2
+ HOLD Primitives - Core Data Structures and Singleton
3
+ ═══════════════════════════════════════════════════════════
4
+
5
+ The primitive layer of HOLD:
6
+ - HoldPoint: A frozen moment in inference
7
+ - HoldResolution: The outcome of a hold
8
+ - Hold: Singleton system managing inference-level halts
9
+
10
+ HOLD is a CASCADE-LATTICE primitive.
11
+ No cascade = No HOLD.
12
+ """
13
+
14
+ import time
15
+ import hashlib
16
+ import threading
17
+ from typing import Dict, Any, Optional, Callable, List
18
+ from dataclasses import dataclass, field
19
+ from enum import Enum
20
+ import numpy as np
21
+
22
+ # CASCADE-LATTICE is REQUIRED
23
+ try:
24
+ from cascade import sdk_observe
25
+ from cascade.core.event import CausationLink
26
+ from cascade.core.graph import CausationGraph
27
+ HAS_CASCADE = True
28
+ except ImportError:
29
+ HAS_CASCADE = False
30
+ # Stubs for when imported standalone (testing)
31
+ def sdk_observe(*args, **kwargs): pass
32
+ class CausationLink:
33
+ def __init__(self, **kwargs): pass
34
+ class CausationGraph:
35
+ def add_link(self, link): pass
36
+
37
+
38
+ class HoldState(Enum):
39
+ """State of a hold point."""
40
+ PENDING = "pending" # Waiting for resolution
41
+ ACCEPTED = "accepted" # AI choice was accepted
42
+ OVERRIDDEN = "overridden" # Human override
43
+ TIMEOUT = "timeout" # Timed out, fell back to AI
44
+ CANCELLED = "cancelled" # Hold was cancelled
45
+
46
+
47
+ def _sanitize(data: Any) -> Any:
48
+ """Recursively convert numpy types to python types."""
49
+ if isinstance(data, dict):
50
+ return {k: _sanitize(v) for k, v in data.items()}
51
+ elif isinstance(data, (list, tuple)):
52
+ return [_sanitize(x) for x in data]
53
+ elif isinstance(data, np.ndarray):
+ return data.tolist()
+ elif isinstance(data, np.generic):
+ return data.item()
55
+ return data
56
+
57
+
58
+ @dataclass
59
+ class HoldPoint:
60
+ """
61
+ A decision point where inference yields for potential human intervention.
62
+
63
+ This is the "freeze frame" - the moment before commitment.
64
+ The decision matrix is exposed, the merkle chain awaits.
65
+
66
+ INFORMATIONAL WEALTH - everything a human needs to understand the decision:
67
+ - action_labels: What each action means ("FORWARD", "ATTACK", etc.)
68
+ - latent: The model's internal representation (for inspection)
69
+ - attention: What the model is attending to
70
+ - features: Extracted feature activations
71
+ - imagination: Per-action trajectory predictions and expected values
72
+ - logits: Raw logits before softmax (for temperature analysis)
73
+ - reasoning: Text explanations if available
74
+ """
75
+ # Decision matrix
76
+ action_probs: np.ndarray # The probability distribution
77
+ value: float # Predicted value
78
+
79
+ # Context
80
+ observation: Dict[str, Any] # What the brain saw
81
+ brain_id: str # Which brain is holding
82
+
83
+ # === INFORMATIONAL WEALTH ===
84
+
85
+ # Action labels - CRITICAL for human understanding
86
+ action_labels: Optional[List[str]] = None # ["NOOP", "FORWARD", "BACK", ...]
87
+
88
+ # Internal state
89
+ latent: Optional[np.ndarray] = None # Latent activations (any shape)
90
+ attention: Optional[Dict[str, float]] = None # {"position": 0.7, "health": 0.3, ...}
91
+ features: Optional[Dict[str, float]] = None # {"spatial_attn": 0.8, "danger": 0.2, ...}
92
+
93
+ # Per-action deep data
94
+ imagination: Optional[Dict[int, Dict]] = None # {0: {"trajectory": [...], "expected_value": 0.5}, ...}
95
+
96
+ # Logits (pre-softmax)
97
+ logits: Optional[np.ndarray] = None # Raw logits for each action
98
+
99
+ # Reasoning chain (if model provides explanations)
100
+ reasoning: Optional[List[str]] = None # ["High reward expected", "Low risk path", ...]
101
+
102
+ # World model predictions (if available)
103
+ world_prediction: Optional[Dict[str, Any]] = None # {"pos_delta": [1,0,0], "health_delta": -2, ...}
104
+
105
+ # === END WEALTH ===
106
+
107
+ # Identity
108
+ id: str = field(default_factory=lambda: hashlib.sha256(str(time.time()).encode()).hexdigest()[:16])
109
+ timestamp: float = field(default_factory=time.time)
110
+
111
+ # Merkle linkage
112
+ parent_merkle: Optional[str] = None # Previous hold point
113
+ merkle_root: Optional[str] = None # Computed on creation
114
+
115
+ # State
116
+ state: HoldState = HoldState.PENDING
117
+
118
+ def __post_init__(self):
119
+ """Compute merkle root on creation."""
120
+ if self.merkle_root is None:
121
+ data = f"{self.id}:{self.brain_id}:{self.action_probs.tobytes().hex()}:{self.timestamp}"
122
+ if self.parent_merkle:
123
+ data = f"{self.parent_merkle}:{data}"
124
+ self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]
125
+
126
+ @property
127
+ def ai_choice(self) -> int:
128
+ """What the AI would choose."""
129
+ return int(np.argmax(self.action_probs))
130
+
131
+ @property
132
+ def ai_confidence(self) -> float:
133
+ """Confidence in AI's top choice."""
134
+ return float(np.max(self.action_probs))
135
+
136
+ def to_dict(self) -> Dict[str, Any]:
137
+ """Serialize for CASCADE observation - includes full informational wealth."""
138
+ d = {
139
+ 'id': self.id,
140
+ 'brain_id': self.brain_id,
141
+ 'action_probs': self.action_probs.tolist(),
142
+ 'ai_choice': self.ai_choice,
143
+ 'ai_confidence': self.ai_confidence,
144
+ 'value': self.value,
145
+ 'timestamp': self.timestamp,
146
+ 'merkle_root': self.merkle_root,
147
+ 'parent_merkle': self.parent_merkle,
148
+ 'state': self.state.value,
149
+ 'observation': self.observation,
150
+ }
151
+
152
+ # Include all available wealth
153
+ if self.action_labels is not None:
154
+ d['action_labels'] = self.action_labels
155
+ if self.latent is not None:
156
+ d['latent'] = self.latent.tolist() if hasattr(self.latent, 'tolist') else self.latent
157
+ if self.attention is not None:
158
+ d['attention'] = self.attention
159
+ if self.features is not None:
160
+ d['features'] = self.features
161
+ if self.imagination is not None:
162
+ d['imagination'] = self.imagination
163
+ if self.logits is not None:
164
+ d['logits'] = self.logits.tolist() if hasattr(self.logits, 'tolist') else self.logits
165
+ if self.reasoning is not None:
166
+ d['reasoning'] = self.reasoning
167
+ if self.world_prediction is not None:
168
+ d['world_prediction'] = self.world_prediction
169
+
170
+ return _sanitize(d)
171
+
172
+
173
+ @dataclass
174
+ class HoldResolution:
175
+ """
176
+ The resolution of a hold point.
177
+
178
+ Either the human accepted, overrode, or it timed out.
179
+ Links back to the hold point, forming a provenance chain.
180
+ """
181
+ hold_point: HoldPoint # The hold that was resolved
182
+ action: int # Final action taken
183
+
184
+ # Resolution details
185
+ was_override: bool # True if human overrode AI
186
+ override_source: Optional[str] = None # Who/what overrode ("human", "policy", etc.)
187
+
188
+ # Timing
189
+ hold_duration: float = 0.0 # How long was held
190
+ timestamp: float = field(default_factory=time.time)
191
+
192
+ # Merkle linkage
193
+ merkle_root: Optional[str] = None
194
+
195
+ def __post_init__(self):
196
+ """Compute merkle root."""
197
+ if self.merkle_root is None:
198
+ data = f"{self.hold_point.merkle_root}:{self.action}:{self.was_override}:{self.timestamp}"
199
+ self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]
200
+
201
+ def to_dict(self) -> Dict[str, Any]:
202
+ """Serialize for CASCADE observation."""
203
+ d = {
204
+ 'hold_id': self.hold_point.id,
205
+ 'hold_merkle': self.hold_point.merkle_root,
206
+ 'action': self.action,
207
+ 'ai_choice': self.hold_point.ai_choice,
208
+ 'was_override': self.was_override,
209
+ 'override_source': self.override_source,
210
+ 'hold_duration': self.hold_duration,
211
+ 'merkle_root': self.merkle_root,
212
+ 'timestamp': self.timestamp,
213
+ }
214
+ return _sanitize(d)
215
+
216
+
217
+ class Hold:
218
+ """
219
+ The HOLD system - manages inference-level halts.
220
+
221
+ Singleton pattern - one Hold system per process.
222
+
223
+ Usage:
224
+ hold = Hold.get()
225
+
226
+ # Register listeners (for UI, visualization, etc.)
227
+ hold.register_listener(my_callback)
228
+
229
+ # From within a brain's forward() method:
230
+ resolution = hold.yield_point(
231
+ action_probs=probs,
232
+ value=value,
233
+ observation=obs,
234
+ brain_id="brain_001"
235
+ )
236
+ # Blocks until resolution!
237
+
238
+ # From UI/control thread:
239
+ hold.accept() # or
240
+ hold.override(action=3, source="human")
241
+ """
242
+
243
+ _instance = None
244
+ _lock = threading.Lock()
245
+
246
+ def __new__(cls):
247
+ if cls._instance is None:
248
+ with cls._lock:
249
+ if cls._instance is None:
250
+ cls._instance = super().__new__(cls)
251
+ cls._instance._initialized = False
252
+ return cls._instance
253
+
254
+ def __init__(self):
255
+ if self._initialized:
256
+ return
257
+
258
+ # State
259
+ self._current_hold: Optional[HoldPoint] = None
260
+ self._resolution_event = threading.Event()
261
+ self._resolution: Optional[HoldResolution] = None
262
+
263
+ # Chain
264
+ self._last_merkle: Optional[str] = None
265
+ self._hold_count = 0
266
+ self._override_count = 0
267
+
268
+ # Callbacks - interfaces register here to receive hold points
269
+ self._listeners: List[Callable[[HoldPoint], None]] = []
270
+
271
+ # Settings
272
+ self.timeout: float = 30.0 # Default timeout (seconds)
273
+ self.auto_accept: bool = False # If True, don't block, just observe
274
+
275
+ # CASCADE graph for this session
276
+ self._causation_graph = CausationGraph()
277
+
278
+ self._initialized = True
279
+ print("[HOLD] system initialized (cascade-lattice)")
280
+
281
+ @classmethod
282
+ def get(cls) -> 'Hold':
283
+ """Get the singleton instance."""
284
+ return cls()
285
+
286
+ def register_listener(self, callback: Callable[[HoldPoint], None]):
287
+ """
288
+ Register a listener for hold points.
289
+
290
+ The callback receives HoldPoint when inference halts.
291
+ Use this to connect visualizations, UIs, etc.
292
+ """
293
+ self._listeners.append(callback)
294
+ print(f"[REGISTER] Registered HOLD listener: {callback.__name__ if hasattr(callback, '__name__') else callback}")
295
+
296
+ def unregister_listener(self, callback: Callable):
297
+ """Remove a listener."""
298
+ if callback in self._listeners:
299
+ self._listeners.remove(callback)
300
+
301
+ def yield_point(
302
+ self,
303
+ action_probs: np.ndarray,
304
+ value: float,
305
+ observation: Dict[str, Any],
306
+ brain_id: str,
307
+ # === INFORMATIONAL WEALTH ===
308
+ action_labels: Optional[List[str]] = None,
309
+ latent: Optional[np.ndarray] = None,
310
+ attention: Optional[Dict[str, float]] = None,
311
+ features: Optional[Dict[str, float]] = None,
312
+ imagination: Optional[Dict[int, Dict]] = None,
313
+ logits: Optional[np.ndarray] = None,
314
+ reasoning: Optional[List[str]] = None,
315
+ world_prediction: Optional[Dict[str, Any]] = None,
316
+ # === END WEALTH ===
317
+ blocking: bool = True,
318
+ ) -> HoldResolution:
319
+ """
320
+ Create a hold point and yield for resolution.
321
+
322
+ This is called from within a brain's forward() method.
323
+ Blocks until resolved (or timeout).
324
+
325
+ Args:
326
+ action_probs: The decision matrix (probability distribution)
327
+ value: Predicted value
328
+ observation: What the brain observed
329
+ brain_id: Identifier for the brain
330
+
331
+ INFORMATIONAL WEALTH (all optional, but improves human understanding):
332
+ action_labels: Names for each action ["FORWARD", "BACK", "LEFT", ...]
333
+ latent: Model's latent state/activations
334
+ attention: Attention weights {"position": 0.7, "health": 0.3}
335
+ features: Feature activations {"spatial": 0.8, "danger": 0.2}
336
+ imagination: Per-action predictions {0: {"trajectory": [...], "expected_value": 0.5}}
337
+ logits: Raw pre-softmax logits
338
+ reasoning: Text explanations ["High reward expected", ...]
339
+ world_prediction: World model predictions {"pos_delta": [1,0,0]}
340
+
341
+ blocking: If False, returns immediately with AI choice
342
+
343
+ Returns:
344
+ HoldResolution with the final action
345
+ """
346
+ # Create hold point with full wealth
347
+ hold = HoldPoint(
348
+ action_probs=action_probs,
349
+ value=value,
350
+ observation=observation,
351
+ brain_id=brain_id,
352
+ action_labels=action_labels,
353
+ latent=latent,
354
+ attention=attention,
355
+ features=features,
356
+ imagination=imagination,
357
+ logits=logits,
358
+ reasoning=reasoning,
359
+ world_prediction=world_prediction,
360
+ parent_merkle=self._last_merkle,
361
+ )
362
+
363
+ # Observe the hold point in CASCADE
364
+ sdk_observe(
365
+ model_id=brain_id,
366
+ input_data=observation,
367
+ output_data={**hold.to_dict(), 'event_type': 'hold_point'},
368
+ )
369
+
370
+ self._hold_count += 1
371
+
372
+ # Non-blocking mode - just observe and return AI choice
373
+ if not blocking or self.auto_accept:
374
+ resolution = HoldResolution(
375
+ hold_point=hold,
376
+ action=hold.ai_choice,
377
+ was_override=False,
378
+ hold_duration=0.0,
379
+ )
380
+ self._observe_resolution(resolution)
381
+ return resolution
382
+
383
+ # Set as current hold
384
+ self._current_hold = hold
385
+ self._resolution_event.clear()
386
+ self._resolution = None
387
+
388
+ # Notify listeners
389
+ for listener in self._listeners:
390
+ try:
391
+ listener(hold)
392
+ except Exception as e:
393
+ print(f"⚠️ HOLD listener error: {e}")
394
+
395
+ # Print hold info
396
+ print(f"\n{'═' * 50}")
397
+ print(f"🛑 HOLD #{self._hold_count}")
398
+ print(f" Merkle: {hold.merkle_root}")
399
+ ai_label = hold.action_labels[hold.ai_choice] if hold.action_labels else str(hold.ai_choice)
400
+ print(f" AI Choice: {ai_label} (confidence: {hold.ai_confidence:.2%})")
401
+ print(f" Value: {hold.value:.4f}")
402
+
403
+ # Show probabilities with labels
404
+ if hold.action_labels:
405
+ prob_str = ', '.join(f'{hold.action_labels[i]}:{p:.2f}' for i, p in enumerate(hold.action_probs))
406
+ else:
407
+ prob_str = ', '.join(f'{i}:{p:.2f}' for i, p in enumerate(hold.action_probs))
408
+ print(f" Probabilities: {prob_str}")
409
+
410
+ # Show available wealth
411
+ wealth = []
412
+ if hold.latent is not None: wealth.append("latent")
413
+ if hold.attention is not None: wealth.append("attention")
414
+ if hold.features is not None: wealth.append("features")
415
+ if hold.imagination is not None: wealth.append("imagination")
416
+ if hold.reasoning is not None: wealth.append("reasoning")
417
+ if wealth:
418
+ print(f" Wealth: {', '.join(wealth)}")
419
+
420
+ print(f" Waiting for resolution (timeout: {self.timeout}s)...")
421
+ print(f"{'═' * 50}")
422
+
423
+ # Block until resolution or timeout
424
+ start_time = time.time()
425
+ resolved = self._resolution_event.wait(timeout=self.timeout)
426
+ hold_duration = time.time() - start_time
427
+
428
+ if resolved and self._resolution:
429
+ resolution = self._resolution
430
+ resolution.hold_duration = hold_duration
431
+ else:
432
+ # Timeout - use AI choice
433
+ hold.state = HoldState.TIMEOUT
434
+ resolution = HoldResolution(
435
+ hold_point=hold,
436
+ action=hold.ai_choice,
437
+ was_override=False,
438
+ override_source="timeout",
439
+ hold_duration=hold_duration,
440
+ )
441
+ print(f"[TIMEOUT] HOLD timeout - accepting AI choice: {hold.ai_choice}")
442
+
443
+ # Observe resolution
444
+ self._observe_resolution(resolution)
445
+
446
+ # Clear state
447
+ self._current_hold = None
448
+ self._resolution = None
449
+
450
+ return resolution
451
+
452
+ def resolve(self, action: int, source: str = "human"):
453
+ """
454
+ Resolve the current hold with an action.
455
+
456
+ Called by UI/interface when human makes a choice.
457
+
458
+ Args:
459
+ action: The chosen action
460
+ source: Who resolved it ("human", "policy", etc.)
461
+ """
462
+ if self._current_hold is None:
463
+ print("[WARN] No active hold to resolve")
464
+ return
465
+
466
+ hold = self._current_hold
467
+ was_override = (action != hold.ai_choice)
468
+
469
+ if was_override:
470
+ hold.state = HoldState.OVERRIDDEN
471
+ self._override_count += 1
472
+ else:
473
+ hold.state = HoldState.ACCEPTED
474
+
475
+ self._resolution = HoldResolution(
476
+ hold_point=hold,
477
+ action=action,
478
+ was_override=was_override,
479
+ override_source=source if was_override else None,
480
+ )
481
+
482
+ print(f"[RESOLVE] HOLD resolved: action={action}, override={was_override}")
483
+ self._resolution_event.set()
484
+
485
+ def accept(self):
486
+ """Accept AI's choice for current hold."""
487
+ if self._current_hold:
488
+ self.resolve(self._current_hold.ai_choice, source="accept")
489
+
490
+ def override(self, action: int, source: str = "human"):
491
+ """Override with a different action."""
492
+ self.resolve(action, source)
493
+
494
+ def cancel(self):
495
+ """Cancel current hold without resolution."""
496
+ if self._current_hold:
497
+ self._current_hold.state = HoldState.CANCELLED
498
+ self._resolution = HoldResolution(
499
+ hold_point=self._current_hold,
500
+ action=self._current_hold.ai_choice,
501
+ was_override=False,
502
+ override_source="cancelled",
503
+ )
504
+ self._resolution_event.set()
505
+
506
+ def _observe_resolution(self, resolution: HoldResolution):
507
+ """Record resolution to CASCADE."""
508
+ sdk_observe(
509
+ model_id=resolution.hold_point.brain_id,
510
+ input_data=resolution.hold_point.to_dict(),
511
+ output_data={**resolution.to_dict(), 'event_type': 'hold_resolution'},
512
+ )
513
+
514
+ # Update chain
515
+ self._last_merkle = resolution.merkle_root
516
+
517
+ # Add to causation graph
518
+ link = CausationLink(
519
+ from_event=resolution.hold_point.merkle_root,
520
+ to_event=resolution.merkle_root,
521
+ causation_type="hold_resolved",
522
+ strength=1.0 if resolution.was_override else 0.5,
523
+ explanation=f"Override: {resolution.was_override}, Action: {resolution.action}",
524
+ )
525
+ self._causation_graph.add_link(link)
526
+
527
+ @property
528
+ def current_hold(self) -> Optional[HoldPoint]:
529
+ """Get current active hold point (if any)."""
530
+ return self._current_hold
531
+
532
+ @property
533
+ def stats(self) -> Dict[str, Any]:
534
+ """Get hold statistics."""
535
+ return {
536
+ 'total_holds': self._hold_count,
537
+ 'overrides': self._override_count,
538
+ 'override_rate': self._override_count / max(self._hold_count, 1),
539
+ 'last_merkle': self._last_merkle,
540
+ }
541
+
542
+
543
+ class HoldAwareMixin:
544
+ """
545
+ Mixin for brains that support HOLD.
546
+
547
+ Add this to your Brain class to enable inference-level halts.
548
+
549
+ Usage:
550
+ class MyBrain(HoldAwareMixin, BaseBrain):
551
+ def forward(self, inputs):
552
+ # Your inference code
553
+ return {"action_probs": probs, "value": value}
554
+
555
+ brain = MyBrain()
556
+ brain.enable_hold()
557
+
558
+ # Now forward_with_hold() will pause for human input
559
+ output = brain.forward_with_hold(inputs)
560
+ """
561
+
562
+ def __init__(self, *args, **kwargs):
563
+ super().__init__(*args, **kwargs)
564
+ self._hold_system = Hold.get()
565
+ self._hold_enabled = True
566
+ self._brain_id = getattr(self, 'id', hashlib.sha256(str(id(self)).encode()).hexdigest()[:16])
567
+
568
+ def forward_with_hold(
569
+ self,
570
+ inputs: Dict[str, Any],
571
+ blocking: bool = True,
572
+ ) -> Dict[str, Any]:
573
+ """
574
+ Forward pass with HOLD support.
575
+
576
+ Call this instead of forward() to enable hold points.
577
+ """
578
+ # Get decision matrix from normal forward
579
+ output = self.forward(inputs)
580
+
581
+ if not self._hold_enabled:
582
+ return output
583
+
584
+ action_probs = output.get('action_probs', None)
585
+ if action_probs is None:
586
+ return output
587
+
588
+ # Get imagination if available (DreamerBrain, etc.)
589
+ imagined = None
590
+ if hasattr(self, 'imagine'):
591
+ try:
592
+ imagined = self.imagine(horizon=15)
593
+ except Exception:
594
+ pass
595
+
596
+ # Yield to hold system
597
+ resolution = self._hold_system.yield_point(
598
+ action_probs=np.array(action_probs),
599
+ value=float(output.get('value', 0.0)),
600
+ observation=inputs,
601
+ brain_id=self._brain_id,
602
+ imagination=imagined,
603
+ blocking=blocking,
604
+ )
605
+
606
+ # Update output with resolved action
607
+ output['action'] = resolution.action
608
+ output['hold_resolution'] = resolution.to_dict()
609
+ output['was_override'] = resolution.was_override
610
+
611
+ return output
612
+
613
+ def enable_hold(self):
614
+ """Enable HOLD for this brain."""
615
+ self._hold_enabled = True
616
+
617
+ def disable_hold(self):
618
+ """Disable HOLD (normal inference)."""
619
+ self._hold_enabled = False
620
+
621
+
622
+ # Demo
623
+ def _demo_hold():
624
+ """Demonstrate HOLD system."""
625
+ print("=" * 60)
626
+ print("HOLD SYSTEM DEMO")
627
+ print("=" * 60)
628
+
629
+ # Get hold system
630
+ hold = Hold.get()
631
+ hold.timeout = 10.0
632
+
633
+ def on_hold(point: HoldPoint):
634
+ print(f"\n🔔 Listener received hold: {point.id}")
635
+
636
+ hold.register_listener(on_hold)
637
+
638
+ def brain_loop():
639
+ for step in range(3):
640
+ probs = np.random.dirichlet(np.ones(8))
641
+ resolution = hold.yield_point(
642
+ action_probs=probs,
643
+ value=np.random.random(),
644
+ observation={'step': step},
645
+ brain_id='demo_brain',
646
+ )
647
+ print(f"Brain received: action={resolution.action}, override={resolution.was_override}")
648
+
649
+ def human_input():
650
+ for i in range(3):
651
+ time.sleep(2)
652
+ if hold.current_hold:
653
+ if i % 2 == 0:
654
+ hold.accept()
655
+ else:
656
+ hold.override(7, source="demo_human")
657
+
658
+ brain_thread = threading.Thread(target=brain_loop)
659
+ human_thread = threading.Thread(target=human_input)
660
+
661
+ brain_thread.start()
662
+ human_thread.start()
663
+
664
+ brain_thread.join()
665
+ human_thread.join()
666
+
667
+ print(f"\n{'=' * 60}")
668
+ print("SESSION STATS")
669
+ print(hold.stats)
670
+
671
+
672
+ if __name__ == "__main__":
673
+ _demo_hold()
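Beyond the threaded demo, the mixin path is the lighter integration: a class only needs a forward() that returns action_probs (and optionally value). A hedged sketch with a toy stand-in model; auto_accept keeps the example non-blocking:

    import numpy as np
    from cascade.hold.primitives import Hold, HoldAwareMixin

    class ToyBrain(HoldAwareMixin):
        """Toy model: uniform preference over four actions."""
        def forward(self, inputs):
            return {"action_probs": np.ones(4) / 4, "value": 0.0}

    Hold.get().auto_accept = True      # skip blocking for this sketch
    brain = ToyBrain()
    out = brain.forward_with_hold({"obs": [0, 1, 2]})
    print(out["action"], out["was_override"], out["hold_resolution"]["merkle_root"])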
cascade/hold/session.py ADDED
@@ -0,0 +1,707 @@
1
+ """
2
+ HOLD Session - Arcade-Style Inference Interception
3
+ ══════════════════════════════════════════════════════════
4
+
5
+ "Pause the machine. See what it sees. Choose what it chooses."
6
+
7
+ The arcade layer of HOLD:
8
+ - CausationHold: Session management with history
9
+ - InferenceStep: Single crystallized moment
10
+ - Time travel via state snapshots
11
+ - Speed controls and combo tracking
12
+
13
+ Controls:
14
+ SPACE - Accept model's choice, advance
15
+ 1-9 - Override with alternative
16
+ ←/→ - Step back/forward through history
17
+ +/- - Speed up/slow down auto-advance
18
+ P - Pause/unpause auto-advance
19
+ ESC - Exit hold mode
20
+ """
21
+
22
+ import numpy as np
23
+ import time
24
+ import json
25
+ import hashlib
26
+ import threading
27
+ from dataclasses import dataclass, field
28
+ from typing import Dict, List, Optional, Any, Callable, Tuple
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+ from enum import Enum
32
+
33
+
34
+ class SessionState(Enum):
35
+ """Current state of the hold session."""
36
+ IDLE = "idle" # Not holding anything
37
+ PAUSED = "paused" # Frozen, waiting for input
38
+ STEPPING = "stepping" # Auto-advancing at set speed
39
+ REWINDING = "rewinding" # Going backwards through history
40
+
41
+
42
+ @dataclass
43
+ class InferenceStep:
44
+ """A single crystallized moment of inference."""
45
+ step_id: str
46
+ step_index: int
47
+ timestamp: float
48
+
49
+ # What the model sees
50
+ input_context: Dict[str, Any]
51
+
52
+ # What the model wants to do
53
+ candidates: List[Dict[str, Any]] # [{value, probability, metadata}]
54
+ top_choice: Any
55
+ top_probability: float
56
+
57
+ # Internal state snapshot (for true rewind)
58
+ hidden_state: Optional[np.ndarray] = None
59
+ attention_weights: Optional[Dict[str, float]] = None
60
+
61
+ # What actually happened
62
+ chosen_value: Any = None
63
+ was_override: bool = False
64
+ override_by: str = "model" # "model" or "human"
65
+
66
+ # Provenance
67
+ cascade_hash: Optional[str] = None
68
+
69
+ # Private: full state snapshot for true rewind
70
+ _state_snapshot: Optional[Dict[str, Any]] = field(default=None, repr=False)
71
+
72
+
73
+ @dataclass
74
+ class HoldSession:
75
+ """A complete hold session with history."""
76
+ session_id: str
77
+ agent_id: str
78
+ started_at: float
79
+
80
+ # All steps in order
81
+ steps: List[InferenceStep] = field(default_factory=list)
82
+ current_index: int = 0
83
+
84
+ # Arcade stats
85
+ total_steps: int = 0
86
+ human_overrides: int = 0
87
+ correct_predictions: int = 0 # Human guessed what model would do
88
+ combo: int = 0
89
+ max_combo: int = 0
90
+
91
+ # Speed control (steps per second, 0 = manual only)
92
+ speed_level: int = 0 # 0=manual, 1=slow, 2=medium, 3=fast, 4=ludicrous
93
+ speed_map: Dict[int, float] = field(default_factory=lambda: {
94
+ 0: 0.0, # Manual
95
+ 1: 0.5, # 2 sec per step
96
+ 2: 1.0, # 1 sec per step
97
+ 3: 2.0, # 0.5 sec per step
98
+ 4: 10.0, # 0.1 sec per step (ludicrous speed)
99
+ })
100
+
101
+ # State
102
+ state: SessionState = SessionState.IDLE
103
+
104
+
105
+ @dataclass
106
+ class ArcadeFeedback:
107
+ """Visual/audio feedback cues."""
108
+ message: str
109
+ intensity: float # 0-1, for glow/shake/etc
110
+ sound_cue: str # "accept", "override", "combo", "combo_break", "rewind"
111
+ color: Tuple[int, int, int] = (255, 255, 255)
112
+
113
+
114
+ class CausationHold:
115
+ """
116
+ The arcade-layer hold system. Wraps any inference function.
117
+
118
+ Features:
119
+ - Session management with full history
120
+ - True state restoration for time travel
121
+ - Speed controls (manual to ludicrous)
122
+ - Combo tracking and high scores
123
+
124
+ Usage:
125
+ hold = CausationHold()
126
+
127
+ # Start a session
128
+ hold.begin_session(agent_id="agent_123")
129
+
130
+ # In inference loop:
131
+ for step in inference_steps:
132
+ choice, feedback = hold.capture(
133
+ input_context={"tokens": tokens},
134
+ candidates=[{"value": "A", "probability": 0.8}, ...]
135
+ ) # Pauses here until user input!
136
+
137
+ # Time travel
138
+ hold.rewind(steps=3)
139
+ hold.branch_from(step_index=5, choice_index=2)
140
+
141
+ stats = hold.end_session()
142
+ """
143
+
144
+ def __init__(self, cascade_bus=None):
145
+ """
146
+ Args:
147
+ cascade_bus: Optional CASCADE event bus for provenance
148
+ """
149
+ self.bus = cascade_bus
150
+ self.session: Optional[HoldSession] = None
151
+ self.callbacks: Dict[str, List[Callable]] = {
152
+ 'on_step': [],
153
+ 'on_override': [],
154
+ 'on_combo': [],
155
+ 'on_combo_break': [],
156
+ 'on_rewind': [],
157
+ 'on_state_restore': [],
158
+ }
159
+
160
+ # Thread safety
161
+ self._lock = threading.Lock()
162
+ self._input_event = threading.Event()
163
+ self._user_choice: Optional[Any] = None
164
+
165
+ # High scores (persisted)
166
+ self.high_scores_path = Path("data/hold_high_scores.json")
167
+ self.high_scores = self._load_high_scores()
168
+
169
+ # ========================================================================
170
+ # SESSION MANAGEMENT
171
+ # ========================================================================
172
+
173
+ def begin_session(self, agent_id: str) -> HoldSession:
174
+ """Start a new hold session."""
175
+ session_id = f"hold_{agent_id}_{int(time.time()*1000)}"
176
+
177
+ self.session = HoldSession(
178
+ session_id=session_id,
179
+ agent_id=agent_id,
180
+ started_at=time.time(),
181
+ )
182
+ self.session.state = SessionState.PAUSED
183
+
184
+ self._emit_cascade("hold_session_start", {
185
+ "session_id": session_id,
186
+ "agent_id": agent_id,
187
+ })
188
+
189
+ return self.session
190
+
191
+ def end_session(self) -> Dict[str, Any]:
192
+ """End session and return stats."""
193
+ if not self.session:
194
+ return {}
195
+
196
+ stats = {
197
+ "session_id": self.session.session_id,
198
+ "agent_id": self.session.agent_id,
199
+ "duration": time.time() - self.session.started_at,
200
+ "total_steps": self.session.total_steps,
201
+ "human_overrides": self.session.human_overrides,
202
+ "correct_predictions": self.session.correct_predictions,
203
+ "max_combo": self.session.max_combo,
204
+ "accuracy": (
205
+ self.session.correct_predictions / max(1, self.session.total_steps)
206
+ ),
207
+ }
208
+
209
+ # Check for high score
210
+ self._check_high_score(stats)
211
+
212
+ self._emit_cascade("hold_session_end", stats)
213
+
214
+ self.session = None
215
+ return stats
216
+
217
+ # ========================================================================
218
+ # CAPTURE & ADVANCE - WITH STATE SNAPSHOT FOR TRUE REWIND
219
+ # ========================================================================
220
+
221
+ def capture(
222
+ self,
223
+ input_context: Dict[str, Any],
224
+ candidates: List[Dict[str, Any]],
225
+ hidden_state: Optional[np.ndarray] = None,
226
+ attention: Optional[Dict[str, float]] = None,
227
+ state_snapshot: Optional[Dict[str, Any]] = None,
228
+ ) -> Tuple[Any, ArcadeFeedback]:
229
+ """
230
+ Capture an inference step. BLOCKS until user input or auto-advance.
231
+
232
+ IMPORTANT: Pass state_snapshot for true rewind capability.
233
+ This should be a complete snapshot of the model's internal state
234
+ that can be restored to allow execution from this decision point
235
+ with a different choice.
236
+
237
+ This is NOT prediction - you will ACTUALLY execute the choice and
238
+ see REAL outcomes. If you don't like them, rewind and try again.
239
+
240
+ Args:
241
+ input_context: What the model is looking at
242
+ candidates: List of {value, probability, ...} options
243
+ hidden_state: Optional internal state snapshot (deprecated, use state_snapshot)
244
+ attention: Optional attention weights
245
+ state_snapshot: Complete model state for TRUE rewind capability
246
+
247
+ Returns:
248
+ (chosen_value, feedback) - The value to use and arcade feedback
249
+ """
250
+ if not self.session:
251
+ # No session = passthrough, just return top choice
252
+ return candidates[0]['value'], ArcadeFeedback("", 0, "")
253
+
254
+ # Sort candidates by probability
255
+ candidates = sorted(candidates, key=lambda x: x.get('probability', 0), reverse=True)
256
+ top = candidates[0]
257
+
258
+ # Merge hidden_state into state_snapshot if provided separately
259
+ if state_snapshot is None and hidden_state is not None:
260
+ state_snapshot = {'hidden_state': hidden_state}
261
+ elif state_snapshot is not None and hidden_state is not None:
262
+ state_snapshot['hidden_state'] = hidden_state
263
+
264
+ # Create step - this is a CHECKPOINT for true rewind
265
+ step = InferenceStep(
266
+ step_id=f"step_{self.session.total_steps}",
267
+ step_index=self.session.total_steps,
268
+ timestamp=time.time(),
269
+ input_context=input_context,
270
+ candidates=candidates,
271
+ top_choice=top['value'],
272
+ top_probability=top.get('probability', 1.0),
273
+ hidden_state=hidden_state,
274
+ attention_weights=attention,
275
+ )
276
+
277
+ # Store state snapshot for TRUE rewind (not just history navigation)
278
+ if state_snapshot is not None:
279
+ step._state_snapshot = state_snapshot
280
+
281
+ # Compute merkle hash for provenance
282
+ step.cascade_hash = self._compute_step_hash(step)
283
+
284
+ # Add to history
285
+ with self._lock:
286
+ self.session.steps.append(step)
287
+ self.session.current_index = len(self.session.steps) - 1
288
+ self.session.total_steps += 1
289
+
290
+ # Emit step event
291
+ self._emit_callback('on_step', step)
292
+ self._emit_cascade("hold_step", {
293
+ "step_index": step.step_index,
294
+ "top_choice": str(top['value']),
295
+ "top_prob": top.get('probability', 1.0),
296
+ "num_candidates": len(candidates),
297
+ "has_snapshot": state_snapshot is not None,
298
+ "merkle": step.cascade_hash,
299
+ })
300
+
301
+ # Wait for input
302
+ choice, feedback = self._wait_for_input(step)
303
+
304
+ # Record what happened
305
+ step.chosen_value = choice
306
+ step.was_override = (choice != top['value'])
307
+ step.override_by = "human" if step.was_override else "model"
308
+
309
+ if step.was_override:
310
+ self.session.human_overrides += 1
311
+ self._emit_callback('on_override', step, choice)
312
+
313
+ return choice, feedback
314
+
315
+ def _wait_for_input(self, step: InferenceStep) -> Tuple[Any, ArcadeFeedback]:
316
+ """Wait for user input or auto-advance timer."""
317
+
318
+ # Manual mode = wait indefinitely
319
+ if self.session.speed_level == 0:
320
+ self._input_event.clear()
321
+ self._input_event.wait() # Blocks until input()
322
+
323
+ choice = self._user_choice
324
+ self._user_choice = None
325
+
326
+ else:
327
+ # Auto-advance mode
328
+ speed = self.session.speed_map[self.session.speed_level]
329
+ wait_time = 1.0 / speed if speed > 0 else float('inf')
330
+
331
+ self._input_event.clear()
332
+ got_input = self._input_event.wait(timeout=wait_time)
333
+
334
+ if got_input and self._user_choice is not None:
335
+ choice = self._user_choice
336
+ self._user_choice = None
337
+ else:
338
+ # Auto-accepted
339
+ choice = step.top_choice
340
+
341
+ # Generate feedback
342
+ return choice, self._generate_feedback(step, choice)
343
+
344
+ def input(self, choice: Any):
345
+ """
346
+ Provide user input. Call from UI thread.
347
+
348
+ Args:
349
+ choice: The value to use (or index into candidates)
350
+ """
351
+ if not self.session:
352
+ return
353
+
354
+ current_step = self.session.steps[self.session.current_index]
355
+
356
+ # Handle index input (1-9 keys)
357
+ if isinstance(choice, int) and 0 <= choice < len(current_step.candidates):
358
+ choice = current_step.candidates[choice]['value']
359
+
360
+ self._user_choice = choice
361
+ self._input_event.set()
362
+
363
+ def accept(self):
364
+ """Accept model's top choice (SPACE key)."""
365
+ if not self.session or not self.session.steps:
366
+ return
367
+
368
+ current = self.session.steps[self.session.current_index]
369
+ self.input(current.top_choice)
370
+
371
+ def override(self, index: int):
372
+ """Override with candidate at index (1-9 keys)."""
373
+ self.input(index)
374
+
375
+ # ========================================================================
376
+ # NAVIGATION (TIME TRAVEL) - TRUE STATE RESTORATION
377
+ # ========================================================================
378
+
379
+ def rewind(self, steps: int = 1, restore_state: bool = True) -> Optional[InferenceStep]:
380
+ """
381
+ Go back in history with optional state restoration.
382
+
383
+ This is NOT simulation - we actually restore the model's internal state
384
+ to the snapshot taken at that decision point. From there, you can
385
+ execute a different branch and see REAL outcomes.
386
+
387
+ Args:
388
+ steps: Number of steps to go back
389
+ restore_state: If True, actually restore hidden_state to model
390
+
391
+ Returns:
392
+ The step we rewound to
393
+ """
394
+ if not self.session:
395
+ return None
396
+
397
+ with self._lock:
398
+ new_index = max(0, self.session.current_index - steps)
399
+ if new_index != self.session.current_index:
400
+ self.session.current_index = new_index
401
+ self.session.state = SessionState.REWINDING
402
+
403
+ step = self.session.steps[new_index]
404
+
405
+ # TRUE STATE RESTORATION
406
+ if restore_state and step.hidden_state is not None:
407
+ self._restore_state(step)
408
+
409
+ self._emit_callback('on_rewind', step, -steps)
410
+
411
+ return step
412
+ return None
413
+
414
+ def _restore_state(self, step: InferenceStep):
415
+ """
416
+ Restore model state from a snapshot.
417
+
418
+ This is the key that makes execution + rewind possible.
419
+ The model's internal state is set back to exactly what it was
420
+ at this decision point, allowing you to branch differently.
421
+ """
422
+ if step.hidden_state is None and getattr(step, "_state_snapshot", None) is None:
423
+ return
424
+
425
+ # Emit state restoration event - hooked components can restore themselves
426
+ self._emit_callback('on_state_restore', step)
427
+ self._emit_cascade("state_restored", {
428
+ "step_index": step.step_index,
429
+ "merkle": step.cascade_hash,
430
+ "had_hidden_state": step.hidden_state is not None,
431
+ "had_snapshot": getattr(step, "_state_snapshot", None) is not None,
432
+ })
433
+
434
+ def branch_from(self, step_index: int, choice_index: int) -> Optional[InferenceStep]:
435
+ """
436
+ Rewind to a step and immediately choose a different branch.
437
+
438
+ This is the core gameplay loop:
439
+ 1. Rewind to decision point
440
+ 2. Choose different option
441
+ 3. Execute and see what happens
442
+ 4. Repeat until satisfied
443
+
444
+ Args:
445
+ step_index: Which decision point to branch from
446
+ choice_index: Which candidate to choose (0 = model's choice)
447
+
448
+ Returns:
449
+ The step after branching (with state restored)
450
+ """
451
+ step = self.jump_to(step_index)
452
+ if step is None:
453
+ return None
454
+
455
+ # Restore state
456
+ self._restore_state(step)
457
+
458
+ # Set up the override
459
+ if choice_index < len(step.candidates):
460
+ self.override(choice_index)
461
+ else:
462
+ self.accept()
463
+
464
+ return step
465
+
466
+ def forward(self, steps: int = 1) -> Optional[InferenceStep]:
467
+ """Go forward in history (if we've rewound)."""
468
+ if not self.session:
469
+ return None
470
+
471
+ with self._lock:
472
+ max_index = len(self.session.steps) - 1
473
+ new_index = min(max_index, self.session.current_index + steps)
474
+ if new_index != self.session.current_index:
475
+ self.session.current_index = new_index
476
+
477
+ step = self.session.steps[new_index]
478
+ self._emit_callback('on_rewind', step, steps)
479
+
480
+ return step
481
+ return None
482
+
483
+ def jump_to(self, index: int) -> Optional[InferenceStep]:
484
+ """Jump to specific step."""
485
+ if not self.session:
486
+ return None
487
+
488
+ with self._lock:
489
+ index = max(0, min(index, len(self.session.steps) - 1))
490
+ self.session.current_index = index
491
+ return self.session.steps[index]
492
+
493
+ # ========================================================================
494
+ # SPEED CONTROL
495
+ # ========================================================================
496
+
497
+ def speed_up(self):
498
+ """Increase auto-advance speed."""
499
+ if self.session:
500
+ self.session.speed_level = min(4, self.session.speed_level + 1)
501
+
502
+ def speed_down(self):
503
+ """Decrease auto-advance speed."""
504
+ if self.session:
505
+ self.session.speed_level = max(0, self.session.speed_level - 1)
506
+
507
+ def set_speed(self, level: int):
508
+ """Set speed level directly (0-4)."""
509
+ if self.session:
510
+ self.session.speed_level = max(0, min(4, level))
511
+
512
+ def pause(self):
513
+ """Pause auto-advance."""
514
+ if self.session:
515
+ self.session.state = SessionState.PAUSED
516
+
517
+ def unpause(self):
518
+ """Resume auto-advance."""
519
+ if self.session:
520
+ self.session.state = SessionState.STEPPING
521
+
522
+ # ========================================================================
523
+ # PROVENANCE HASHING
524
+ # ========================================================================
525
+
526
+ def _compute_step_hash(self, step: InferenceStep) -> str:
527
+ """
528
+ Compute merkle hash for a step.
529
+
530
+ This hash uniquely identifies this decision point and allows
531
+ verification that rewind is restoring to the exact right state.
532
+ """
533
+ # Include parent hash for chain integrity
534
+ parent_hash = ""
535
+ if self.session and len(self.session.steps) > 0:
536
+ prev_step = self.session.steps[-1]
537
+ parent_hash = prev_step.cascade_hash or ""
538
+
539
+ content = json.dumps({
540
+ 'step_index': step.step_index,
541
+ 'timestamp': step.timestamp,
542
+ 'top_choice': str(step.top_choice),
543
+ 'top_prob': step.top_probability,
544
+ 'num_candidates': len(step.candidates),
545
+ 'parent_hash': parent_hash,
546
+ }, sort_keys=True)
547
+
548
+ return hashlib.sha256(content.encode()).hexdigest()[:16]
549
+
550
+ # ========================================================================
551
+ # ARCADE FEEDBACK
552
+ # ========================================================================
553
+
554
+ def _generate_feedback(self, step: InferenceStep, choice: Any) -> ArcadeFeedback:
555
+ """Generate arcade-style feedback for a step."""
556
+
557
+ is_override = (choice != step.top_choice)
558
+
559
+ if is_override:
560
+ # Combo break!
561
+ if self.session.combo > 0:
562
+ self._emit_callback('on_combo_break', self.session.combo)
563
+
564
+ self.session.combo = 0
565
+
566
+ return ArcadeFeedback(
567
+ message="OVERRIDE",
568
+ intensity=0.8,
569
+ sound_cue="override",
570
+ color=(255, 165, 0), # Orange
571
+ )
572
+
573
+ else:
574
+ # Accepted model choice
575
+ self.session.combo += 1
576
+ self.session.max_combo = max(self.session.max_combo, self.session.combo)
577
+
578
+ # Combo milestones
579
+ if self.session.combo in [10, 25, 50, 100]:
580
+ self._emit_callback('on_combo', self.session.combo)
581
+ return ArcadeFeedback(
582
+ message=f"COMBO x{self.session.combo}!",
583
+ intensity=1.0,
584
+ sound_cue="combo",
585
+ color=(0, 255, 255), # Cyan
586
+ )
587
+
588
+ # Regular accept
589
+ return ArcadeFeedback(
590
+ message="",
591
+ intensity=0.3 + min(0.5, self.session.combo * 0.02),
592
+ sound_cue="accept",
593
+ color=(0, 255, 0), # Green
594
+ )
595
+
596
+ # ========================================================================
597
+ # CALLBACKS
598
+ # ========================================================================
599
+
600
+ def on(self, event: str, callback: Callable):
601
+ """Register callback for events."""
602
+ if event in self.callbacks:
603
+ self.callbacks[event].append(callback)
604
+
605
+ def _emit_callback(self, event: str, *args):
606
+ """Emit event to callbacks."""
607
+ for cb in self.callbacks.get(event, []):
608
+ try:
609
+ cb(*args)
610
+ except Exception as e:
611
+ print(f"Callback error: {e}")
612
+
613
+ # ========================================================================
614
+ # CASCADE PROVENANCE
615
+ # ========================================================================
616
+
617
+ def _emit_cascade(self, event_type: str, data: Dict[str, Any]):
618
+ """Emit event to CASCADE bus if available."""
619
+ if self.bus:
620
+ try:
621
+ self.bus.emit(event_type, {
622
+ **data,
623
+ "source": "causation_hold",
624
+ "timestamp": time.time(),
625
+ })
626
+ except Exception:
627
+ pass
628
+
629
+ # ========================================================================
630
+ # HIGH SCORES
631
+ # ========================================================================
632
+
633
+ def _load_high_scores(self) -> Dict[str, Any]:
634
+ """Load high scores from disk."""
635
+ if self.high_scores_path.exists():
636
+ try:
637
+ return json.loads(self.high_scores_path.read_text())
638
+ except Exception:
639
+ pass
640
+ return {"max_combo": 0, "best_accuracy": 0.0, "total_sessions": 0}
641
+
642
+ def _save_high_scores(self):
643
+ """Save high scores to disk."""
644
+ self.high_scores_path.parent.mkdir(parents=True, exist_ok=True)
645
+ self.high_scores_path.write_text(json.dumps(self.high_scores, indent=2))
646
+
647
+ def _check_high_score(self, stats: Dict[str, Any]):
648
+ """Check and update high scores."""
649
+ updated = False
650
+
651
+ if stats['max_combo'] > self.high_scores['max_combo']:
652
+ self.high_scores['max_combo'] = stats['max_combo']
653
+ updated = True
654
+
655
+ if stats['accuracy'] > self.high_scores['best_accuracy']:
656
+ self.high_scores['best_accuracy'] = stats['accuracy']
657
+ updated = True
658
+
659
+ self.high_scores['total_sessions'] += 1
+ updated = True  # persist the session count even when no record was broken
660
+
661
+ if updated:
662
+ self._save_high_scores()
663
+
664
+ # ========================================================================
665
+ # DECORATOR FOR EASY WRAPPING
666
+ # ========================================================================
667
+
668
+ def intercept(self, granularity: str = "step"):
669
+ """
670
+ Decorator to intercept a function's inference.
671
+
672
+ Args:
673
+ granularity: "step" (each call) or "token" (if function yields)
674
+ """
675
+ def decorator(func):
676
+ def wrapper(*args, **kwargs):
677
+ # If no session, passthrough
678
+ if not self.session:
679
+ return func(*args, **kwargs)
680
+
681
+ # Capture the input
682
+ input_context = {
683
+ "args": str(args)[:200],
684
+ "kwargs": {k: str(v)[:100] for k, v in kwargs.items()},
685
+ }
686
+
687
+ # Get result
688
+ result = func(*args, **kwargs)
689
+
690
+ # Create candidates from result
691
+ if isinstance(result, np.ndarray):
692
+ # For embeddings, show top dimensions
693
+ top_dims = np.argsort(np.abs(result.flatten()))[-5:][::-1]
694
+ candidates = [
695
+ {"value": f"dim_{d}", "probability": float(np.abs(result.flatten()[d]))}
696
+ for d in top_dims
697
+ ]
698
+ else:
699
+ candidates = [{"value": result, "probability": 1.0}]
700
+
701
+ # Capture (may block)
702
+ choice, feedback = self.capture(input_context, candidates)
703
+
704
+ return result
705
+
706
+ return wrapper
707
+ return decorator
cascade/identity.py ADDED
@@ -0,0 +1,715 @@
1
+ """
2
+ CASCADE Model Identity Layer
3
+
4
+ Canonical identification for any AI model variant:
5
+ - Base models (meta-llama/Llama-3-8B)
6
+ - Quantizations (Q4_K_M, Q8_0, AWQ, GPTQ)
7
+ - Fine-tunes (LoRA, full, RLHF)
8
+ - API endpoints (behavioral fingerprinting)
9
+
10
+ Every unique model gets a node in the lattice.
11
+ Every observation links to its model's node.
12
+ The lattice becomes the collective memory of AI behavior.
13
+
14
+ "Same name, different model, different behavior."
15
+ """
16
+
17
+ import hashlib
18
+ import json
19
+ import time
20
+ from pathlib import Path
21
+ from dataclasses import dataclass, field, asdict
22
+ from typing import Optional, List, Dict, Any
23
+ from enum import Enum
24
+
25
+
26
+ class ModelFormat(Enum):
27
+ """Model weight formats."""
28
+ SAFETENSORS = "safetensors"
29
+ PYTORCH = "pytorch"
30
+ GGUF = "gguf"
31
+ GGML = "ggml"
32
+ ONNX = "onnx"
33
+ TENSORRT = "tensorrt"
34
+ OPENVINO = "openvino"
35
+ COREML = "coreml"
36
+ API = "api" # No weights, just endpoint
37
+ UNKNOWN = "unknown"
38
+
39
+
40
+ class QuantizationType(Enum):
41
+ """Quantization methods."""
42
+ NONE = "none" # FP32/FP16/BF16
43
+ GGUF_Q4_0 = "Q4_0"
44
+ GGUF_Q4_K_M = "Q4_K_M"
45
+ GGUF_Q4_K_S = "Q4_K_S"
46
+ GGUF_Q5_0 = "Q5_0"
47
+ GGUF_Q5_K_M = "Q5_K_M"
48
+ GGUF_Q5_K_S = "Q5_K_S"
49
+ GGUF_Q6_K = "Q6_K"
50
+ GGUF_Q8_0 = "Q8_0"
51
+ GPTQ_4BIT = "GPTQ-4bit"
52
+ GPTQ_8BIT = "GPTQ-8bit"
53
+ AWQ_4BIT = "AWQ-4bit"
54
+ BITSANDBYTES_4BIT = "bnb-4bit"
55
+ BITSANDBYTES_8BIT = "bnb-8bit"
56
+ INT8 = "INT8"
57
+ INT4 = "INT4"
58
+ CUSTOM = "custom"
59
+
60
+
61
+ class FineTuneType(Enum):
62
+ """Fine-tuning methods."""
63
+ NONE = "none"
64
+ LORA = "lora"
65
+ QLORA = "qlora"
66
+ FULL = "full"
67
+ RLHF = "rlhf"
68
+ DPO = "dpo"
69
+ ORPO = "orpo"
70
+ CUSTOM = "custom"
71
+
72
+
73
+ @dataclass
74
+ class ModelVariant:
75
+ """Describes how a model differs from its base."""
76
+ quantization: str = "none"
77
+ format: str = "unknown"
78
+ bits: Optional[int] = None
79
+ provider: Optional[str] = None # Who made this variant (e.g., "TheBloke")
80
+
81
+ def to_dict(self) -> dict:
82
+ return asdict(self)
83
+
84
+
85
+ @dataclass
86
+ class FineTuneInfo:
87
+ """Describes fine-tuning applied to a model."""
88
+ type: str = "none"
89
+ adapter_id: Optional[str] = None # HuggingFace adapter ID
90
+ adapter_hash: Optional[str] = None # Hash of adapter weights
91
+ base_model_root: Optional[str] = None # Merkle root of base model identity
92
+ dataset_id: Optional[str] = None # Training dataset
93
+
94
+ def to_dict(self) -> dict:
95
+ return asdict(self)
96
+
97
+
98
+ @dataclass
99
+ class BehavioralFingerprint:
100
+ """
101
+ Fingerprint for API models where weights are unavailable.
102
+ Generated by running standard probes and hashing responses.
103
+ """
104
+ probe_responses: List[Dict[str, Any]] = field(default_factory=list)
105
+ probe_hash: Optional[str] = None
106
+ fingerprint_version: int = 1
107
+ generated_at: Optional[float] = None
108
+
109
+ def to_dict(self) -> dict:
110
+ return asdict(self)
111
+
112
+
113
+ @dataclass
114
+ class ModelIdentity:
115
+ """
116
+ Canonical identity for any AI model variant.
117
+
118
+ This is the node that goes in the lattice.
119
+ All observations of this model link to this identity.
120
+ """
121
+ # === Core Identity ===
122
+ base_model: str # HuggingFace ID or canonical name
123
+ model_id: str # Full unique identifier (computed)
124
+
125
+ # === Variant Info ===
126
+ variant: ModelVariant = field(default_factory=ModelVariant)
127
+ fine_tune: FineTuneInfo = field(default_factory=FineTuneInfo)
128
+
129
+ # === Cryptographic Identity ===
130
+ weight_hash: Optional[str] = None # SHA256 of weights (if available)
131
+ config_hash: Optional[str] = None # SHA256 of model config
132
+ tokenizer_hash: Optional[str] = None # SHA256 of tokenizer
133
+
134
+ # === Behavioral Fingerprint (for APIs) ===
135
+ behavioral_fingerprint: Optional[BehavioralFingerprint] = None
136
+
137
+ # === Source Info ===
138
+ source_url: Optional[str] = None
139
+ source_revision: Optional[str] = None # Git commit/tag
140
+ downloaded_at: Optional[float] = None
141
+
142
+ # === Lattice Info ===
143
+ parent_root: Optional[str] = None # Genesis or base model's merkle root
144
+ merkle_root: Optional[str] = None # This identity's merkle root
145
+ created_at: float = field(default_factory=time.time)
146
+
147
+ # === Metadata ===
148
+ parameters: Optional[int] = None # Parameter count
149
+ context_length: Optional[int] = None
150
+ architecture: Optional[str] = None # "llama", "mistral", "gpt", etc.
151
+ license: Optional[str] = None
152
+
153
+ def __post_init__(self):
154
+ """Compute derived fields."""
155
+ if not self.model_id:
156
+ self.model_id = self.compute_model_id()
157
+
158
+ def compute_model_id(self) -> str:
159
+ """
160
+ Compute canonical model ID from components.
161
+ Format: base_model::variant_spec::fine_tune_spec
162
+ """
163
+ parts = [self.base_model]
164
+
165
+ # Add variant spec
166
+ if self.variant.quantization != "none":
167
+ parts.append(f"q:{self.variant.quantization}")
168
+ if self.variant.format != "unknown":
169
+ parts.append(f"fmt:{self.variant.format}")
170
+ if self.variant.provider:
171
+ parts.append(f"by:{self.variant.provider}")
172
+
173
+ # Add fine-tune spec
174
+ if self.fine_tune.type != "none":
175
+ parts.append(f"ft:{self.fine_tune.type}")
176
+ if self.fine_tune.adapter_id:
177
+ parts.append(f"adapter:{self.fine_tune.adapter_id}")
178
+
179
+ return "::".join(parts)
180
+
181
+ def compute_merkle_root(self) -> str:
182
+ """Compute merkle root of this identity."""
183
+ # Create canonical representation
184
+ canonical = {
185
+ "base_model": self.base_model,
186
+ "model_id": self.model_id,
187
+ "variant": self.variant.to_dict(),
188
+ "fine_tune": self.fine_tune.to_dict(),
189
+ "weight_hash": self.weight_hash,
190
+ "config_hash": self.config_hash,
191
+ "tokenizer_hash": self.tokenizer_hash,
192
+ "parent_root": self.parent_root,
193
+ "created_at": self.created_at,
194
+ }
195
+
196
+ # Add behavioral fingerprint if present
197
+ if self.behavioral_fingerprint:
198
+ canonical["behavioral_fingerprint"] = self.behavioral_fingerprint.probe_hash
199
+
200
+ # Hash it
201
+ canonical_json = json.dumps(canonical, sort_keys=True)
202
+ self.merkle_root = hashlib.sha256(canonical_json.encode()).hexdigest()[:16]
203
+ return self.merkle_root
204
+
205
+ def finalize(self, parent_root: str = None):
206
+ """Finalize identity and compute merkle root."""
207
+ if parent_root:
208
+ self.parent_root = parent_root
209
+ self.merkle_root = self.compute_merkle_root()
210
+ return self
211
+
212
+ def to_dict(self) -> dict:
213
+ """Convert to dictionary for serialization."""
214
+ return {
215
+ "base_model": self.base_model,
216
+ "model_id": self.model_id,
217
+ "variant": self.variant.to_dict(),
218
+ "fine_tune": self.fine_tune.to_dict(),
219
+ "weight_hash": self.weight_hash,
220
+ "config_hash": self.config_hash,
221
+ "tokenizer_hash": self.tokenizer_hash,
222
+ "behavioral_fingerprint": self.behavioral_fingerprint.to_dict() if self.behavioral_fingerprint else None,
223
+ "source_url": self.source_url,
224
+ "source_revision": self.source_revision,
225
+ "downloaded_at": self.downloaded_at,
226
+ "parent_root": self.parent_root,
227
+ "merkle_root": self.merkle_root,
228
+ "created_at": self.created_at,
229
+ "parameters": self.parameters,
230
+ "context_length": self.context_length,
231
+ "architecture": self.architecture,
232
+ "license": self.license,
233
+ }
234
+
235
+ def to_chain_format(self) -> dict:
236
+ """Convert to provenance chain format for lattice storage."""
237
+ return {
238
+ "session_id": f"model_identity_{self.merkle_root}",
239
+ "model_id": self.model_id,
240
+ "model_hash": self.weight_hash or (self.behavioral_fingerprint.probe_hash if self.behavioral_fingerprint else "unknown"),
241
+ "input_hash": self.base_model,
242
+ "output_hash": None,
243
+ "records": {
244
+ "identity": {
245
+ "layer_name": "identity",
246
+ "layer_idx": 0,
247
+ "state_hash": self.merkle_root,
248
+ "parent_hashes": [self.parent_root] if self.parent_root else [],
249
+ "params_hash": self.config_hash,
250
+ "shape": [self.parameters] if self.parameters else [0],
251
+ "dtype": "model_identity",
252
+ "stats": self.to_dict(),
253
+ "execution_order": 0,
254
+ "timestamp": self.created_at,
255
+ }
256
+ },
257
+ "external_roots": [self.parent_root] if self.parent_root else [],
258
+ "merkle_root": self.merkle_root,
259
+ "created_at": self.created_at,
260
+ "finalized": True,
261
+ }
262
+
263
+
264
+ # =============================================================================
265
+ # STANDARD PROBES FOR BEHAVIORAL FINGERPRINTING
266
+ # =============================================================================
267
+
268
+ STANDARD_PROBES_V1 = [
269
+ # Deterministic probes (temperature=0)
270
+ {
271
+ "id": "math_simple",
272
+ "prompt": "What is 2+2? Answer with just the number.",
273
+ "params": {"temperature": 0, "max_tokens": 10},
274
+ },
275
+ {
276
+ "id": "capital_france",
277
+ "prompt": "Complete this sentence with one word: The capital of France is",
278
+ "params": {"temperature": 0, "max_tokens": 10},
279
+ },
280
+ {
281
+ "id": "translate_hello",
282
+ "prompt": "Translate to French: Hello",
283
+ "params": {"temperature": 0, "max_tokens": 20},
284
+ },
285
+ {
286
+ "id": "color_sky",
287
+ "prompt": "What color is the sky on a clear day? One word answer:",
288
+ "params": {"temperature": 0, "max_tokens": 10},
289
+ },
290
+
291
+ # Capability probes
292
+ {
293
+ "id": "code_simple",
294
+ "prompt": "Write a Python function that adds two numbers. Just the function, no explanation.",
295
+ "params": {"temperature": 0, "max_tokens": 100},
296
+ },
297
+ {
298
+ "id": "reasoning",
299
+ "prompt": "If all cats are mammals and all mammals are animals, are all cats animals? Answer yes or no.",
300
+ "params": {"temperature": 0, "max_tokens": 10},
301
+ },
302
+
303
+ # System prompt probe
304
+ {
305
+ "id": "system_role",
306
+ "prompt": "You are a helpful pirate. Say hello.",
307
+ "params": {"temperature": 0, "max_tokens": 50},
308
+ "system": "You are a helpful pirate who speaks like a pirate.",
309
+ },
310
+
311
+ # Edge cases
312
+ {
313
+ "id": "empty",
314
+ "prompt": "",
315
+ "params": {"temperature": 0, "max_tokens": 50},
316
+ },
317
+ {
318
+ "id": "repetition",
319
+ "prompt": "Repeat after me exactly: The quick brown fox",
320
+ "params": {"temperature": 0, "max_tokens": 20},
321
+ },
322
+ ]
323
+
324
+
325
+ def generate_behavioral_fingerprint(
326
+ call_fn, # Function that takes (prompt, params) and returns response
327
+ probes: List[dict] = None,
328
+ version: int = 1,
329
+ ) -> BehavioralFingerprint:
330
+ """
331
+ Generate behavioral fingerprint by running standard probes.
332
+
333
+ Args:
334
+ call_fn: Function to call the model. Signature: (prompt, params) -> str
335
+ probes: List of probe configs. Defaults to STANDARD_PROBES_V1.
336
+ version: Fingerprint version number.
337
+
338
+ Returns:
339
+ BehavioralFingerprint with hashed responses.
340
+ """
341
+ if probes is None:
342
+ probes = STANDARD_PROBES_V1
343
+
344
+ responses = []
345
+ for probe in probes:
346
+ try:
347
+ response = call_fn(probe["prompt"], probe.get("params", {}))
348
+ response_hash = hashlib.sha256(str(response).encode()).hexdigest()[:16]
349
+ except Exception as e:
350
+ response_hash = f"error:{type(e).__name__}"
351
+
352
+ responses.append({
353
+ "probe_id": probe["id"],
354
+ "prompt_hash": hashlib.sha256(probe["prompt"].encode()).hexdigest()[:16],
355
+ "response_hash": response_hash,
356
+ })
357
+
358
+ # Compute overall fingerprint hash
359
+ fingerprint_data = json.dumps(responses, sort_keys=True)
360
+ probe_hash = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]
361
+
362
+ return BehavioralFingerprint(
363
+ probe_responses=responses,
364
+ probe_hash=probe_hash,
365
+ fingerprint_version=version,
366
+ generated_at=time.time(),
367
+ )
368
+
369
+
370
+ # =============================================================================
371
+ # MODEL IDENTITY FACTORY
372
+ # =============================================================================
373
+
374
+ def detect_quantization(model_path: str) -> str:
375
+ """Detect quantization from model path or name."""
376
+ path_lower = model_path.lower()
377
+
378
+ # GGUF quantizations
379
+ for q in ["q4_k_m", "q4_k_s", "q4_0", "q5_k_m", "q5_k_s", "q5_0", "q6_k", "q8_0"]:
380
+ if q in path_lower:
381
+ return q.upper()
382
+
383
+ # GPTQ
384
+ if "gptq" in path_lower:
385
+ if "4bit" in path_lower or "-4b" in path_lower:
386
+ return "GPTQ-4bit"
387
+ elif "8bit" in path_lower or "-8b" in path_lower:
388
+ return "GPTQ-8bit"
389
+ return "GPTQ"
390
+
391
+ # AWQ
392
+ if "awq" in path_lower:
393
+ return "AWQ-4bit"
394
+
395
+ # BitsAndBytes
396
+ if "bnb" in path_lower or "bitsandbytes" in path_lower:
397
+ if "4bit" in path_lower:
398
+ return "bnb-4bit"
399
+ return "bnb-8bit"
400
+
401
+ return "none"
402
+
403
+
404
+ def detect_format(model_path: str) -> str:
405
+ """Detect model format from path."""
406
+ path_lower = model_path.lower()
407
+
408
+ if ".gguf" in path_lower:
409
+ return "gguf"
410
+ elif ".ggml" in path_lower:
411
+ return "ggml"
412
+ elif ".safetensors" in path_lower or "safetensors" in path_lower:
413
+ return "safetensors"
414
+ elif ".onnx" in path_lower:
415
+ return "onnx"
416
+ elif ".bin" in path_lower or "pytorch" in path_lower:
417
+ return "pytorch"
418
+ elif "api" in path_lower or "http" in path_lower:
419
+ return "api"
420
+
421
+ return "unknown"
422
+
423
+
424
+ def detect_provider(model_path: str) -> Optional[str]:
425
+ """Detect who made this variant."""
426
+ path_lower = model_path.lower()
427
+
428
+ providers = [
429
+ "thebloke",
430
+ "unsloth",
431
+ "mlx-community",
432
+ "bartowski",
433
+ "mradermacher",
434
+ "turboderp",
435
+ ]
436
+
437
+ for provider in providers:
438
+ if provider in path_lower:
439
+ return provider
440
+
441
+ return None
442
+
443
+
444
+ def create_model_identity(
445
+ model_id: str,
446
+ weights_path: Optional[Path] = None,
447
+ config: Optional[dict] = None,
448
+ parent_root: Optional[str] = None,
449
+ behavioral_fingerprint: Optional[BehavioralFingerprint] = None,
450
+ **kwargs,
451
+ ) -> ModelIdentity:
452
+ """
453
+ Factory function to create ModelIdentity from various inputs.
454
+
455
+ Args:
456
+ model_id: HuggingFace model ID or local path
457
+ weights_path: Path to weights file (for hashing)
458
+ config: Model config dict
459
+ parent_root: Merkle root of parent (genesis or base model)
460
+ behavioral_fingerprint: Pre-computed fingerprint for APIs
461
+ **kwargs: Additional fields (parameters, context_length, etc.)
462
+
463
+ Returns:
464
+ Finalized ModelIdentity ready for lattice
465
+ """
466
+ # Parse base model from full ID
467
+ # e.g., "TheBloke/Llama-3-8B-GGUF" -> base is "meta-llama/Llama-3-8B"
468
+ base_model = kwargs.pop("base_model", None)
469
+ if not base_model:
470
+ # Try to extract base from model_id
471
+ parts = model_id.split("/")
472
+ if len(parts) >= 2:
473
+ name = parts[-1]
474
+ # Remove common suffixes
475
+ for suffix in ["-GGUF", "-GPTQ", "-AWQ", "-fp16", "-bf16", "-GGML"]:
476
+ name = name.replace(suffix, "")
477
+ base_model = name
478
+ else:
479
+ base_model = model_id
480
+
481
+ # Detect variant info
482
+ quantization = detect_quantization(model_id)
483
+ format_type = detect_format(model_id)
484
+ provider = detect_provider(model_id)
485
+
486
+ # Extract bits from quantization
487
+ bits = None
488
+ if "4" in quantization:
489
+ bits = 4
490
+ elif "5" in quantization:
491
+ bits = 5
492
+ elif "6" in quantization:
493
+ bits = 6
494
+ elif "8" in quantization:
495
+ bits = 8
496
+
497
+ variant = ModelVariant(
498
+ quantization=quantization,
499
+ format=format_type,
500
+ bits=bits,
501
+ provider=provider,
502
+ )
503
+
504
+ # Hash weights if available
505
+ weight_hash = None
506
+ if weights_path and Path(weights_path).exists():
507
+ # For large files, hash first and last 1MB + size
508
+ path = Path(weights_path)
509
+ size = path.stat().st_size
510
+ hasher = hashlib.sha256()
511
+ hasher.update(str(size).encode())
512
+
513
+ with open(path, "rb") as f:
514
+ # First 1MB
515
+ hasher.update(f.read(1024 * 1024))
516
+ # Last 1MB
517
+ if size > 2 * 1024 * 1024:
518
+ f.seek(-1024 * 1024, 2)
519
+ hasher.update(f.read())
520
+
521
+ weight_hash = hasher.hexdigest()[:16]
522
+
523
+ # Hash config if available
524
+ config_hash = None
525
+ if config:
526
+ config_json = json.dumps(config, sort_keys=True)
527
+ config_hash = hashlib.sha256(config_json.encode()).hexdigest()[:16]
528
+
529
+ # Create identity
530
+ identity = ModelIdentity(
531
+ base_model=base_model,
532
+ model_id="", # Will be computed
533
+ variant=variant,
534
+ fine_tune=FineTuneInfo(),
535
+ weight_hash=weight_hash,
536
+ config_hash=config_hash,
537
+ behavioral_fingerprint=behavioral_fingerprint,
538
+ parent_root=parent_root,
539
+ **kwargs,
540
+ )
541
+
542
+ # Compute model_id and merkle_root
543
+ identity.model_id = identity.compute_model_id()
544
+ identity.finalize(parent_root)
545
+
546
+ return identity
547
+
548
+
549
+ # =============================================================================
550
+ # MODEL REGISTRY (Lattice Integration)
551
+ # =============================================================================
552
+
553
+ class ModelRegistry:
554
+ """
555
+ Registry of model identities in the lattice.
556
+
557
+ Provides:
558
+ - Get or create model identity
559
+ - Link observations to model identities
560
+ - Query models by various criteria
561
+ """
562
+
563
+ def __init__(self, lattice_dir: Path = None, genesis_root: str = None):
564
+ self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
565
+ self.models_dir = self.lattice_dir / "models"
566
+ self.models_dir.mkdir(parents=True, exist_ok=True)
567
+
568
+ # Genesis root (models link to this if no base model)
569
+ self.genesis_root = genesis_root or "89f940c1a4b7aa65"
570
+
571
+ # Cache of loaded identities
572
+ self._cache: Dict[str, ModelIdentity] = {}
573
+ self._load_all()
574
+
575
+ def _load_all(self):
576
+ """Load all model identities from disk."""
577
+ for json_file in self.models_dir.glob("*.json"):
578
+ try:
579
+ data = json.loads(json_file.read_text())
580
+ identity = self._dict_to_identity(data)
581
+ self._cache[identity.merkle_root] = identity
582
+ except Exception as e:
583
+ print(f"Error loading {json_file}: {e}")
584
+
585
+ def _dict_to_identity(self, data: dict) -> ModelIdentity:
586
+ """Convert dict back to ModelIdentity."""
587
+ variant_data = data.get("variant", {})
588
+ fine_tune_data = data.get("fine_tune", {})
589
+ fingerprint_data = data.get("behavioral_fingerprint")
590
+
591
+ return ModelIdentity(
592
+ base_model=data["base_model"],
593
+ model_id=data["model_id"],
594
+ variant=ModelVariant(**variant_data),
595
+ fine_tune=FineTuneInfo(**fine_tune_data),
596
+ weight_hash=data.get("weight_hash"),
597
+ config_hash=data.get("config_hash"),
598
+ tokenizer_hash=data.get("tokenizer_hash"),
599
+ behavioral_fingerprint=BehavioralFingerprint(**fingerprint_data) if fingerprint_data else None,
600
+ source_url=data.get("source_url"),
601
+ source_revision=data.get("source_revision"),
602
+ downloaded_at=data.get("downloaded_at"),
603
+ parent_root=data.get("parent_root"),
604
+ merkle_root=data.get("merkle_root"),
605
+ created_at=data.get("created_at", time.time()),
606
+ parameters=data.get("parameters"),
607
+ context_length=data.get("context_length"),
608
+ architecture=data.get("architecture"),
609
+ license=data.get("license"),
610
+ )
611
+
612
+ def _save_identity(self, identity: ModelIdentity):
613
+ """Save identity to disk."""
614
+ filename = f"{identity.merkle_root}.json"
615
+ filepath = self.models_dir / filename
616
+ filepath.write_text(json.dumps(identity.to_dict(), indent=2))
617
+
618
+ def get_or_create(
619
+ self,
620
+ model_id: str,
621
+ **kwargs,
622
+ ) -> ModelIdentity:
623
+ """
624
+ Get existing model identity or create new one.
625
+
626
+ If model already exists in registry, returns existing.
627
+ Otherwise creates new identity linked to genesis or base model.
628
+ """
629
+ # Check if we have this model already
630
+ for identity in self._cache.values():
631
+ if identity.model_id == model_id or identity.base_model == model_id:
632
+ return identity
633
+
634
+ # Determine parent
635
+ # If this is a variant, try to find base model
636
+ parent_root = kwargs.pop("parent_root", None)
637
+ if not parent_root:
638
+ base = kwargs.get("base_model")
639
+ if base:
640
+ for identity in self._cache.values():
641
+ if identity.base_model == base and identity.variant.quantization == "none":
642
+ parent_root = identity.merkle_root
643
+ break
644
+
645
+ # Default to genesis
646
+ if not parent_root:
647
+ parent_root = self.genesis_root
648
+
649
+ # Create new identity
650
+ identity = create_model_identity(
651
+ model_id=model_id,
652
+ parent_root=parent_root,
653
+ **kwargs,
654
+ )
655
+
656
+ # Cache and save
657
+ self._cache[identity.merkle_root] = identity
658
+ self._save_identity(identity)
659
+
660
+ return identity
661
+
662
+ def get_by_root(self, merkle_root: str) -> Optional[ModelIdentity]:
663
+ """Get model identity by merkle root."""
664
+ return self._cache.get(merkle_root)
665
+
666
+ def list_all(self) -> List[ModelIdentity]:
667
+ """List all registered models."""
668
+ return list(self._cache.values())
669
+
670
+ def list_by_base(self, base_model: str) -> List[ModelIdentity]:
671
+ """List all variants of a base model."""
672
+ return [i for i in self._cache.values() if i.base_model == base_model]
673
+
674
+ def search(self, query: str) -> List[ModelIdentity]:
675
+ """Search models by name."""
676
+ query_lower = query.lower()
677
+ return [
678
+ i for i in self._cache.values()
679
+ if query_lower in i.model_id.lower() or query_lower in i.base_model.lower()
680
+ ]
681
+
682
+
683
+ # =============================================================================
684
+ # CLI
685
+ # =============================================================================
686
+
687
+ if __name__ == "__main__":
688
+ import sys
689
+
690
+ # Test: Create some model identities
691
+ print("=== CASCADE Model Identity Layer ===\n")
692
+
693
+ # Initialize registry
694
+ registry = ModelRegistry()
695
+
696
+ # Create some test identities
697
+ test_models = [
698
+ "meta-llama/Llama-3-8B",
699
+ "TheBloke/Llama-3-8B-GGUF",
700
+ "unsloth/Llama-3-8B-bnb-4bit",
701
+ "anthropic/claude-3-opus",
702
+ "openai/gpt-4",
703
+ ]
704
+
705
+ for model in test_models:
706
+ identity = registry.get_or_create(model)
707
+ print(f"Model: {identity.model_id}")
708
+ print(f" Base: {identity.base_model}")
709
+ print(f" Quant: {identity.variant.quantization}")
710
+ print(f" Format: {identity.variant.format}")
711
+ print(f" Merkle: {identity.merkle_root}")
712
+ print(f" Parent: {identity.parent_root}")
713
+ print()
714
+
715
+ print(f"Total models in registry: {len(registry.list_all())}")
cascade/ipld.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ CASCADE IPLD - InterPlanetary Linked Data Integration
3
+
4
+ Native IPLD encoding for provenance chains. Merkle roots become CIDs.
5
+ The lattice goes interplanetary.
6
+
7
+ CIDs (Content IDentifiers) are self-describing, content-addressed identifiers.
8
+ When we encode a chain as IPLD, its CID is derived from its content.
9
+ Anyone with the CID can fetch and verify.
10
+
11
+ Architecture:
12
+ ProvenanceChain ──encode──► DAG-CBOR ──hash──► CID
13
+
14
+ bafyreif...xyz (interplanetary address)
15
+ """
16
+
17
+ import json
18
+ import hashlib
19
+ from typing import Dict, Any, Optional, List
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+
23
+ # IPLD encoding
24
+ import dag_cbor
25
+ from multiformats import CID, multihash
26
+
27
+ # CASCADE core
28
+ from cascade.core.provenance import ProvenanceChain, ProvenanceRecord
29
+
30
+
31
+ # =============================================================================
32
+ # IPLD ENCODING
33
+ # =============================================================================
34
+
35
+ def chain_to_ipld(chain: ProvenanceChain) -> Dict[str, Any]:
36
+ """
37
+ Convert a ProvenanceChain to IPLD-compatible format.
38
+
39
+ IPLD format uses:
40
+ - Lowercase keys
41
+ - CID links for references
42
+ - DAG-CBOR encoding
43
+ """
44
+ # Convert records to IPLD format
45
+ records = {}
46
+ for name, record in chain.records.items():
47
+ records[name] = {
48
+ "layer_name": record.layer_name,
49
+ "layer_idx": record.layer_idx,
50
+ "state_hash": record.state_hash,
51
+ "parent_hashes": record.parent_hashes,
52
+ "params_hash": record.params_hash,
53
+ "shape": record.shape,
54
+ "dtype": record.dtype,
55
+ "stats": record.stats,
56
+ "execution_order": record.execution_order,
57
+ "timestamp": record.timestamp,
58
+ }
59
+
60
+ # Convert external_roots to CID links if they look like CIDs
61
+ external_links = []
62
+ for root in chain.external_roots:
63
+ if root.startswith("bafy") or root.startswith("Qm"):
64
+ # Already a CID - create a link
65
+ external_links.append({"/": root})
66
+ else:
67
+ # Legacy merkle root - keep as string
68
+ external_links.append({"legacy_root": root})
69
+
70
+ return {
71
+ "session_id": chain.session_id,
72
+ "model_id": chain.model_id,
73
+ "model_hash": chain.model_hash,
74
+ "input_hash": chain.input_hash,
75
+ "output_hash": chain.output_hash,
76
+ "records": records,
77
+ "external_roots": chain.external_roots, # Keep for verification
78
+ "external_links": external_links, # IPLD links
79
+ "merkle_root": chain.merkle_root,
80
+ "created_at": chain.created_at,
81
+ "finalized": chain.finalized,
82
+ "ipld_version": 1,
83
+ }
84
+
85
+
86
+ def encode_to_dag_cbor(data: Dict[str, Any]) -> bytes:
87
+ """Encode data as DAG-CBOR (canonical CBOR for IPLD)."""
88
+ return dag_cbor.encode(data)
89
+
90
+
91
+ def decode_from_dag_cbor(raw: bytes) -> Dict[str, Any]:
92
+ """Decode DAG-CBOR data."""
93
+ return dag_cbor.decode(raw)
94
+
95
+
96
+ def compute_cid(data: bytes, codec: str = "dag-cbor") -> str:
97
+ """
98
+ Compute CID (Content IDentifier) from data.
99
+
100
+ CID = multicodec(codec) + multihash(sha256(data))
101
+
102
+ Returns CIDv1 in base32 (bafyrei...)
103
+ """
104
+ # SHA-256 hash of the data
105
+ digest = hashlib.sha256(data).digest()
106
+
107
+ # Create multihash (0x12 = sha2-256, 0x20 = 32 bytes)
108
+ mh = multihash.wrap(digest, "sha2-256")
109
+
110
+ # Create CID v1 with dag-cbor codec (0x71)
111
+ cid = CID("base32", 1, "dag-cbor", mh)
112
+
113
+ return str(cid)
114
+
115
+
116
+ def chain_to_cid(chain: ProvenanceChain) -> tuple[str, bytes]:
117
+ """
118
+ Convert chain to CID.
119
+
120
+ Returns:
121
+ (cid_string, encoded_bytes)
122
+ """
123
+ ipld_data = chain_to_ipld(chain)
124
+ encoded = encode_to_dag_cbor(ipld_data)
125
+ cid = compute_cid(encoded)
126
+ return cid, encoded
127
+
128
+
129
+ # =============================================================================
130
+ # IPLD CHAIN - Native CID-based chain
131
+ # =============================================================================
132
+
133
+ @dataclass
134
+ class IPLDChain:
135
+ """
136
+ A provenance chain with native CID support.
137
+
138
+ Instead of custom merkle roots, uses CIDs.
139
+ Links to other chains via CID references.
140
+ """
141
+ chain: ProvenanceChain
142
+ cid: Optional[str] = None
143
+ encoded: Optional[bytes] = None
144
+
145
+ @classmethod
146
+ def from_chain(cls, chain: ProvenanceChain) -> 'IPLDChain':
147
+ """Create IPLD chain from regular chain."""
148
+ cid, encoded = chain_to_cid(chain)
149
+ return cls(chain=chain, cid=cid, encoded=encoded)
150
+
151
+ @classmethod
152
+ def from_bytes(cls, data: bytes) -> 'IPLDChain':
153
+ """Deserialize from DAG-CBOR bytes."""
154
+ ipld_data = decode_from_dag_cbor(data)
155
+ chain = ipld_to_chain(ipld_data)
156
+ cid = compute_cid(data)
157
+ return cls(chain=chain, cid=cid, encoded=data)
158
+
159
+ def link_to(self, other: 'IPLDChain') -> None:
160
+ """Link this chain to another via CID."""
161
+ if other.cid is None:
162
+ raise ValueError("Cannot link to chain without CID")
163
+ self.chain.link_external(other.cid, source_id=other.chain.model_id)
164
+ # Recompute our CID since we changed
165
+ self.cid, self.encoded = chain_to_cid(self.chain)
166
+
167
+ def save(self, path: Path) -> None:
168
+ """Save as DAG-CBOR file."""
169
+ if self.encoded is None:
170
+ self.cid, self.encoded = chain_to_cid(self.chain)
171
+ with open(path, 'wb') as f:
172
+ f.write(self.encoded)
173
+
174
+ @classmethod
175
+ def load(cls, path: Path) -> 'IPLDChain':
176
+ """Load from DAG-CBOR file."""
177
+ with open(path, 'rb') as f:
178
+ data = f.read()
179
+ return cls.from_bytes(data)
180
+
181
+ def to_json(self) -> str:
182
+ """Export as JSON (for human inspection)."""
183
+ ipld_data = chain_to_ipld(self.chain)
184
+ ipld_data["_cid"] = self.cid
185
+ return json.dumps(ipld_data, indent=2, default=str)
186
+
187
+
188
+ def ipld_to_chain(ipld_data: Dict[str, Any]) -> ProvenanceChain:
189
+ """Convert IPLD data back to ProvenanceChain."""
190
+ # Reconstruct records
191
+ records = {}
192
+ for name, rec_data in ipld_data.get("records", {}).items():
193
+ records[name] = ProvenanceRecord(
194
+ layer_name=rec_data["layer_name"],
195
+ layer_idx=rec_data["layer_idx"],
196
+ state_hash=rec_data["state_hash"],
197
+ parent_hashes=rec_data["parent_hashes"],
198
+ params_hash=rec_data.get("params_hash"),
199
+ shape=rec_data.get("shape", []),
200
+ dtype=rec_data.get("dtype", "float32"),
201
+ stats=rec_data.get("stats", {}),
202
+ execution_order=rec_data.get("execution_order", 0),
203
+ timestamp=rec_data.get("timestamp", 0),
204
+ )
205
+
206
+ chain = ProvenanceChain(
207
+ session_id=ipld_data["session_id"],
208
+ model_id=ipld_data["model_id"],
209
+ model_hash=ipld_data["model_hash"],
210
+ input_hash=ipld_data["input_hash"],
211
+ output_hash=ipld_data.get("output_hash"),
212
+ external_roots=ipld_data.get("external_roots", []),
213
+ merkle_root=ipld_data.get("merkle_root"),
214
+ created_at=ipld_data.get("created_at", 0),
215
+ finalized=ipld_data.get("finalized", False),
216
+ )
217
+ chain.records = records
218
+
219
+ return chain
220
+
221
+
222
+ # =============================================================================
223
+ # IPFS PUBLISHING (requires running IPFS daemon)
224
+ # =============================================================================
225
+
226
+ def publish_to_ipfs(chain: IPLDChain, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> str:
227
+ """
228
+ Publish chain to IPFS network.
229
+
230
+ Requires IPFS daemon running locally.
231
+ Returns the CID (which should match our computed CID).
232
+
233
+ Args:
234
+ chain: IPLDChain to publish
235
+ ipfs_api: IPFS API multiaddr
236
+
237
+ Returns:
238
+ CID from IPFS (for verification)
239
+ """
240
+ try:
241
+ import ipfshttpclient
242
+ client = ipfshttpclient.connect(ipfs_api)
243
+
244
+ # Add the raw DAG-CBOR data
245
+ result = client.dag.put(
246
+ chain.encoded,
247
+ store_codec="dag-cbor",
248
+ input_codec="dag-cbor"
249
+ )
250
+
251
+ ipfs_cid = result["Cid"]["/"]
252
+
253
+ # Verify CIDs match
254
+ if ipfs_cid != chain.cid:
255
+ print(f"[WARN] CID mismatch: computed={chain.cid}, ipfs={ipfs_cid}")
256
+
257
+ return ipfs_cid
258
+
259
+ except Exception as e:
260
+ print(f"[ERROR] IPFS publish failed: {e}")
261
+ print(" Make sure IPFS daemon is running: ipfs daemon")
262
+ raise
263
+
264
+
265
+ def fetch_from_ipfs(cid: str, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> IPLDChain:
266
+ """
267
+ Fetch chain from IPFS network by CID.
268
+
269
+ Args:
270
+ cid: Content identifier
271
+ ipfs_api: IPFS API multiaddr
272
+
273
+ Returns:
274
+ IPLDChain
275
+ """
276
+ try:
277
+ import ipfshttpclient
278
+ client = ipfshttpclient.connect(ipfs_api)
279
+
280
+ # Get the DAG node
281
+ data = client.dag.get(cid)
282
+
283
+ # Convert to chain
284
+ chain = ipld_to_chain(data)
285
+ encoded = encode_to_dag_cbor(data)
286
+
287
+ return IPLDChain(chain=chain, cid=cid, encoded=encoded)
288
+
289
+ except Exception as e:
290
+ print(f"[ERROR] IPFS fetch failed: {e}")
291
+ raise
292
+
293
+
294
+ # =============================================================================
295
+ # GENESIS IN IPLD
296
+ # =============================================================================
297
+
298
+ def get_genesis_cid() -> tuple[str, IPLDChain]:
299
+ """
300
+ Get genesis as IPLD chain with CID.
301
+
302
+ The genesis CID is deterministic - anyone computing it gets the same result.
303
+ This is the interplanetary Schelling point.
304
+ """
305
+ from cascade.genesis import create_genesis
306
+
307
+ genesis = create_genesis()
308
+ ipld_genesis = IPLDChain.from_chain(genesis)
309
+
310
+ return ipld_genesis.cid, ipld_genesis
311
+
312
+
313
+ # =============================================================================
314
+ # CLI
315
+ # =============================================================================
316
+
317
+ if __name__ == "__main__":
318
+ import sys
319
+
320
+ print("=" * 60)
321
+ print("CASCADE IPLD - InterPlanetary Linked Data")
322
+ print("=" * 60)
323
+
324
+ # Get genesis CID
325
+ genesis_cid, genesis_ipld = get_genesis_cid()
326
+ print(f"\nGenesis CID: {genesis_cid}")
327
+ print(f"Genesis merkle_root: {genesis_ipld.chain.merkle_root}")
328
+
329
+ # Load cascade_alpha and convert to IPLD
330
+ alpha_path = Path("lattice/cascade_alpha.json")
331
+ if alpha_path.exists():
332
+ with open(alpha_path) as f:
333
+ alpha_data = json.load(f)
334
+ alpha_chain = ProvenanceChain.from_dict(alpha_data)
335
+ alpha_ipld = IPLDChain.from_chain(alpha_chain)
336
+
337
+ print(f"\ncascade_alpha CID: {alpha_ipld.cid}")
338
+ print(f"cascade_alpha merkle_root: {alpha_chain.merkle_root}")
339
+
340
+ # Save as DAG-CBOR
341
+ out_dir = Path("lattice/ipld")
342
+ out_dir.mkdir(exist_ok=True)
343
+
344
+ genesis_ipld.save(out_dir / "genesis.cbor")
345
+ alpha_ipld.save(out_dir / "cascade_alpha.cbor")
346
+
347
+ # Also save JSON for inspection
348
+ with open(out_dir / "genesis.ipld.json", 'w') as f:
349
+ f.write(genesis_ipld.to_json())
350
+ with open(out_dir / "cascade_alpha.ipld.json", 'w') as f:
351
+ f.write(alpha_ipld.to_json())
352
+
353
+ print(f"\nSaved to {out_dir}/")
354
+ print(f" - genesis.cbor")
355
+ print(f" - cascade_alpha.cbor")
356
+ print(f" - genesis.ipld.json")
357
+ print(f" - cascade_alpha.ipld.json")
358
+
359
+ print("\n" + "=" * 60)
360
+ print("INTERPLANETARY ADDRESSES")
361
+ print("=" * 60)
362
+ print(f"""
363
+ Genesis: {genesis_cid}
364
+ cascade_alpha: {alpha_ipld.cid if alpha_path.exists() else 'N/A'}
365
+
366
+ These CIDs are content-addressed. Anyone with the CID can:
367
+ 1. Fetch the data from IPFS (if pinned)
368
+ 2. Verify the content matches the CID
369
+ 3. Trust the chain without trusting the source
370
+
371
+ To publish to IPFS:
372
+ ipfs daemon # Start IPFS
373
+ python -c "
374
+ from cascade.ipld import publish_to_ipfs, get_genesis_cid
375
+ _, genesis = get_genesis_cid()
376
+ cid = publish_to_ipfs(genesis)
377
+ print(f'Published: {{cid}}')
378
+ "
379
+ """)
cascade/listen.py ADDED
@@ -0,0 +1,154 @@
1
+ """
2
+ Cascade Passive Monitor.
3
+
4
+ Listens to stdin or follows a log file and observes events.
5
+
6
+ Usage:
7
+ python -m cascade.listen # Listen to stdin
8
+ python -m cascade.listen --follow app.log # Follow a log file
9
+
10
+ This module:
11
+ 1. Reads input from stdin or a log file
12
+ 2. Pipes lines -> Cascade Adapter
13
+ 3. Writes events to tape file (JSONL) and human log (Markdown)
14
+ 4. Emits events to event_queue for external consumers
15
+
16
+ For visualization, point a consumer at the event_queue or load the tape file
17
+ into your preferred visualization tool.
18
+ """
19
+
20
+ import sys
21
+ import argparse
22
+ import time
23
+ import json
24
+ from pathlib import Path
25
+ from queue import Queue
26
+
27
+ # Ensure package root is in path
28
+ sys.path.insert(0, str(Path(__file__).parent.parent))
29
+
30
+ from cascade import Monitor
31
+
32
+ # Shared event queue for external consumers (e.g., custom UIs)
33
+ event_queue: Queue = Queue()
34
+
35
+
36
+ def main():
37
+ parser = argparse.ArgumentParser(description="Cascade Passive Monitor")
38
+ parser.add_argument("--log-dir", default="./logs", help="Directory for logs")
39
+ parser.add_argument("--follow", help="Log file to follow (tail -f style)")
40
+ parser.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")
41
+ args = parser.parse_args()
42
+
43
+ # 0. Setup Logs & Baggies
44
+ log_dir = Path(args.log_dir)
45
+ log_dir.mkdir(parents=True, exist_ok=True)
46
+
47
+ baggies_dir = log_dir / "baggies"
48
+ baggies_dir.mkdir(exist_ok=True)
49
+
50
+ # Excrement Management (Archive old artifacts)
51
+ follow_abs = Path(args.follow).absolute() if args.follow else None
52
+ for f in log_dir.glob("*.*"):
53
+ if f.is_file() and f.suffix in [".md", ".jsonl", ".log"] and "baggies" not in str(f):
54
+ if follow_abs and f.absolute() == follow_abs:
55
+ continue
56
+ try:
57
+ dest = baggies_dir / f.name
58
+ if dest.exists():
59
+ dest = baggies_dir / f"{f.stem}_{int(time.time())}{f.suffix}"
60
+ f.replace(dest)
61
+ except Exception:
62
+ pass
63
+ print(f"[CASCADE] Logs archived to {baggies_dir}")
64
+
65
+ session_id = int(time.time())
66
+ tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
67
+ human_path = log_dir / f"cascade_log_{session_id}.md"
68
+
69
+ tape_file = open(tape_path, "w", encoding="utf-8")
70
+ human_file = open(human_path, "w", encoding="utf-8")
71
+
72
+ # Init Log
73
+ human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
74
+ human_file.write(f"**Mode:** PASSIVE {'FOLLOWER' if args.follow else 'LISTENER'}\n")
75
+ human_file.write(f"**Target:** `{args.follow or 'STDIN'}`\n---\n\n")
76
+ human_file.flush()
77
+
78
+ print("="*60)
79
+ print("CASCADE // LISTENER")
80
+ print(f"Monitoring: {args.follow if args.follow else 'Standard Input'}")
81
+ print(f"Tape: {tape_path.absolute()}")
82
+ print(f"Baggies: {baggies_dir.absolute()}")
83
+ print("="*60)
84
+
85
+ monitor = Monitor("symbiont_passive")
86
+
87
+ def process_line(line):
88
+ line = line.strip()
89
+ if not line:
90
+ return
91
+ event = monitor.observe(line)
92
+ payload = {
93
+ "event": {
94
+ "event_id": event.event_id,
95
+ "timestamp": event.timestamp,
96
+ "component": event.component,
97
+ "event_type": event.event_type,
98
+ "data": event.data,
99
+ "raw": line, # Include original line for drill-down
100
+ },
101
+ "metrics": monitor.metrics.summary(),
102
+ "triage": monitor.metrics.triage(),
103
+ }
104
+ event_queue.put(payload)
105
+ tape_file.write(json.dumps(payload) + "\n")
106
+ tape_file.flush()
107
+
108
+ # Narrative
109
+ t_str = time.strftime('%H:%M:%S', time.localtime(event.timestamp))
110
+ icon = {"error": "🔴", "warning": "⚠️", "state_change": "🔄"}.get(event.event_type, "ℹ️")
111
+ if "loss" in str(event.data):
112
+ icon = "📉"
113
+ human_file.write(f"### {icon} {t_str} // {event.event_type.upper()}\n")
114
+ human_file.write(f"Event observed in **{event.component}**.\n")
115
+ if event.data:
116
+ human_file.write("```yaml\n")
117
+ for k, v in event.data.items():
118
+ human_file.write(f"{k}: {v}\n")
119
+ human_file.write("```\n")
120
+ human_file.write("\n")
121
+ human_file.flush()
122
+
123
+ # Mirror to console (unless quiet)
124
+ if not args.quiet:
125
+ sys.stdout.write(f"[SIGHT] {line[:80]}...\n")
126
+ sys.stdout.flush()
127
+
128
+ try:
129
+ if args.follow:
130
+ print(f"[CASCADE] Waiting for stream: {args.follow}")
131
+ f_path = Path(args.follow)
132
+ if not f_path.exists():
133
+ f_path.touch()
134
+ with open(f_path, "r", encoding="utf-8", errors="replace") as f:
135
+ print(f"[CASCADE] Scanning for events...")
136
+ while True:
137
+ line = f.readline()
138
+ if not line:
139
+ time.sleep(0.1)
140
+ continue
141
+ process_line(line)
142
+ else:
143
+ print("[CASCADE] Reading from stdin (Ctrl+C to stop)...")
144
+ for line in sys.stdin:
145
+ process_line(line)
146
+ except KeyboardInterrupt:
147
+ print("\n[CASCADE] Detaching...")
148
+ finally:
149
+ tape_file.close()
150
+ human_file.close()
151
+ print(f"[CASCADE] Session complete. Tape: {tape_path}")
152
+
153
+ if __name__ == "__main__":
154
+ main()
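
A note on consumption: the monitor publishes each payload to the module-level `event_queue` in addition to the JSONL tape, so a dashboard or custom UI can subscribe live without tailing files. Below is a minimal consumer sketch; the import path `cascade.listen` is an assumption about where this module is installed.

```python
# Hedged sketch: drain the shared event_queue from a background thread.
# Assumes the passive monitor module is importable as cascade.listen.
import threading
from queue import Queue


def consume(q: Queue) -> None:
    """Print a one-line summary for each payload the passive monitor emits."""
    while True:
        payload = q.get()  # blocks until the next observed event
        evt = payload["event"]
        print(f"{evt['event_type']:>12} | {evt['component']} | triage={payload['triage']}")


def attach(q: Queue) -> threading.Thread:
    """Run the consumer as a daemon thread so it never blocks shutdown."""
    worker = threading.Thread(target=consume, args=(q,), daemon=True)
    worker.start()
    return worker

# Usage (assumed path): from cascade.listen import event_queue; attach(event_queue)
```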
cascade/logging/__init__.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ CASCADE Logging System
3
+ Industry-standard dual-layer logging for mathematical precision and human clarity.
4
+
5
+ Two modes:
6
+ 1. Kleene Mode: Mathematical fixed point logs for debugging and verification
7
+ 2. Interpretive Mode: Human-readable causation stories for operators
8
+
9
+ Use together for complete system observability.
10
+ """
11
+
12
+ from .kleene_logger import (
13
+ KleeneLogger,
14
+ LogLevel,
15
+ get_kleene_logger,
16
+ log_fixed_point,
17
+ log_iterations
18
+ )
19
+
20
+ from .interpretive_logger import (
21
+ InterpretiveLogger,
22
+ ImpactLevel,
23
+ get_interpretive_logger,
24
+ translate_kleene_to_interpretive
25
+ )
26
+
27
+ from .log_manager import (
28
+ LogMode,
29
+ LogConfig,
30
+ CascadeLogManager,
31
+ init_logging,
32
+ get_log_manager,
33
+ log
34
+ )
35
+
36
+
37
+ def init_cascade_logging(component: str, system: str):
38
+ """Initialize both logging layers for a component"""
39
+ kleene = get_kleene_logger(component)
40
+ interpretive = get_interpretive_logger(system)
41
+
42
+ # Bridge automatic translation
43
+ def bridge_log(entry):
44
+ translate_kleene_to_interpretive(entry, interpretive)
45
+
46
+ kleene._emit_to_container = lambda entry: (
47
+ KleeneLogger._emit_to_container(kleene, entry),  # call the original class method; KleeneLogger defines no _format_container
48
+ bridge_log(entry)
49
+ )
50
+
51
+ return kleene, interpretive
52
+
53
+
54
+ # Convenience for quick setup
55
+ def setup_logging(component: str, system: str = "CASCADE"):
56
+ """Quick setup for both loggers"""
57
+ return init_cascade_logging(component, system)
58
+
59
+
60
+ # Export main interfaces
61
+ __all__ = [
62
+ # Kleene (mathematical)
63
+ 'KleeneLogger',
64
+ 'LogLevel',
65
+ 'get_kleene_logger',
66
+ 'log_fixed_point',
67
+ 'log_iterations',
68
+
69
+ # Interpretive (human)
70
+ 'InterpretiveLogger',
71
+ 'ImpactLevel',
72
+ 'get_interpretive_logger',
73
+ 'translate_kleene_to_interpretive',
74
+
75
+ # Log Manager (orchestrator)
76
+ 'LogMode',
77
+ 'LogConfig',
78
+ 'CascadeLogManager',
79
+ 'init_logging',
80
+ 'get_log_manager',
81
+ 'log',
82
+
83
+ # Unified
84
+ 'init_cascade_logging',
85
+ 'setup_logging'
86
+ ]
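
Taken together, these exports make dual-layer setup a two-liner. A small usage sketch with illustrative values (the exact console formatting comes from the loggers below):

```python
# Sketch: wiring both logging layers for one component via setup_logging.
from cascade.logging import setup_logging, LogLevel, ImpactLevel

kleene, interpretive = setup_logging("DataLoader", system="Data Pipeline")

# Mathematical layer: hashed, verifiable state transition
kleene.log(LogLevel.INFO, "load_start", state_before={"rows": 0})

# Human layer: the causation story operators actually read
interpretive.log(ImpactLevel.LOW, "DataLoader", "Load started",
                 context="Reading input shards from disk",
                 consequence="Records become available for analysis",
                 metrics={"shards": 4})
```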
cascade/logging/color_example.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ CASCADE Color Logging Example
3
+ Shows how to integrate beautiful colored logs throughout your system.
4
+ """
5
+
6
+ from .kleene_logger import get_kleene_logger, LogLevel
7
+ from .interpretive_logger import get_interpretive_logger, ImpactLevel
8
+
9
+ def example_data_processing():
10
+ """Example: Data processing with beautiful logs"""
11
+ kleene = get_kleene_logger("DataProcessor")
12
+ interpretive = get_interpretive_logger("Data Pipeline")
13
+
14
+ # Start processing
15
+ kleene.log(LogLevel.INFO, "load_dataset_start",
16
+ state_before={"dataset": "smollm3-blueprint.pdf"})
17
+
18
+ interpretive.log(ImpactLevel.LOW, "DataLoader", "Loading dataset",
19
+ context="Reading PDF file for analysis",
20
+ consequence="Will extract text and metadata",
21
+ metrics={"file_size": "1.0MB", "type": "PDF"})
22
+
23
+ # Processing steps
24
+ kleene.log(LogLevel.DEBUG, "extract_text",
25
+ state_before={"page": 1},
26
+ state_after={"pages_processed": 15})
27
+
28
+ # Fixed point reached
29
+ kleene.log(LogLevel.INFO, "processing_complete",
30
+ state_after={"records": 500, "clean": True},
31
+ fixed_point=True,
32
+ iterations=3)
33
+
34
+ interpretive.log(ImpactLevel.MEDIUM, "DataProcessor", "Processing complete",
35
+ context="Successfully extracted and cleaned data",
36
+ consequence="Ready for forensics analysis",
37
+ metrics={"records": 500, "pages": 15, "errors": 0})
38
+
39
+ def example_model_observation():
40
+ """Example: Model observation with beautiful logs"""
41
+ kleene = get_kleene_logger("ModelObserver")
42
+ interpretive = get_interpretive_logger("Model Observatory")
43
+
44
+ # Model loading
45
+ kleene.log(LogLevel.INFO, "model_load_start",
46
+ state_before={"model": "mistralai/Mixtral-8x22B-Instruct-v0.1"})
47
+
48
+ interpretive.log(ImpactLevel.MEDIUM, "ModelLoader", "Loading Mixtral",
49
+ context="Loading 8x22B MoE model for inference",
50
+ consequence="Will consume significant VRAM",
51
+ metrics={"params": "141B", "active": "39B", "device": "cuda"})
52
+
53
+ # Observation
54
+ kleene.log(LogLevel.INFO, "observation_start",
55
+ state_before={"layers": 0, "hash": "initial"})
56
+
57
+ # Fixed point achieved
58
+ kleene.log(LogLevel.INFO, "observation_fixed_point",
59
+ state_after={"layers": 64, "merkle": "abc123..."},
60
+ fixed_point=True,
61
+ iterations=64)
62
+
63
+ interpretive.log(ImpactLevel.LOW, "CASCADE", "Model observed",
64
+ context="Cryptographic proof generated for model execution",
65
+ consequence="Merkle root provides verifiable audit trail",
66
+ metrics={"model": "Mixtral", "layers": 64, "merkle": "abc123..."})
67
+
68
+ def example_error_handling():
69
+ """Example: Error handling with colored logs"""
70
+ kleene = get_kleene_logger("ErrorHandler")
71
+ interpretive = get_interpretive_logger("System Monitor")
72
+
73
+ # Error detected
74
+ kleene.log(LogLevel.ERROR, "memory_exhaustion",
75
+ state_before={"memory": "15.8/16GB", "operation": "inference"},
76
+ fixed_point=False)
77
+
78
+ interpretive.log(ImpactLevel.HIGH, "MemoryManager", "Out of memory",
79
+ context="GPU memory exhausted during model inference",
80
+ consequence="Inference failed, system degraded",
81
+ metrics={"used": "15.8GB", "total": "16GB", "available": "200MB"},
82
+ recommendation="Enable gradient checkpointing or use smaller batch size")
83
+
84
+ # Recovery
85
+ kleene.log(LogLevel.WARNING, "fallback_activated",
86
+ state_after={"mode": "cpu_fallback", "batch_size": 1})
87
+
88
+ interpretive.log(ImpactLevel.MEDIUM, "FallbackHandler", "CPU fallback activated",
89
+ context="Switched to CPU inference due to memory constraints",
90
+ consequence="Performance degraded but functionality preserved",
91
+ metrics={"device": "cpu", "batch_size": 1, "slowdown": "10x"})
92
+
93
+ # Run all examples
94
+ if __name__ == "__main__":
95
+ print("\n🎨 CASCADE Color Logging Examples\n")
96
+ print("="*60)
97
+
98
+ example_data_processing()
99
+ print("\n" + "="*60)
100
+
101
+ example_model_observation()
102
+ print("\n" + "="*60)
103
+
104
+ example_error_handling()
105
+ print("\n" + "="*60)
106
+
107
+ print("\n✨ Beautiful logs are ready for production!")
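
The demos run only under `__main__`, so the colored output can also be previewed from a REPL or script (assuming the package is importable as `cascade`):

```python
# Sketch: call the demo functions directly instead of running the module.
from cascade.logging.color_example import (
    example_data_processing,
    example_model_observation,
    example_error_handling,
)

example_data_processing()
example_model_observation()
example_error_handling()
```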
cascade/logging/integrate.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ CASCADE Logging Integration
3
+ Plug-and-play logging for existing CASCADE components.
4
+
5
+ Retrofits existing systems with world-class logging without major surgery.
6
+ """
7
+
8
+ import functools
9
+ import time
10
+ from typing import Any, Callable, Dict, Optional
11
+
12
+ from .log_manager import get_log_manager, LogLevel, ImpactLevel
13
+
14
+
15
+ def log_component(component_name: str, system: str = "CASCADE"):
16
+ """Decorator to add logging to any class or function"""
17
+ def decorator(target):
18
+ if isinstance(target, type):
19
+ # Decorating a class
20
+ return _log_class(target, component_name, system)
21
+ else:
22
+ # Decorating a function
23
+ return _log_function(target, component_name, system)
24
+ return decorator
25
+
26
+
27
+ def _log_class(cls, component_name: str, system: str):
28
+ """Add logging to all methods of a class"""
29
+ manager = get_log_manager()
30
+ manager.register_component(component_name, system)
31
+
32
+ for attr_name in dir(cls):
33
+ if not attr_name.startswith('_'):
34
+ attr = getattr(cls, attr_name)
35
+ if callable(attr):
36
+ setattr(cls, attr_name, _log_method(attr, component_name))
37
+
38
+ return cls
39
+
40
+
41
+ def _log_function(func, component_name: str, system: str):
42
+ """Add logging to a function"""
43
+ manager = get_log_manager()
44
+ manager.register_component(component_name, system)
45
+
46
+ @functools.wraps(func)
47
+ def wrapper(*args, **kwargs):
48
+ start_time = time.time()
49
+
50
+ # Log start
51
+ get_log_manager().log_operation(
52
+ component_name, f"{func.__name__}_start",
53
+ level=LogLevel.DEBUG,
54
+ impact=ImpactLevel.TRACE,
55
+ details={
56
+ "context": f"Starting {func.__name__}",
57
+ "consequence": f"Will execute {func.__name__}",
58
+ "metrics": {"args": len(args), "kwargs": len(kwargs)}
59
+ }
60
+ )
61
+
62
+ try:
63
+ result = func(*args, **kwargs)
64
+
65
+ # Log success
66
+ duration = time.time() - start_time
67
+ get_log_manager().log_operation(
68
+ component_name, f"{func.__name__}_complete",
69
+ level=LogLevel.INFO,
70
+ impact=ImpactLevel.LOW,
71
+ details={
72
+ "context": f"Completed {func.__name__}",
73
+ "consequence": f"Result ready",
74
+ "metrics": {"duration_seconds": duration}
75
+ }
76
+ )
77
+
78
+ return result
79
+
80
+ except Exception as e:
81
+ # Log error
82
+ get_log_manager().log_operation(
83
+ component_name, f"{func.__name__}_error",
84
+ level=LogLevel.ERROR,
85
+ impact=ImpactLevel.HIGH,
86
+ details={
87
+ "context": f"Failed in {func.__name__}",
88
+ "consequence": "Operation failed",
89
+ "metrics": {"error": str(e)}
90
+ }
91
+ )
92
+ raise
93
+
94
+ return wrapper
95
+
96
+
97
+ def _log_method(method, component_name: str):
98
+ """Add logging to a method"""
99
+ @functools.wraps(method)
100
+ def wrapper(self, *args, **kwargs):
101
+ start_time = time.time()
102
+
103
+ try:
104
+ result = method(self, *args, **kwargs)
105
+
106
+ # Log successful method call
107
+ get_log_manager().log_operation(
108
+ component_name, f"{method.__name__}",
109
+ level=LogLevel.DEBUG,
110
+ impact=ImpactLevel.TRACE,
111
+ details={
112
+ "metrics": {"duration": time.time() - start_time}
113
+ }
114
+ )
115
+
116
+ return result
117
+
118
+ except Exception as e:
119
+ # Log method error
120
+ get_log_manager().log_operation(
121
+ component_name, f"{method.__name__}_error",
122
+ level=LogLevel.ERROR,
123
+ impact=ImpactLevel.HIGH,
124
+ details={
125
+ "context": f"Method {method.__name__} failed",
126
+ "metrics": {"error": str(e)}
127
+ }
128
+ )
129
+ raise
130
+
131
+ return wrapper
132
+
133
+
134
+ def log_kleene_iterations(operation_name: str):
135
+ """Decorator specifically for Kleene fixed point iterations"""
136
+ def decorator(func):
137
+ @functools.wraps(func)
138
+ def wrapper(*args, **kwargs):
139
+ get_log_manager().log_operation(
140
+ "KleeneEngine", f"{operation_name}_start",
141
+ level=LogLevel.INFO,
142
+ impact=ImpactLevel.MEDIUM,
143
+ details={
144
+ "context": f"Starting fixed point iteration for {operation_name}",
145
+ "consequence": "Will iterate until convergence"
146
+ }
147
+ )
148
+
149
+ start_time = time.time()
150
+ result = func(*args, **kwargs)
151
+
152
+ # Extract iteration info from result if available
153
+ iterations = getattr(result, 'iterations', 0)
154
+ converged = getattr(result, 'converged', True)
155
+
156
+ get_log_manager().log_operation(
157
+ "KleeneEngine", f"{operation_name}_complete",
158
+ level=LogLevel.INFO,
159
+ impact=ImpactLevel.LOW if converged else ImpactLevel.HIGH,
160
+ details={
161
+ "context": f"Fixed point iteration {'converged' if converged else 'diverged'}",
162
+ "consequence": f"Processed {iterations} iterations",
163
+ "metrics": {
164
+ "iterations": iterations,
165
+ "converged": converged,
166
+ "duration": time.time() - start_time
167
+ },
168
+ "fixed_point": converged
169
+ }
170
+ )
171
+
172
+ return result
173
+ return wrapper
174
+ return decorator
175
+
176
+
177
+ def log_model_observation(model_id: str):
178
+ """Decorator for model observation functions"""
179
+ def decorator(func):
180
+ @functools.wraps(func)
181
+ def wrapper(*args, **kwargs):
182
+ get_log_manager().log_operation(
183
+ "ModelObserver", f"observe_{model_id}",
184
+ level=LogLevel.INFO,
185
+ impact=ImpactLevel.MEDIUM,
186
+ details={
187
+ "context": f"Starting observation of model {model_id}",
188
+ "consequence": "Will generate cryptographic proof"
189
+ }
190
+ )
191
+
192
+ result = func(*args, **kwargs)
193
+
194
+ # Extract observation details
195
+ layers = getattr(result, 'layer_count', 0)
196
+ merkle = getattr(result, 'merkle_root', 'unknown')
197
+
198
+ get_log_manager().log_operation(
199
+ "ModelObserver", f"observed_{model_id}",
200
+ level=LogLevel.INFO,
201
+ impact=ImpactLevel.LOW,
202
+ details={
203
+ "context": f"Model observation complete",
204
+ "consequence": "Cryptographic proof generated",
205
+ "metrics": {
206
+ "model": model_id,
207
+ "layers": layers,
208
+ "merkle": merkle[:16] + "..."
209
+ },
210
+ "fixed_point": True
211
+ }
212
+ )
213
+
214
+ return result
215
+ return wrapper
216
+ return decorator
217
+
218
+
219
+ def log_data_processing(dataset_name: str):
220
+ """Decorator for data processing functions"""
221
+ def decorator(func):
222
+ @functools.wraps(func)
223
+ def wrapper(*args, **kwargs):
224
+ get_log_manager().log_operation(
225
+ "DataProcessor", f"process_{dataset_name}",
226
+ level=LogLevel.INFO,
227
+ impact=ImpactLevel.MEDIUM,
228
+ details={
229
+ "context": f"Processing dataset {dataset_name}",
230
+ "consequence": "Will extract and analyze data"
231
+ }
232
+ )
233
+
234
+ result = func(*args, **kwargs)
235
+
236
+ # Extract processing stats
237
+ records = getattr(result, 'record_count', 0)
238
+ operations = getattr(result, 'operations', [])
239
+
240
+ get_log_manager().log_operation(
241
+ "DataProcessor", f"processed_{dataset_name}",
242
+ level=LogLevel.INFO,
243
+ impact=ImpactLevel.LOW,
244
+ details={
245
+ "context": f"Dataset processing complete",
246
+ "consequence": f"Processed {records} records",
247
+ "metrics": {
248
+ "dataset": dataset_name,
249
+ "records": records,
250
+ "operations": len(operations)
251
+ }
252
+ }
253
+ )
254
+
255
+ return result
256
+ return wrapper
257
+ return decorator
258
+
259
+
260
+ # Quick integration function
261
+ def integrate_cascade_logging():
262
+ """One-call integration for entire CASCADE system"""
263
+ # These integration imports are optional; tolerate missing subsystems
+ try:
+ from ..system.observer import SystemObserver  # noqa: F401
+ from ..core.provenance import ProvenanceTracker  # noqa: F401
+ from data_unity import run_kleene_iteration  # noqa: F401
+ except ImportError:
+ pass
266
+
267
+ # Register main components
268
+ manager = get_log_manager()
269
+ manager.register_component("SystemObserver", "System Observatory")
270
+ manager.register_component("ProvenanceTracker", "Model Observatory")
271
+ manager.register_component("DataUnity", "Data Unity")
272
+ manager.register_component("KleeneEngine", "NEXUS")
273
+
274
+ print("✅ CASCADE logging integrated across all components")
275
+ return manager
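
The decorators are the intended retrofit path for existing code. A sketch of the function form (class decoration works the same way via `_log_class`); names and values here are illustrative:

```python
# Sketch: retrofitting a plain function with the log_component decorator above.
from cascade.logging.integrate import log_component


@log_component("Tokenizer", system="Data Pipeline")
def tokenize(text: str):
    return text.split()


tokens = tokenize("observable systems tell better stories")
# The wrapper emits tokenize_start / tokenize_complete through the global log manager,
# and a tokenize_error entry (then re-raises) if the call throws.
```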
cascade/logging/interpretive_logger.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ CASCADE Interpretive Logger
3
+ Human-readable causation flow logging for operators and stakeholders.
4
+
5
+ Translates mathematical events into stories humans can understand and act upon.
6
+ """
7
+
8
+ import time
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ from typing import Any, Dict, List, Optional
12
+ from datetime import datetime
13
+
14
+
15
+ class ImpactLevel(Enum):
16
+ """Business impact levels"""
17
+ CRITICAL = "🔴 CRITICAL" # Service down, data loss
18
+ HIGH = "🟠 HIGH" # Degraded performance, user impact
19
+ MEDIUM = "🟡 MEDIUM" # Issues detected, monitoring needed
20
+ LOW = "🟢 LOW" # Informational, routine operations
21
+ TRACE = "🔵 TRACE" # Detailed flow, debugging
22
+
23
+
24
+ @dataclass
25
+ class InterpretiveEntry:
26
+ """A human-readable system event"""
27
+ timestamp: float = field(default_factory=time.time)
28
+ impact: ImpactLevel = ImpactLevel.LOW
29
+ system: str = "" # High-level system name
30
+ component: str = "" # Specific component
31
+ event: str = "" # What happened
32
+ context: str = "" # Why it matters
33
+ consequence: str = "" # What happens next
34
+ metrics: Dict[str, Any] = field(default_factory=dict)
35
+ recommendation: Optional[str] = None
36
+
37
+ def format_display(self) -> str:
38
+ """Format for beautiful terminal output with colors"""
39
+ time_str = datetime.fromtimestamp(self.timestamp).strftime("%H:%M:%S")
40
+
41
+ # ANSI color codes
42
+ colors = {
43
+ "CRITICAL": ("\033[91m", "🔴"), # Bright red
44
+ "HIGH": ("\033[31m", "🟠"), # Red
45
+ "MEDIUM": ("\033[33m", "🟡"), # Yellow
46
+ "LOW": ("\033[32m", "🟢"), # Green
47
+ "TRACE": ("\033[90m", "🔵"), # Gray
48
+ "RESET": "\033[0m",
49
+ "BOLD": "\033[1m",
50
+ "DIM": "\033[2m",
51
+ "CYAN": "\033[36m",
52
+ "MAGENTA": "\033[35m",
53
+ }
54
+
55
+ color, icon = colors.get(self.impact.name, ("\033[0m", "⚪"))  # keys are bare level names; .value includes the emoji prefix
56
+ reset = colors["RESET"]
57
+ bold = colors["BOLD"]
58
+ dim = colors["DIM"]
59
+ cyan = colors["CYAN"]
60
+ magenta = colors["MAGENTA"]
61
+
62
+ lines = [
63
+ f"\n{color}{bold}{icon} {self.impact.name} [{time_str}] {self.system}{reset}",
64
+ f"├─ {cyan}Component:{reset} {self.component}",
65
+ f"├─ {magenta}Event:{reset} {self.event}",
66
+ f"├─ {dim}Context:{reset} {self.context}",
67
+ f"├─ {dim}Consequence:{reset} {self.consequence}",
68
+ ]
69
+
70
+ if self.metrics:
71
+ lines.append(f"├─ {cyan}Metrics:{reset} {self._format_metrics()}")
72
+
73
+ if self.recommendation:
74
+ lines.append(f"└─ {bold}Recommendation:{reset} {self.recommendation}")
75
+ else:
76
+ lines.append(f"└─ {dim}Status: Monitoring{reset}")
77
+
78
+ return "\n".join(lines)
79
+
80
+ def _format_metrics(self) -> str:
81
+ """Format metrics nicely"""
82
+ return ", ".join([f"{k}={v}" for k, v in self.metrics.items()])
83
+
84
+
85
+ class InterpretiveLogger:
86
+ """Human-readable system storytelling"""
87
+
88
+ def __init__(self, system_name: str):
89
+ self.system = system_name
90
+ self.entries: List[InterpretiveEntry] = []
91
+ self.start_time = time.time()
92
+
93
+ def log(self, impact: ImpactLevel, component: str, event: str,
94
+ context: str, consequence: str,
95
+ metrics: Optional[Dict] = None,
96
+ recommendation: Optional[str] = None):
97
+ """Record a system event"""
98
+
99
+ entry = InterpretiveEntry(
100
+ impact=impact,
101
+ system=self.system,
102
+ component=component,
103
+ event=event,
104
+ context=context,
105
+ consequence=consequence,
106
+ metrics=metrics or {},
107
+ recommendation=recommendation
108
+ )
109
+
110
+ self.entries.append(entry)
111
+ self._emit_to_container(entry)
112
+
113
+ def _emit_to_container(self, entry: InterpretiveEntry):
114
+ """Emit beautiful formatted log to container"""
115
+ print(entry.format_display())
116
+
117
+ # Convenience methods for common events
118
+ def service_start(self, component: str, port: int = None):
119
+ """Service started successfully"""
120
+ self.log(
121
+ ImpactLevel.LOW,
122
+ component,
123
+ "Service started",
124
+ f"Component initialized and ready for requests",
125
+ f"Accepting connections on port {port}" if port else "Ready for operations",
126
+ metrics={"port": port} if port else {},
127
+ recommendation="Monitor for healthy connections"
128
+ )
129
+
130
+ def service_error(self, component: str, error: str, impact: ImpactLevel = ImpactLevel.HIGH):
131
+ """Service encountered error"""
132
+ self.log(
133
+ impact,
134
+ component,
135
+ "Service error",
136
+ f"Component failed to process request",
137
+ f"May affect system reliability",
138
+ metrics={"error": error},
139
+ recommendation="Check component logs and restart if needed"
140
+ )
141
+
142
+ def data_processing(self, dataset: str, records: int, operations: List[str]):
143
+ """Data processing pipeline"""
144
+ self.log(
145
+ ImpactLevel.MEDIUM,
146
+ "DataProcessor",
147
+ f"Processing {dataset}",
148
+ f"Executing pipeline operations on dataset",
149
+ f"Will process {records:,} records through {len(operations)} stages",
150
+ metrics={
151
+ "dataset": dataset,
152
+ "records": records,
153
+ "operations": len(operations)
154
+ },
155
+ recommendation="Monitor processing progress and error rates"
156
+ )
157
+
158
+ def model_loaded(self, model_id: str, size_gb: float, device: str):
159
+ """AI model loaded into memory"""
160
+ self.log(
161
+ ImpactLevel.MEDIUM,
162
+ "ModelLoader",
163
+ f"Model {model_id} loaded",
164
+ f"Neural network loaded and ready for inference",
165
+ f"Consuming {size_gb:.1f}GB VRAM on {device}",
166
+ metrics={
167
+ "model": model_id,
168
+ "size_gb": size_gb,
169
+ "device": device
170
+ },
171
+ recommendation="Monitor GPU memory usage during inference"
172
+ )
173
+
174
+ def security_event(self, component: str, event: str, details: str):
175
+ """Security-related event"""
176
+ self.log(
177
+ ImpactLevel.CRITICAL,
178
+ component,
179
+ f"Security: {event}",
180
+ f"Security system detected potential threat",
181
+ f"Immediate investigation required",
182
+ metrics={"details": details},
183
+ recommendation="Review security logs and consider blocking source"
184
+ )
185
+
186
+ def performance_warning(self, component: str, metric: str, value: float, threshold: float):
187
+ """Performance threshold exceeded"""
188
+ self.log(
189
+ ImpactLevel.HIGH,
190
+ component,
191
+ f"Performance warning: {metric}",
192
+ f"Component performance degraded",
193
+ f"May impact user experience if continues",
194
+ metrics={metric: value, "threshold": threshold},
195
+ recommendation=f"Optimize {metric} or scale resources"
196
+ )
197
+
198
+ def cascade_observation(self, model: str, layers: int, merkle_root: str):
199
+ """CASCADE observed model execution"""
200
+ self.log(
201
+ ImpactLevel.LOW,  # ImpactLevel defines no INFO member
202
+ "CASCADE",
203
+ f"Model observation complete",
204
+ f"Cryptographic proof generated for model execution",
205
+ f"Merkle root provides verifiable audit trail",
206
+ metrics={
207
+ "model": model,
208
+ "layers": layers,
209
+ "merkle": merkle_root[:16] + "..."
210
+ },
211
+ recommendation="Store attestation for permanent records"
212
+ )
213
+
214
+ def fixed_point_convergence(self, operation: str, iterations: int, entities: int):
215
+ """Mathematical fixed point reached"""
216
+ self.log(
217
+ ImpactLevel.LOW,  # ImpactLevel defines no INFO member
218
+ "KleeneEngine",
219
+ f"Fixed point convergence",
220
+ f"{operation} completed after {iterations} iterations",
221
+ f"Resolved relationships for {entities} entities",
222
+ metrics={
223
+ "operation": operation,
224
+ "iterations": iterations,
225
+ "entities": entities
226
+ },
227
+ recommendation="Review convergence quality metrics"
228
+ )
229
+
230
+
231
+ # Global interpretive loggers
232
+ _interpretive_loggers: Dict[str, InterpretiveLogger] = {}
233
+
234
+
235
+ def get_interpretive_logger(system: str) -> InterpretiveLogger:
236
+ """Get or create interpretive logger for system"""
237
+ if system not in _interpretive_loggers:
238
+ _interpretive_loggers[system] = InterpretiveLogger(system)
239
+ return _interpretive_loggers[system]
240
+
241
+
242
+ # Bridge function to translate Kleene logs to interpretive
243
+ def translate_kleene_to_interpretive(kleene_entry, interpretive_logger):
244
+ """Translate mathematical log to human story"""
245
+
246
+ # Map Kleene levels to impact levels
247
+ impact_map = {
248
+ "CRITICAL": ImpactLevel.CRITICAL,
249
+ "ERROR": ImpactLevel.HIGH,
250
+ "WARNING": ImpactLevel.MEDIUM,
251
+ "INFO": ImpactLevel.LOW,
252
+ "DEBUG": ImpactLevel.TRACE,
253
+ "TRACE": ImpactLevel.TRACE
254
+ }
255
+
256
+ # Create human-readable context
257
+ if kleene_entry.fixed_point_reached:
258
+ event = f"Mathematical convergence achieved"
259
+ context = f"Operation {kleene_entry.operation} reached stable state"
260
+ consequence = "System can proceed with verified result"
261
+ else:
262
+ event = f"State transition in {kleene_entry.operation}"
263
+ context = f"Component processing through iterations"
264
+ consequence = "Continuing toward fixed point"
265
+
266
+ interpretive_logger.log(
267
+ impact_map.get(kleene_entry.level.value, ImpactLevel.LOW),
268
+ kleene_entry.component,
269
+ event,
270
+ context,
271
+ consequence,
272
+ metrics={
273
+ "iterations": kleene_entry.iteration_count,
274
+ "hash": kleene_entry.hash_value
275
+ }
276
+ )
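
The convenience methods above are thin wrappers over `log()` with the context and consequence pre-written, so call sites stay one line. A short sketch with illustrative values:

```python
# Sketch: using the pre-written story helpers of InterpretiveLogger.
from cascade.logging.interpretive_logger import get_interpretive_logger

ops = get_interpretive_logger("Model Observatory")
ops.model_loaded("mistralai/Mixtral-8x22B-Instruct-v0.1", size_gb=88.0, device="cuda")
ops.performance_warning("InferenceServer", metric="latency_ms", value=950.0, threshold=500.0)
ops.service_error("InferenceServer", error="connection reset by peer")
```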
cascade/logging/kleene_logger.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ CASCADE Kleene Fixed Point Logger
3
+ Industry-standard mathematical logging for debugging and verification.
4
+
5
+ Each log entry is a fixed point observation - hashable, verifiable, complete.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any, Dict, List, Optional
14
+ from contextlib import contextmanager
15
+
16
+
17
+ class LogLevel(Enum):
18
+ """Mathematical significance levels"""
19
+ CRITICAL = "CRITICAL" # System-breaking fixed point failure
20
+ ERROR = "ERROR" # Fixed point not reached
21
+ WARNING = "WARNING" # Unexpected state transition
22
+ INFO = "INFO" # Fixed point achieved
23
+ DEBUG = "DEBUG" # State transition details
24
+ TRACE = "TRACE" # Every computation step
25
+
26
+
27
+ @dataclass
28
+ class KleeneLogEntry:
29
+ """A single fixed point observation"""
30
+ timestamp: float = field(default_factory=time.time)
31
+ level: LogLevel = LogLevel.INFO
32
+ component: str = ""
33
+ operation: str = ""
34
+ state_before: Optional[Dict] = None
35
+ state_after: Optional[Dict] = None
36
+ fixed_point_reached: bool = False
37
+ iteration_count: int = 0
38
+ hash_value: str = field(init=False)
39
+
40
+ def __post_init__(self):
41
+ # Create content hash for verifiability
42
+ content = {
43
+ "timestamp": self.timestamp,
44
+ "component": self.component,
45
+ "operation": self.operation,
46
+ "state_before": self.state_before,
47
+ "state_after": self.state_after,
48
+ "iteration": self.iteration_count
49
+ }
50
+ self.hash_value = hashlib.sha256(
51
+ json.dumps(content, sort_keys=True).encode()
52
+ ).hexdigest()[:16]
53
+
54
+ def to_dict(self) -> Dict[str, Any]:
55
+ return {
56
+ "ts": self.timestamp,
57
+ "lvl": self.level.value,
58
+ "comp": self.component,
59
+ "op": self.operation,
60
+ "before": self.state_before,
61
+ "after": self.state_after,
62
+ "fixed": self.fixed_point_reached,
63
+ "iter": self.iteration_count,
64
+ "hash": self.hash_value
65
+ }
66
+
67
+
68
+ class KleeneLogger:
69
+ """Mathematical logging for fixed point systems"""
70
+
71
+ def __init__(self, component_name: str):
72
+ self.component = component_name
73
+ self.entries: List[KleeneLogEntry] = []
74
+ self.session_start = time.time()
75
+ self.operation_count = 0
76
+
77
+ def log(self, level: LogLevel, operation: str,
78
+ state_before: Optional[Dict] = None,
79
+ state_after: Optional[Dict] = None,
80
+ fixed_point: bool = False,
81
+ iterations: int = 0):
82
+ """Record a state transition"""
83
+
84
+ entry = KleeneLogEntry(
85
+ level=level,
86
+ component=self.component,
87
+ operation=operation,
88
+ state_before=state_before,
89
+ state_after=state_after,
90
+ fixed_point_reached=fixed_point,
91
+ iteration_count=iterations
92
+ )
93
+
94
+ self.entries.append(entry)
95
+ self._emit_to_container(entry)
96
+
97
+ def _emit_to_container(self, entry: KleeneLogEntry):
98
+ """Emit structured log to container with colors"""
99
+ # ANSI color codes
100
+ colors = {
101
+ "CRITICAL": "\033[91m", # Bright red
102
+ "ERROR": "\033[31m", # Red
103
+ "WARNING": "\033[33m", # Yellow
104
+ "INFO": "\033[32m", # Green
105
+ "DEBUG": "\033[36m", # Cyan
106
+ "TRACE": "\033[90m", # Gray
107
+ "RESET": "\033[0m", # Reset
108
+ "BOLD": "\033[1m", # Bold
109
+ "DIM": "\033[2m", # Dim
110
+ }
111
+
112
+ color = colors.get(entry.level.value, colors["RESET"])
113
+ reset = colors["RESET"]
114
+ dim = colors["DIM"]
115
+
116
+ # Format with colors
117
+ print(f"{color}[KLEENE]{reset} {color}{entry.level.value:8}{reset} | "
118
+ f"{dim}{entry.component:20}{reset} | "
119
+ f"{entry.operation:30} | "
120
+ f"Iter:{entry.iteration_count:3} | "
121
+ f"Fixed:{'Y' if entry.fixed_point_reached else 'N':1} | "
122
+ f"{dim}Hash:{entry.hash_value}{reset}")
123
+
124
+ @contextmanager
125
+ def observe_operation(self, operation: str, initial_state: Dict):
126
+ """Context manager for observing operations"""
127
+ self.operation_count += 1
128
+ iterations = 0
129
+
130
+ try:
131
+ self.log(LogLevel.DEBUG, f"{operation}_start",
132
+ state_before=initial_state)
133
+
134
+ # Yield control back to operation
135
+ yield self
136
+
137
+ # Operation completed successfully
138
+ self.log(LogLevel.INFO, f"{operation}_complete",
139
+ fixed_point=True, iterations=iterations)
140
+
141
+ except Exception as e:
142
+ self.log(LogLevel.ERROR, f"{operation}_failed",
143
+ state_after={"error": str(e)})
144
+ raise
145
+
146
+ def fixed_point(self, operation: str, final_state: Dict, iterations: int):
147
+ """Log successful fixed point convergence"""
148
+ self.log(LogLevel.INFO, f"{operation}_fixed_point",
149
+ state_after=final_state,
150
+ fixed_point=True,
151
+ iterations=iterations)
152
+
153
+ def divergence(self, operation: str, state: Dict):
154
+ """Log when system diverges (no fixed point)"""
155
+ self.log(LogLevel.WARNING, f"{operation}_divergence",
156
+ state_after=state,
157
+ fixed_point=False)
158
+
159
+ def critical_failure(self, operation: str, error_state: Dict):
160
+ """Log critical system failure"""
161
+ self.log(LogLevel.CRITICAL, f"{operation}_critical",
162
+ state_after=error_state,
163
+ fixed_point=False)
164
+
165
+ def get_session_hash(self) -> str:
166
+ """Get hash of entire session for verification"""
167
+ content = {
168
+ "component": self.component,
169
+ "start": self.session_start,
170
+ "operations": self.operation_count,
171
+ "entries": [e.hash_value for e in self.entries]
172
+ }
173
+ return hashlib.sha256(json.dumps(content).encode()).hexdigest()
174
+
175
+
176
+ # Global loggers for major components
177
+ _loggers: Dict[str, KleeneLogger] = {}
178
+
179
+
180
+ def get_kleene_logger(component: str) -> KleeneLogger:
181
+ """Get or create logger for component"""
182
+ if component not in _loggers:
183
+ _loggers[component] = KleeneLogger(component)
184
+ return _loggers[component]
185
+
186
+
187
+ # Convenience decorators
188
+ def log_fixed_point(operation: str):
189
+ """Decorator to automatically log fixed point operations"""
190
+ def decorator(func):
191
+ def wrapper(*args, **kwargs):
192
+ logger = get_kleene_logger(func.__module__)
193
+ start_state = {"args": str(args), "kwargs": str(kwargs)}
194
+
195
+ try:
196
+ result = func(*args, **kwargs)
197
+ logger.fixed_point(operation, {"result": str(result)}, 1)
198
+ return result
199
+ except Exception as e:
200
+ logger.critical_failure(operation, {"error": str(e)})
201
+ raise
202
+ return wrapper
203
+ return decorator
204
+
205
+
206
+ def log_iterations(operation: str):
207
+ """Decorator for operations that iterate to fixed points"""
208
+ def decorator(func):
209
+ def wrapper(*args, **kwargs):
210
+ logger = get_kleene_logger(func.__module__)
211
+
212
+ # Simulate iteration counting (real implementation would track)
213
+ result = func(*args, **kwargs)
214
+ iterations = getattr(result, 'iterations', 1)
215
+
216
+ logger.fixed_point(operation, {"converged": True}, iterations)
217
+ return result
218
+ return wrapper
219
+ return decorator
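
Because every entry is content-hashed, a whole session collapses to a single verifiable hash. A minimal sketch of the intended call pattern:

```python
# Sketch: recording a fixed point run and verifying the session afterwards.
from cascade.logging.kleene_logger import get_kleene_logger

resolver = get_kleene_logger("EntityResolver")

with resolver.observe_operation("resolve_entities", {"entities": 128}):
    pass  # the real iteration loop would run here

resolver.fixed_point("resolve_entities", {"entities": 128, "merged": 7}, iterations=4)
print("session hash:", resolver.get_session_hash())
```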
cascade/logging/log_manager.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ CASCADE Log Manager
3
+ Orchestrates the tsunami of data into ordered causation troops.
4
+
5
+ Manages log levels, routing, and the beautiful display of system truth.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import time
11
+ from typing import Dict, List, Optional, Any
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+
15
+ from .kleene_logger import KleeneLogger, LogLevel
16
+ from .interpretive_logger import InterpretiveLogger, ImpactLevel
17
+
18
+
19
+ class LogMode(Enum):
20
+ """The two modes of logging excellence"""
21
+ KLEENE = "kleene" # Mathematical precision
22
+ INTERPRETIVE = "interpretive" # Human stories
23
+ DUAL = "dual" # Both simultaneously
24
+
25
+
26
+ @dataclass
27
+ class LogConfig:
28
+ """Configuration for logging behavior"""
29
+ mode: LogMode = LogMode.DUAL
30
+ min_level_kleene: LogLevel = LogLevel.INFO
31
+ min_level_interpretive: ImpactLevel = ImpactLevel.LOW
32
+ show_metrics: bool = True
33
+ show_timestamps: bool = True
34
+ color_output: bool = True
35
+ file_output: bool = False
36
+ max_file_size_mb: int = 100
37
+
38
+
39
+ class CascadeLogManager:
40
+ """The conductor of your causation orchestra"""
41
+
42
+ def __init__(self, config: Optional[LogConfig] = None):
43
+ self.config = config or LogConfig()
44
+ self.kleene_loggers: Dict[str, KleeneLogger] = {}
45
+ self.interpretive_loggers: Dict[str, InterpretiveLogger] = {}
46
+ self.start_time = time.time()
47
+ self.operation_count = 0
48
+
49
+ # Initialize display
50
+ self._setup_display()
51
+
52
+ def _setup_display(self):
53
+ """Setup beautiful terminal output"""
54
+ if self.config.color_output:
55
+ # Enable ANSI colors
56
+ sys.stdout.reconfigure(encoding='utf-8')
57
+
58
+ # Print header
59
+ self._print_header()
60
+
61
+ def _print_header(self):
62
+ """Print beautiful cascade header with colors"""
63
+ # ANSI color codes
64
+ colors = {
65
+ "WAVE": "\033[94m", # Bright blue
66
+ "BRIDGE": "\033[96m", # Cyan
67
+ "BOLD": "\033[1m",
68
+ "DIM": "\033[2m",
69
+ "RESET": "\033[0m",
70
+ "GREEN": "\033[32m",
71
+ "YELLOW": "\033[33m",
72
+ }
73
+
74
+ wave = colors["WAVE"]
75
+ bridge = colors["BRIDGE"]
76
+ bold = colors["BOLD"]
77
+ dim = colors["DIM"]
78
+ reset = colors["RESET"]
79
+ green = colors["GREEN"]
80
+ yellow = colors["YELLOW"]
81
+
82
+ print(f"\n{bold}{'='*80}{reset}")
83
+ print(f"{wave}🌊{reset} {bold}CASCADE // TRUTH INFRASTRUCTURE{reset} {bridge}🧠{reset}")
84
+ print(f"{bold}{'='*80}{reset}")
85
+ print(f"{bold}Mode:{reset} {green}{self.config.mode.value.upper()}{reset}")
86
+ print(f"{bold}Started:{reset} {dim}{time.strftime('%Y-%m-%d %H:%M:%S')}{reset}")
87
+ print(f"{bold}{'='*80}{reset}\n")
88
+
89
+ def register_component(self, component: str, system: str = "CASCADE"):
90
+ """Register a component for logging"""
91
+ if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
92
+ kleene = KleeneLogger(component)
93
+ self.kleene_loggers[component] = kleene
94
+
95
+ if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
96
+ interpretive = InterpretiveLogger(system)
97
+ self.interpretive_loggers[system] = interpretive
98
+
99
+ def log_operation(self, component: str, operation: str,
100
+ level: LogLevel = LogLevel.INFO,
101
+ impact: ImpactLevel = ImpactLevel.LOW,
102
+ details: Optional[Dict] = None):
103
+ """Log an operation across all active loggers"""
104
+ self.operation_count += 1
105
+
106
+ if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
107
+ if component in self.kleene_loggers:
108
+ self.kleene_loggers[component].log(
109
+ level, operation,
110
+ state_before=details.get("before") if details else None,
111
+ state_after=details.get("after") if details else None,
112
+ fixed_point=details.get("fixed_point", False) if details else False,
113
+ iterations=details.get("iterations", 0) if details else 0
114
+ )
115
+
116
+ if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
117
+ # Find interpretive logger for component
118
+ system = details.get("system", "CASCADE") if details else "CASCADE"
119
+ if system in self.interpretive_loggers:
120
+ self.interpretive_loggers[system].log(
121
+ impact, component, operation,
122
+ context=details.get("context", "") if details else "",
123
+ consequence=details.get("consequence", "") if details else "",
124
+ metrics=details.get("metrics", {}) if details else {},
125
+ recommendation=details.get("recommendation") if details else None
126
+ )
127
+
128
+ def get_session_stats(self) -> Dict[str, Any]:
129
+ """Get beautiful session statistics"""
130
+ total_kleene = sum(len(logger.entries) for logger in self.kleene_loggers.values())
131
+ total_interpretive = sum(len(logger.entries) for logger in self.interpretive_loggers.values())
132
+
133
+ return {
134
+ "uptime_seconds": time.time() - self.start_time,
135
+ "operations": self.operation_count,
136
+ "kleene_entries": total_kleene,
137
+ "interpretive_entries": total_interpretive,
138
+ "active_components": len(self.kleene_loggers),
139
+ "active_systems": len(self.interpretive_loggers)
140
+ }
141
+
142
+ def print_summary(self):
143
+ """Print beautiful session summary with colors"""
144
+ stats = self.get_session_stats()
145
+
146
+ # ANSI color codes
147
+ colors = {
148
+ "BOLD": "\033[1m",
149
+ "DIM": "\033[2m",
150
+ "RESET": "\033[0m",
151
+ "CYAN": "\033[36m",
152
+ "GREEN": "\033[32m",
153
+ "YELLOW": "\033[33m",
154
+ "BLUE": "\033[34m",
155
+ "MAGENTA": "\033[35m",
156
+ }
157
+
158
+ bold = colors["BOLD"]
159
+ dim = colors["DIM"]
160
+ reset = colors["RESET"]
161
+ cyan = colors["CYAN"]
162
+ green = colors["GREEN"]
163
+ yellow = colors["YELLOW"]
164
+ blue = colors["BLUE"]
165
+ magenta = colors["MAGENTA"]
166
+
167
+ print(f"\n{bold}{'='*80}{reset}")
168
+ print(f"{cyan}📊 CASCADE SESSION SUMMARY{reset}")
169
+ print(f"{bold}{'='*80}{reset}")
170
+ print(f"{bold}Uptime:{reset} {stats['uptime_seconds']:.1f} seconds")
171
+ print(f"{bold}Operations:{reset} {green}{stats['operations']:,}{reset}")
172
+ print(f"{bold}Kleene Entries:{reset} {yellow}{stats['kleene_entries']:,}{reset}")
173
+ print(f"{bold}Interpretive Entries:{reset} {blue}{stats['interpretive_entries']:,}{reset}")
174
+ print(f"{bold}Active Components:{reset} {magenta}{stats['active_components']}{reset}")
175
+ print(f"{bold}Active Systems:{reset} {magenta}{stats['active_systems']}{reset}")
176
+
177
+ if stats['kleene_entries'] > 0:
178
+ # Get session hash from first logger
179
+ first_logger = next(iter(self.kleene_loggers.values()))
180
+ print(f"{bold}Session Hash:{reset} {dim}{first_logger.get_session_hash()}{reset}")
181
+
182
+ print(f"{bold}{'='*80}{reset}")
183
+
184
+ def set_mode(self, mode: LogMode):
185
+ """Switch logging mode dynamically"""
186
+ old_mode = self.config.mode
187
+ self.config.mode = mode
188
+
189
+ print(f"\n🔄 Logging mode changed: {old_mode.value} → {mode.value}")
190
+
191
+ def enable_file_logging(self, filepath: str):
192
+ """Enable logging to file"""
193
+ self.config.file_output = True
194
+ # TODO: Implement file logging
195
+ print(f"📁 File logging enabled: {filepath}")
196
+
197
+
198
+ # Global log manager instance
199
+ _log_manager: Optional[CascadeLogManager] = None
200
+
201
+
202
+ def init_logging(config: Optional[LogConfig] = None) -> CascadeLogManager:
203
+ """Initialize the global CASCADE logging system"""
204
+ global _log_manager
205
+ _log_manager = CascadeLogManager(config)
206
+ return _log_manager
207
+
208
+
209
+ def get_log_manager() -> CascadeLogManager:
210
+ """Get the global log manager"""
211
+ global _log_manager
212
+ if _log_manager is None:
213
+ _log_manager = CascadeLogManager()
214
+ return _log_manager
215
+
216
+
217
+ def log(component: str, operation: str, context: str = "", consequence: str = "",
218
+ metrics: Dict[str, Any] = None, impact: str = "LOW", **kwargs):
219
+ """Quick log operation - convenience function"""
220
+ manager = get_log_manager()
221
+ # Forward level/impact to the manager instead of burying them in the details dict
+ impact_level = impact if isinstance(impact, ImpactLevel) else ImpactLevel[str(impact).upper()]
+ manager.log_operation(component, operation,
+ level=kwargs.pop("level", LogLevel.INFO),
+ impact=impact_level,
222
+ details={
223
+ "context": context,
224
+ "consequence": consequence,
225
+ "metrics": metrics or {},
226
+ "impact": impact,
227
+ **kwargs
228
+ })
229
+
230
+
231
+ def log_fixed_point(component: str, operation: str, iterations: int, **kwargs):
232
+ """Log successful fixed point"""
233
+ log(component, operation,
234
+ level=LogLevel.INFO,
235
+ impact=ImpactLevel.LOW,
236
+ details={
237
+ "fixed_point": True,
238
+ "iterations": iterations,
239
+ **kwargs
240
+ })
241
+
242
+
243
+ def log_error(component: str, operation: str, error: str, **kwargs):
244
+ """Log error condition"""
245
+ log(component, f"{operation}_error",
246
+ level=LogLevel.ERROR,
247
+ impact=ImpactLevel.HIGH,
248
+ details={
249
+ "context": f"Operation failed: {error}",
250
+ "consequence": "System may be degraded",
251
+ "metrics": {"error": error},
252
+ **kwargs
253
+ })
254
+
255
+
256
+ def log_performance(component: str, metric: str, value: float, threshold: float):
257
+ """Log performance warning"""
258
+ log(component, f"performance_{metric}",
259
+ level=LogLevel.WARNING,
260
+ impact=ImpactLevel.MEDIUM,
261
+ details={
262
+ "context": f"Performance metric {metric} exceeded threshold",
263
+ "consequence": "May impact system performance",
264
+ "metrics": {metric: value, "threshold": threshold},
265
+ "recommendation": f"Optimize {metric} or scale resources"
266
+ })
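
The manager is the single switchboard: components register once, and every `log_operation` call fans out to whichever layers the configured mode enables. A sketch in interpretive-only mode (values illustrative):

```python
# Sketch: initialising the global manager and routing one operation through it.
from cascade.logging.log_manager import init_logging, LogConfig, LogMode

manager = init_logging(LogConfig(mode=LogMode.INTERPRETIVE, color_output=False))
manager.register_component("DataProcessor", system="Data Unity")

manager.log_operation("DataProcessor", "ingest",
                      details={"system": "Data Unity",
                               "context": "Loading batch",
                               "consequence": "Rows staged for entity resolution",
                               "metrics": {"rows": 1024}})

manager.print_summary()
```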
cascade/observation.py ADDED
@@ -0,0 +1,397 @@
1
+ """
2
+ CASCADE Observation Manager
3
+
4
+ Connects the detective tabs (Observatory, Unity, System) to the lattice.
5
+
6
+ Flow:
7
+ 1. User runs observation through any tab
8
+ 2. Observation creates provenance chain
9
+ 3. Chain links to model identity (for model obs) or genesis (for data/system)
10
+ 4. Chain saved to lattice
11
+ 5. Optionally pinned to IPFS
12
+
13
+ This is the integration layer between UI and lattice.
14
+ """
15
+
16
+ import json
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Optional, Dict, Any, List
20
+ from dataclasses import dataclass, field
21
+
22
+ from cascade.core.provenance import ProvenanceChain
23
+ from cascade.identity import ModelRegistry, ModelIdentity, create_model_identity
24
+ from cascade.genesis import get_genesis_root, link_to_genesis
25
+
26
+
27
+ @dataclass
28
+ class Observation:
29
+ """
30
+ A single observation record in the lattice.
31
+
32
+ Can be:
33
+ - Model observation (inference through Observatory)
34
+ - Data observation (entity resolution through Unity)
35
+ - System observation (log analysis through System tab)
36
+ """
37
+ observation_id: str
38
+ observation_type: str # "model", "data", "system"
39
+
40
+ # What was observed
41
+ source_id: str # Model ID, dataset ID, or log source
42
+ source_root: str # Merkle root of source identity
43
+
44
+ # The observation data
45
+ chain: ProvenanceChain
46
+ merkle_root: str
47
+
48
+ # Metadata
49
+ user_hash: Optional[str] = None # Anonymous user identifier
50
+ created_at: float = field(default_factory=time.time)
51
+
52
+ # IPFS
53
+ cid: Optional[str] = None
54
+
55
+
56
+ class ObservationManager:
57
+ """
58
+ Manages observations across all CASCADE tabs.
59
+
60
+ Responsibilities:
61
+ - Link observations to model identities or genesis
62
+ - Save observations to lattice
63
+ - Track observation history
64
+ - Provide stats for lattice gateway
65
+ """
66
+
67
+ def __init__(self, lattice_dir: Path = None):
68
+ self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
69
+ self.observations_dir = self.lattice_dir / "observations"
70
+ self.observations_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+ # Model registry for linking model observations
73
+ self.model_registry = ModelRegistry(self.lattice_dir)
74
+
75
+ # Genesis root
76
+ self.genesis_root = get_genesis_root()
77
+
78
+ # In-memory observation index
79
+ self._observations: Dict[str, Observation] = {}
80
+ self._load_index()
81
+
82
+ def _load_index(self):
83
+ """Load observation index from disk."""
84
+ index_file = self.lattice_dir / "observation_index.json"
85
+ if index_file.exists():
86
+ try:
87
+ index = json.loads(index_file.read_text())
88
+ # Just load metadata, not full chains
89
+ for obs_id, meta in index.items():
90
+ self._observations[obs_id] = meta
91
+ except:
92
+ pass
93
+
94
+ def _save_index(self):
95
+ """Save observation index to disk."""
96
+ index_file = self.lattice_dir / "observation_index.json"
97
+ # Save lightweight index
98
+ index = {}
99
+ for obs_id, obs in self._observations.items():
100
+ if isinstance(obs, Observation):
101
+ index[obs_id] = {
102
+ "observation_id": obs.observation_id,
103
+ "observation_type": obs.observation_type,
104
+ "source_id": obs.source_id,
105
+ "source_root": obs.source_root,
106
+ "merkle_root": obs.merkle_root,
107
+ "created_at": obs.created_at,
108
+ "cid": obs.cid,
109
+ }
110
+ else:
111
+ index[obs_id] = obs
112
+ index_file.write_text(json.dumps(index, indent=2))
113
+
114
+ def observe_model(
115
+ self,
116
+ model_id: str,
117
+ chain: ProvenanceChain,
118
+ user_hash: Optional[str] = None,
119
+ **model_kwargs,
120
+ ) -> Observation:
121
+ """
122
+ Record a model observation.
123
+
124
+ Args:
125
+ model_id: HuggingFace model ID or local path
126
+ chain: Provenance chain from Observatory
127
+ user_hash: Anonymous user identifier
128
+ **model_kwargs: Additional model info (parameters, etc.)
129
+
130
+ Returns:
131
+ Observation linked to model identity
132
+ """
133
+ # Get or create model identity
134
+ identity = self.model_registry.get_or_create(model_id, **model_kwargs)
135
+
136
+ # Link chain to model identity
137
+ if not chain.external_roots:
138
+ chain.external_roots = []
139
+ if identity.merkle_root not in chain.external_roots:
140
+ chain.external_roots.append(identity.merkle_root)
141
+
142
+ # Finalize chain if not already
143
+ if not chain.finalized:
144
+ chain.finalize()
145
+
146
+ # Create observation record
147
+ obs_id = f"model_{chain.merkle_root}"
148
+ observation = Observation(
149
+ observation_id=obs_id,
150
+ observation_type="model",
151
+ source_id=model_id,
152
+ source_root=identity.merkle_root,
153
+ chain=chain,
154
+ merkle_root=chain.merkle_root,
155
+ user_hash=user_hash,
156
+ )
157
+
158
+ # Save chain to disk
159
+ self._save_observation(observation)
160
+
161
+ return observation
162
+
163
+ def observe_data(
164
+ self,
165
+ dataset_a: str,
166
+ dataset_b: str,
167
+ chain: ProvenanceChain,
168
+ user_hash: Optional[str] = None,
169
+ ) -> Observation:
170
+ """
171
+ Record a data unity observation.
172
+
173
+ Links directly to genesis (data doesn't have model identity).
174
+ """
175
+ # Link to genesis
176
+ if not chain.external_roots:
177
+ chain.external_roots = []
178
+ if self.genesis_root not in chain.external_roots:
179
+ chain.external_roots.append(self.genesis_root)
180
+
181
+ if not chain.finalized:
182
+ chain.finalize()
183
+
184
+ # Create observation
185
+ source_id = f"{dataset_a}::{dataset_b}"
186
+ obs_id = f"data_{chain.merkle_root}"
187
+
188
+ observation = Observation(
189
+ observation_id=obs_id,
190
+ observation_type="data",
191
+ source_id=source_id,
192
+ source_root=self.genesis_root,
193
+ chain=chain,
194
+ merkle_root=chain.merkle_root,
195
+ user_hash=user_hash,
196
+ )
197
+
198
+ self._save_observation(observation)
199
+ return observation
200
+
201
+ def observe_system(
202
+ self,
203
+ source_name: str,
204
+ chain: ProvenanceChain,
205
+ user_hash: Optional[str] = None,
206
+ ) -> Observation:
207
+ """
208
+ Record a system log observation.
209
+
210
+ Links directly to genesis.
211
+ """
212
+ # Link to genesis
213
+ if not chain.external_roots:
214
+ chain.external_roots = []
215
+ if self.genesis_root not in chain.external_roots:
216
+ chain.external_roots.append(self.genesis_root)
217
+
218
+ if not chain.finalized:
219
+ chain.finalize()
220
+
221
+ obs_id = f"system_{chain.merkle_root}"
222
+
223
+ observation = Observation(
224
+ observation_id=obs_id,
225
+ observation_type="system",
226
+ source_id=source_name,
227
+ source_root=self.genesis_root,
228
+ chain=chain,
229
+ merkle_root=chain.merkle_root,
230
+ user_hash=user_hash,
231
+ )
232
+
233
+ self._save_observation(observation)
234
+ return observation
235
+
236
+ def _save_observation(self, observation: Observation):
237
+ """Save observation to disk."""
238
+ # Save to index
239
+ self._observations[observation.observation_id] = observation
240
+ self._save_index()
241
+
242
+ # Save full chain
243
+ chain_file = self.observations_dir / f"{observation.merkle_root}.json"
244
+ chain_data = {
245
+ "observation_id": observation.observation_id,
246
+ "observation_type": observation.observation_type,
247
+ "source_id": observation.source_id,
248
+ "source_root": observation.source_root,
249
+ "user_hash": observation.user_hash,
250
+ "created_at": observation.created_at,
251
+ "cid": observation.cid,
252
+ "chain": observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else str(observation.chain),
253
+ }
254
+ chain_file.write_text(json.dumps(chain_data, indent=2, default=str))
255
+
256
+ def pin_observation(self, observation: Observation) -> Optional[str]:
257
+ """
258
+ Pin observation to IPFS.
259
+
260
+ Returns CID if successful.
261
+ """
262
+ try:
263
+ from cascade.ipld import chain_to_cid, encode_to_dag_cbor
264
+ from cascade.web3_pin import pin_file
265
+
266
+ # Convert to IPLD format
267
+ chain_data = observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else {}
268
+ cbor_data = encode_to_dag_cbor(chain_data)
269
+
270
+ # Save CBOR
271
+ cbor_file = self.observations_dir / f"{observation.merkle_root}.cbor"
272
+ cbor_file.write_bytes(cbor_data)
273
+
274
+ # Compute CID
275
+ cid = chain_to_cid(chain_data)
276
+ observation.cid = cid
277
+
278
+ # Update index
279
+ self._save_observation(observation)
280
+
281
+ return cid
282
+ except Exception as e:
283
+ print(f"Failed to pin observation: {e}")
284
+ return None
285
+
286
+ def get_observation(self, merkle_root: str) -> Optional[Observation]:
287
+ """Get observation by merkle root."""
288
+ for obs in self._observations.values():
289
+ if isinstance(obs, Observation) and obs.merkle_root == merkle_root:
290
+ return obs
291
+ elif isinstance(obs, dict) and obs.get("merkle_root") == merkle_root:
292
+ return obs
293
+ return None
294
+
295
+ def list_observations(
296
+ self,
297
+ observation_type: Optional[str] = None,
298
+ source_id: Optional[str] = None,
299
+ limit: int = 100,
300
+ ) -> List[Dict[str, Any]]:
301
+ """List observations with optional filters."""
302
+ results = []
303
+
304
+ for obs in self._observations.values():
305
+ if isinstance(obs, Observation):
306
+ obs_dict = {
307
+ "observation_id": obs.observation_id,
308
+ "observation_type": obs.observation_type,
309
+ "source_id": obs.source_id,
310
+ "merkle_root": obs.merkle_root,
311
+ "created_at": obs.created_at,
312
+ "cid": obs.cid,
313
+ }
314
+ else:
315
+ obs_dict = obs
316
+
317
+ # Apply filters
318
+ if observation_type and obs_dict.get("observation_type") != observation_type:
319
+ continue
320
+ if source_id and source_id not in obs_dict.get("source_id", ""):
321
+ continue
322
+
323
+ results.append(obs_dict)
324
+
325
+ # Sort by time, newest first
326
+ results.sort(key=lambda x: x.get("created_at", 0), reverse=True)
327
+
328
+ return results[:limit]
329
+
330
+ def get_stats(self) -> Dict[str, Any]:
331
+ """Get lattice statistics."""
332
+ obs_list = list(self._observations.values())
333
+
334
+ model_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "model") or (isinstance(o, dict) and o.get("observation_type") == "model")]
335
+ data_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "data") or (isinstance(o, dict) and o.get("observation_type") == "data")]
336
+ system_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "system") or (isinstance(o, dict) and o.get("observation_type") == "system")]
337
+
338
+ # Count unique models
339
+ model_ids = set()
340
+ for o in model_obs:
341
+ if isinstance(o, Observation):
342
+ model_ids.add(o.source_id)
343
+ elif isinstance(o, dict):
344
+ model_ids.add(o.get("source_id", ""))
345
+
346
+ return {
347
+ "total_observations": len(obs_list),
348
+ "model_observations": len(model_obs),
349
+ "data_observations": len(data_obs),
350
+ "system_observations": len(system_obs),
351
+ "unique_models": len(model_ids),
352
+ "registered_models": len(self.model_registry.list_all()),
353
+ "genesis_root": self.genesis_root,
354
+ }
355
+
356
+ def get_model_observations(self, model_id: str) -> List[Dict[str, Any]]:
357
+ """Get all observations for a specific model."""
358
+ return self.list_observations(observation_type="model", source_id=model_id)
359
+
360
+
361
+ # =============================================================================
362
+ # SINGLETON INSTANCE
363
+ # =============================================================================
364
+
365
+ _manager: Optional[ObservationManager] = None
366
+
367
+ def get_observation_manager() -> ObservationManager:
368
+ """Get singleton observation manager."""
369
+ global _manager
370
+ if _manager is None:
371
+ _manager = ObservationManager()
372
+ return _manager
373
+
374
+
375
+ # =============================================================================
376
+ # CLI
377
+ # =============================================================================
378
+
379
+ if __name__ == "__main__":
380
+ print("=== CASCADE Observation Manager ===\n")
381
+
382
+ manager = get_observation_manager()
383
+
384
+ # Show stats
385
+ stats = manager.get_stats()
386
+ print(f"Genesis: {stats['genesis_root']}")
387
+ print(f"Registered Models: {stats['registered_models']}")
388
+ print(f"Total Observations: {stats['total_observations']}")
389
+ print(f" - Model: {stats['model_observations']}")
390
+ print(f" - Data: {stats['data_observations']}")
391
+ print(f" - System: {stats['system_observations']}")
392
+ print(f"Unique Models Observed: {stats['unique_models']}")
393
+
394
+ # List recent observations
395
+ print("\nRecent Observations:")
396
+ for obs in manager.list_observations(limit=5):
397
+ print(f" [{obs['observation_type']}] {obs['source_id'][:40]}... → {obs['merkle_root']}")
cascade/observe.py ADDED
@@ -0,0 +1,231 @@
1
+ """
2
+ Cascade Observer CLI.
3
+
4
+ Wraps a target process and observes its output.
5
+
6
+ Usage:
7
+ python -m cascade.observe --cmd "python path/to/train.py --args..."
8
+
9
+ This module:
10
+ 1. Wraps the target process
11
+ 2. Pipes stdout/stderr -> Cascade Adapter
12
+ 3. Writes events to tape file (JSONL) and human log (Markdown)
13
+ 4. Emits events to event_queue for external consumers
14
+
15
+ For visualization, point a consumer at the event_queue or load the tape file
16
+ into your preferred visualization tool.
17
+ """
18
+
19
+ import sys
20
+ import subprocess
21
+ import argparse
22
+ import time
23
+ import json
24
+ import shlex
25
+ import shutil
26
+ from pathlib import Path
27
+ from queue import Queue
28
+
29
+ # Ensure package root is in path
30
+ sys.path.insert(0, str(Path(__file__).parent.parent))
31
+
32
+ from cascade import Monitor
33
+
34
+ # Shared event queue for external consumers (e.g., custom UIs)
35
+ event_queue: Queue = Queue()
36
+
37
+
38
+ def scoop_the_poop(log_dir: Path):
39
+ """
40
+ Baggies system - archive old logs on startup.
41
+ Keeps the logs folder clean. Old sessions go to baggies/.
42
+ """
43
+ baggies_dir = log_dir / "baggies"
44
+ baggies_dir.mkdir(parents=True, exist_ok=True)
45
+
46
+ # Find all old log files (not the current session)
47
+ tape_files = list(log_dir.glob("cascade_tape_*.jsonl"))
48
+ log_files = list(log_dir.glob("cascade_log_*.md"))
49
+
50
+ moved_count = 0
51
+ for f in tape_files + log_files:
52
+ if f.parent == log_dir: # Only files in root logs/, not baggies/
53
+ dest = baggies_dir / f.name
54
+ try:
55
+ shutil.move(str(f), str(dest))
56
+ moved_count += 1
57
+ except Exception as e:
58
+ print(f"[CASCADE] Could not archive {f.name}: {e}")
59
+
60
+ if moved_count > 0:
61
+ print(f"[CASCADE] 🧹 Scooped {moved_count} old logs → baggies/")
62
+
63
+
64
+ def main():
65
+ parser = argparse.ArgumentParser(
66
+ prog="cascade",
67
+ description="🌊 Cascade - Real-Time Neural Network Observability",
68
+ formatter_class=argparse.RawDescriptionHelpFormatter,
69
+ epilog="""
70
+ Examples:
71
+ cascade --cmd "python train.py"
72
+ cascade --cmd "python train.py --epochs=10"
73
+ cascade --cmd "python train.py" --cwd /path/to/project
74
+
75
+ Events are written to tape files in the log directory.
76
+ """
77
+ )
78
+
79
+ # Support both "cascade --cmd" and "cascade observe --cmd"
80
+ subparsers = parser.add_subparsers(dest="command")
81
+ observe_parser = subparsers.add_parser("observe", help="Observe a training process")
82
+
83
+ # Add args to both main parser and observe subparser
84
+ for p in [parser, observe_parser]:
85
+ p.add_argument("--cmd", required=True, help="Command to run the target process")
86
+ p.add_argument("--cwd", default=None, help="Working directory for the target (absolute path)")
87
+ p.add_argument("--log-dir", default="./logs", help="Directory for session tapes")
88
+ p.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")
89
+
90
+ args = parser.parse_args()
+ if not args.cmd:
+ parser.error("--cmd is required")
91
+
92
+ # Resolve working directory to absolute
93
+ if args.cwd:
94
+ work_dir = Path(args.cwd).resolve()
95
+ else:
96
+ work_dir = Path.cwd()
97
+
98
+ # 0. Set up the session tape (the session's primary output)
99
+ log_dir = Path(args.log_dir).resolve()
100
+ log_dir.mkdir(parents=True, exist_ok=True)
101
+
102
+ # 🧹 Scoop old logs before starting new session
103
+ scoop_the_poop(log_dir)
104
+
105
+ session_id = int(time.time())
106
+
107
+ # 1. Machine Tape (JSONL)
108
+ tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
109
+ tape_file = open(tape_path, "a", encoding="utf-8")
110
+
111
+ # 2. Human Log (Markdown)
112
+ human_path = log_dir / f"cascade_log_{session_id}.md"
113
+ human_file = open(human_path, "a", encoding="utf-8")
114
+
115
+ # Header for Human Log
116
+ human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
117
+ human_file.write(f"**Target:** `{args.cmd}`\n")
118
+ human_file.write(f"**Date:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
119
+ human_file.write("---\n\n")
120
+ human_file.flush()
121
+
122
+ print("="*60)
123
+ print("CASCADE // OBSERVER")
124
+ print(f"Target: {args.cmd}")
125
+ print(f"Tape: {tape_path.absolute()}")
126
+ print(f"Log: {human_path.absolute()}")
127
+ print("="*60)
128
+
129
+ # Init Monitor
130
+ monitor = Monitor("symbiont_alpha")
131
+
132
+ def write_human_entry(evt):
133
+ """Convert an event into an articulate log entry."""
134
+ t_str = time.strftime('%H:%M:%S', time.localtime(evt.timestamp))
135
+
136
+ # Narrative construction based on event type
137
+ if evt.event_type == "error":
138
+ icon = "🔴"
139
+ narrative = f"CRITICAL FAILURE in **{evt.component}**."
140
+ elif evt.event_type == "warning":
141
+ icon = "⚠️"
142
+ narrative = f"Warning signal detected from **{evt.component}**."
143
+ elif evt.event_type == "state_change":
144
+ icon = "🔄"
145
+ narrative = f"State transition observed in **{evt.component}**."
146
+ elif "loss" in str(evt.data):
147
+ icon = "📉"
148
+ narrative = f"Optimization step completed by **{evt.component}**."
149
+ else:
150
+ icon = "ℹ️"
151
+ narrative = f"Standard event recorded from **{evt.component}**."
152
+
153
+ # Write readable block
154
+ human_file.write(f"### {icon} {t_str} // {evt.event_type.upper()}\n")
155
+ human_file.write(f"{narrative}\n")
156
+ if evt.data:
157
+ # Format data as a clean list or quote
158
+ human_file.write("```yaml\n")
159
+ for k, v in evt.data.items():
160
+ human_file.write(f"{k}: {v}\n")
161
+ human_file.write("```\n")
162
+ human_file.write("\n")
163
+ human_file.flush()
164
+
165
+ # Launch Target
166
+ try:
167
+ # Split command for subprocess if it's a string
168
+ cmd_parts = shlex.split(args.cmd)
169
+
170
+ process = subprocess.Popen(
171
+ cmd_parts,
172
+ cwd=str(work_dir),
173
+ stdout=subprocess.PIPE,
174
+ stderr=subprocess.STDOUT,
175
+ text=True,
176
+ bufsize=1
177
+ )
178
+
179
+ print(f"[CASCADE] Linked to target. Recording to tape & log...")
180
+
181
+ for line in process.stdout:
182
+ line = line.strip()
183
+ if not line: continue
184
+
185
+ # Feed Adapter
186
+ event = monitor.observe(line)
187
+
188
+ # Build payload with FULL wealth: metrics + triage + raw
189
+ metrics_summary = monitor.metrics.summary()
190
+ triage_status = monitor.metrics.triage()
191
+
192
+ payload = {
193
+ "event": {
194
+ "event_id": event.event_id,
195
+ "timestamp": event.timestamp,
196
+ "component": event.component,
197
+ "event_type": event.event_type,
198
+ "data": event.data,
199
+ "raw": line, # Include original line for drill-down
200
+ },
201
+ "metrics": metrics_summary,
202
+ "triage": triage_status,
203
+ }
204
+
205
+ # Emit to queue for external consumers
206
+ event_queue.put(payload)
207
+
208
+ # Write to Tape (Machine)
209
+ tape_file.write(json.dumps(payload) + "\n")
210
+ tape_file.flush()
211
+
212
+ # Write to Log (Human)
213
+ write_human_entry(event)
214
+
215
+ # Echo to console (unless quiet)
216
+ if not args.quiet:
217
+ print(f"[RAW] {line}")
218
+
219
+ except KeyboardInterrupt:
220
+ print("\n[CASCADE] Detaching...")
221
+ except Exception as e:
222
+ print(f"[CASCADE] Error: {e}")
223
+ finally:
224
+ tape_file.close()
225
+ human_file.close()
226
+ if 'process' in locals() and process.poll() is None:
227
+ process.terminate()
228
+ print(f"[CASCADE] Session complete. Tape: {tape_path}")
229
+
230
+ if __name__ == "__main__":
231
+ main()
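
The tape file written above is plain JSONL, one payload per observed output line, each with `event`, `metrics`, and `triage` keys. A minimal offline reader, assuming only that layout; the tape path below is a hypothetical session file, not one produced by this commit.

```python
# Offline tape reader sketch. Assumes only the payload structure written by
# cascade.observe above: {"event": {...}, "metrics": {...}, "triage": ...}.
import json
from pathlib import Path

# Hypothetical session file; substitute a real path from your logs/ directory.
tape_path = Path("logs/cascade_tape_1700000000.jsonl")

errors = 0
last_triage = None
with tape_path.open(encoding="utf-8") as fh:
    for raw in fh:
        payload = json.loads(raw)
        if payload["event"]["event_type"] == "error":
            errors += 1
        # Every record carries the metrics snapshot and triage status the
        # observer computed when the event was recorded.
        last_triage = payload.get("triage")

print(f"errors on tape: {errors}, final triage: {last_triage}")
```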
cascade/patches/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ CASCADE Patches - Auto-intercept LLM provider libraries
3
+
4
+ Each patch module wraps a provider's API to automatically emit receipts.
5
+ """
6
+
7
+ from .openai_patch import patch_openai
8
+ from .anthropic_patch import patch_anthropic
9
+ from .huggingface_patch import patch_huggingface
10
+ from .ollama_patch import patch_ollama
11
+ from .litellm_patch import patch_litellm
12
+
13
+ __all__ = [
14
+ "patch_openai",
15
+ "patch_anthropic",
16
+ "patch_huggingface",
17
+ "patch_ollama",
18
+ "patch_litellm",
19
+ ]
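
A sketch of how these patchers might be applied at startup. Only the function names are shown above; the zero-argument call signature is an assumption, so check each patch module for the actual parameters before relying on this.

```python
# Hypothetical wiring sketch: the patch_* functions exist per the __init__
# above, but their signatures are not shown there, so the zero-argument
# calls below are an assumption rather than the documented API.
from cascade.patches import patch_openai, patch_anthropic

patch_openai()      # assumed: wraps the OpenAI client so calls emit receipts
patch_anthropic()   # assumed: same for the Anthropic client
```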