Commit 77bcbf1
Initial commit - cascade-lattice 0.5.4
This view is limited to 50 files because it contains too many changes; see the raw diff for the full changeset.
- .github/workflows/publish.yml +31 -0
- .gitignore +35 -0
- LICENSE +21 -0
- README.md +70 -0
- cascade/__init__.py +290 -0
- cascade/analysis/__init__.py +37 -0
- cascade/analysis/metrics.py +1168 -0
- cascade/analysis/tracer.py +487 -0
- cascade/bridge.py +265 -0
- cascade/cli_main.py +851 -0
- cascade/core/__init__.py +13 -0
- cascade/core/adapter.py +470 -0
- cascade/core/event.py +177 -0
- cascade/core/graph.py +292 -0
- cascade/core/provenance.py +601 -0
- cascade/core/web3_bridge.py +846 -0
- cascade/data/__init__.py +112 -0
- cascade/data/croissant.py +289 -0
- cascade/data/entities.py +349 -0
- cascade/data/hub.py +533 -0
- cascade/data/license.py +635 -0
- cascade/data/live.py +844 -0
- cascade/data/observer.py +666 -0
- cascade/data/pii.py +748 -0
- cascade/data/provenance.py +503 -0
- cascade/data/schema.py +417 -0
- cascade/demo.py +174 -0
- cascade/demo_sdk.py +114 -0
- cascade/export/__init__.py +23 -0
- cascade/export/tableau_export.py +598 -0
- cascade/forensics/__init__.py +53 -0
- cascade/forensics/analyzer.py +464 -0
- cascade/forensics/artifacts.py +1063 -0
- cascade/forensics/fingerprints.py +328 -0
- cascade/genesis.py +200 -0
- cascade/hold/__init__.py +82 -0
- cascade/hold/primitives.py +673 -0
- cascade/hold/session.py +707 -0
- cascade/identity.py +715 -0
- cascade/ipld.py +379 -0
- cascade/listen.py +154 -0
- cascade/logging/__init__.py +86 -0
- cascade/logging/color_example.py +107 -0
- cascade/logging/integrate.py +275 -0
- cascade/logging/interpretive_logger.py +276 -0
- cascade/logging/kleene_logger.py +219 -0
- cascade/logging/log_manager.py +266 -0
- cascade/observation.py +397 -0
- cascade/observe.py +231 -0
- cascade/patches/__init__.py +19 -0
.github/workflows/publish.yml
ADDED
@@ -0,0 +1,31 @@
+name: Publish to PyPI
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write  # For trusted publishing (optional)
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install build tools
+        run: pip install build
+
+      - name: Build package
+        run: python -m build
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore
ADDED
@@ -0,0 +1,35 @@
+# Byte-compiled
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+*.whl
+
+# Virtual environments
+venv/
+.venv/
+env/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+
+# Logs
+*.log
+logs/
+
+# OS
+.DS_Store
+Thumbs.db
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024-2026 Jeff Towers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,70 @@
+# Cascade Lattice
+
+**Universal AI provenance layer — cryptographic receipts for every call, with HOLD inference halt protocol**
+
+[](https://pypi.org/project/cascade-lattice/)
+[](https://opensource.org/licenses/MIT)
+
+## Installation
+
+```bash
+pip install cascade-lattice
+```
+
+With optional dependencies:
+```bash
+pip install cascade-lattice[torch]  # PyTorch integration
+pip install cascade-lattice[all]    # All integrations
+```
+
+## Quick Start
+
+```python
+from cascade import Monitor
+
+# Create a monitor for your component
+monitor = Monitor("training_loop")
+
+# Observe events (parses logs, extracts metrics)
+event = monitor.observe("Epoch 5: loss=0.0234, accuracy=0.9812")
+print(event.data)  # {'loss': 0.0234, 'accuracy': 0.9812, ...}
+
+# Get metrics summary
+print(monitor.metrics.summary())
+```
+
+## Features
+
+- **Universal Observation** — Monitor training, inference, system logs, API calls
+- **Cryptographic Receipts** — Every observation gets a verifiable hash chain
+- **HOLD Protocol** — Inference halt capability for safety-critical applications
+- **Tape Storage** — JSONL event streams for replay and analysis
+- **Provider Patches** — Drop-in monitoring for OpenAI, Anthropic, LiteLLM, Ollama
+
+## CLI Usage
+
+```bash
+cascade --help               # Show all commands
+cascade stats                # Lattice statistics
+cascade list -n 20           # Recent observations
+cascade watch                # Live observation feed
+cascade fingerprint model/   # Fingerprint a model
+cascade pii scan.log         # Scan for PII
+```
+
+## Tape Utilities
+
+```python
+from cascade.viz import load_tape_file, find_latest_tape, list_tape_files
+
+# Find and load tape files
+latest = find_latest_tape("./logs")
+events = load_tape_file(latest)
+
+for event in events:
+    print(event['event']['event_type'], event['event']['data'])
+```
+
+## License
+
+MIT
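
The "Cryptographic Receipts" bullet above describes a verifiable hash chain over observations. As a rough sketch of that idea (illustrative only; the package's actual receipt format presumably lives in modules this commit adds, such as cascade/core/provenance.py), each observation can be folded into a SHA-256 digest together with the previous receipt:

```python
# Minimal sketch of a hash-chained receipt log. Illustrative only: the
# real cascade-lattice receipt format is defined inside the package.
import hashlib
import json

def make_receipt(prev_hash: str, payload: dict) -> str:
    """Chain a new observation onto the previous receipt hash."""
    body = json.dumps(payload, sort_keys=True)  # canonical serialization
    return hashlib.sha256((prev_hash + body).encode("utf-8")).hexdigest()

GENESIS = "0" * 64  # arbitrary genesis value for this sketch
r1 = make_receipt(GENESIS, {"loss": 0.0234, "accuracy": 0.9812})
r2 = make_receipt(r1, {"loss": 0.0199, "accuracy": 0.9841})
# Recomputing the chain from the same events reproduces r2 exactly,
# so tampering with any earlier observation is detectable.
```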
cascade/__init__.py
ADDED
@@ -0,0 +1,290 @@
+"""
+╔═══════════════════════════════════════════════════════════════════════════════╗
+║                                                                               ║
+║            ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗            ║
+║           ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝            ║
+║           ██║     ███████║███████╗██║     ███████║██║  ██║█████╗              ║
+║           ██║     ██╔══██║╚════██║██║     ██╔══██║██║  ██║██╔══╝              ║
+║           ╚██████╗██║  ██║███████║╚██████╗██║  ██║██████╔╝███████╗            ║
+║            ╚═════╝╚═╝  ╚═╝╚══════╝ ╚═════╝╚═╝  ╚═╝╚═════╝ ╚══════╝            ║
+║                                                                               ║
+║              Symbiotic Causation Monitoring for Neural Networks               ║
+║                                                                               ║
+║                  "even still, i grow, and yet, I grow still"                  ║
+║                                                                               ║
+╚═══════════════════════════════════════════════════════════════════════════════╝
+
+Cascade is a self-interpreting causation monitor that symbiotically adapts to
+any system architecture through Kleene fixed-point convergence.
+
+Feed it ANY signal format. It learns your system's patterns. It traces cause
+and effect bidirectionally through time. It predicts cascading failures before
+they complete.
+
+Quick Start:
+    >>> import cascade
+    >>> monitor = cascade.Monitor()
+    >>> monitor.observe({"loss": 0.5, "epoch": 10})
+    >>> monitor.observe("ERROR: gradient exploded at layer 5")
+    >>>
+    >>> # What caused this?
+    >>> monitor.trace_backwards("gradient_explosion")
+    >>>
+    >>> # What will this cause?
+    >>> monitor.trace_forwards("learning_rate_spike")
+"""
+
+__version__ = "0.5.4"
+__author__ = "Cascade Team"
+__license__ = "MIT"
+
+from cascade.core.event import Event, CausationLink
+from cascade.core.graph import CausationGraph
+from cascade.core.adapter import SymbioticAdapter
+from cascade.analysis.tracer import Tracer
+from cascade.analysis.metrics import MetricsEngine
+
+# Primary API
+class Monitor:
+    """
+    The main entry point for Cascade monitoring.
+
+    A symbiotic observer that acclimates to any system architecture.
+    Feed it signals in any format — it adapts and builds a causation graph.
+
+    Example:
+        >>> monitor = cascade.Monitor()
+        >>>
+        >>> # Feed it anything - dicts, strings, tensors, whatever
+        >>> monitor.observe({"loss": 0.5, "epoch": 10})
+        >>> monitor.observe("2024-01-01 12:00:00 INFO training started")
+        >>> monitor.observe(torch.tensor([0.1, 0.2, 0.3]))
+        >>>
+        >>> # Trace causation backwards (what caused this?)
+        >>> causes = monitor.trace_backwards(event_id)
+        >>>
+        >>> # Trace causation forwards (what will this cause?)
+        >>> effects = monitor.trace_forwards(event_id)
+        >>>
+        >>> # Get the full causation graph
+        >>> graph = monitor.graph
+    """
+
+    def __init__(self, name: str = "default"):
+        """
+        Initialize a new Cascade monitor.
+
+        Args:
+            name: Optional name for this monitor instance
+        """
+        self.name = name
+        self.adapter = SymbioticAdapter()
+        self.graph = CausationGraph()
+        self.tracer = Tracer(self.graph)
+        self.metrics = MetricsEngine(self.graph)
+        self._event_count = 0
+
+    def observe(self, signal) -> Event:
+        """
+        Observe a signal from the host system.
+
+        The signal can be in ANY format:
+        - dict: {"loss": 0.5, "epoch": 10}
+        - str: "ERROR: gradient exploded"
+        - tensor: torch.tensor([...])
+        - protobuf, JSON, log line, etc.
+
+        Cascade will automatically adapt to your signal format.
+
+        Args:
+            signal: Any signal from the host system
+
+        Returns:
+            Event: The interpreted event added to the causation graph
+        """
+        event = self.adapter.interpret(signal)
+        self.graph.add_event(event)
+        self.metrics.ingest(event)
+        self._event_count += 1
+        return event
+
+    def trace_backwards(self, event_id: str, max_depth: int = 10):
+        """
+        Trace causation backwards: what caused this event?
+
+        Args:
+            event_id: ID of the event to trace from
+            max_depth: Maximum depth to trace (default: 10)
+
+        Returns:
+            List of CausationChain objects showing the causal history
+        """
+        return self.tracer.trace_backwards(event_id, max_depth)
+
+    def trace_forwards(self, event_id: str, max_depth: int = 10):
+        """
+        Trace causation forwards: what did this event cause?
+
+        Args:
+            event_id: ID of the event to trace from
+            max_depth: Maximum depth to trace (default: 10)
+
+        Returns:
+            List of CausationChain objects showing the effects
+        """
+        return self.tracer.trace_forwards(event_id, max_depth)
+
+    def find_root_causes(self, event_id: str):
+        """
+        Find the ultimate root causes of an event.
+
+        Goes all the way back to find the origin points.
+
+        Args:
+            event_id: ID of the event to analyze
+
+        Returns:
+            List of root cause events with their causal chains
+        """
+        return self.tracer.find_root_causes(event_id)
+
+    def analyze_impact(self, event_id: str, max_depth: int = 20):
+        """
+        Analyze the downstream impact of an event.
+
+        Traces forward to find everything this event set in motion.
+
+        Args:
+            event_id: ID of the event to analyze
+            max_depth: Maximum depth to search
+
+        Returns:
+            ImpactAnalysis with effects and severity score
+        """
+        return self.tracer.analyze_impact(event_id, max_depth)
+
+    def predict_cascade(self, event_id: str):
+        """
+        Predict the likely future cascade from this event.
+
+        Uses learned patterns to forecast effects before they happen.
+
+        Args:
+            event_id: ID of the event to predict from
+
+        Returns:
+            CascadePrediction with risk scores and intervention points
+        """
+        return self.tracer.predict_cascade(event_id)
+
+    def __repr__(self):
+        return f"<Cascade Monitor '{self.name}' | {self._event_count} events>"
+
+
+# Convenience function for quick setup
+def observe() -> Monitor:
+    """
+    Create a new Cascade monitor ready for observation.
+
+    This is the simplest way to get started:
+
+    >>> import cascade
+    >>> monitor = cascade.observe()
+    >>> monitor.observe({"loss": 0.5})
+
+    Returns:
+        Monitor: A new monitor instance
+    """
+    return Monitor()
+
+
+# Tape utilities for event storage
+from cascade.viz.tape import (
+    load_tape_file,
+    find_latest_tape,
+    list_tape_files,
+    PlaybackBuffer,
+)
+
+# SDK - Universal AI Observation Layer
+from cascade.sdk import init, observe as sdk_observe, shutdown
+
+# Store - Simple observe/query with HuggingFace sync
+from cascade.store import (
+    observe as store_observe,
+    query as store_query,
+    get as store_get,
+    stats as store_stats,
+    sync_all,
+    pull_from_hf,
+    Receipt,
+    # Discovery - find other users' lattices
+    discover_models,
+    discover_datasets,
+    discover_live,
+    dataset_info,
+)
+
+# Convenience aliases
+auto_observe = init  # cascade.auto_observe() is clearer for some users
+
+# HOLD - Inference-Level Halt Protocol
+from cascade import hold as hold_module
+from cascade.hold import (
+    Hold,
+    HoldPoint,
+    HoldResolution,
+    HoldState,
+    HoldAwareMixin,
+    CausationHold,
+    InferenceStep,
+    HoldSession,
+    ArcadeFeedback,
+)
+
+
+__all__ = [
+    # SDK - Primary Interface
+    "init",
+    "auto_observe",
+    "shutdown",
+    # Store - HuggingFace-backed storage
+    "store_observe",
+    "store_query",
+    "store_get",
+    "store_stats",
+    "sync_all",
+    "pull_from_hf",
+    "Receipt",
+    # Discovery
+    "discover_models",
+    "discover_datasets",
+    "discover_live",
+    "dataset_info",
+    # Monitor (causation tracking)
+    "Monitor",
+    "observe",
+    "Event",
+    "CausationLink",
+    "CausationGraph",
+    "SymbioticAdapter",
+    "Tracer",
+    "MetricsEngine",
+    # Tape playback
+    "load_tape_file",
+    "find_latest_tape",
+    "list_tape_files",
+    "PlaybackBuffer",
+    # HOLD - Inference Halt Protocol
+    "Hold",
+    "HoldPoint",
+    "HoldResolution",
+    "HoldState",
+    "HoldAwareMixin",
+    "CausationHold",
+    "InferenceStep",
+    "HoldSession",
+    "ArcadeFeedback",
+    "hold_module",
+    "__version__",
+]
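
Tying the pieces of cascade/__init__.py together, a minimal end-to-end session with the Monitor class above might look like the following sketch. It assumes only what the file itself shows: observe() returns an Event, and events carry an event_id (an attribute cascade/analysis/metrics.py below also relies on).

```python
# Sketch of the Monitor API defined above; assumes Event exposes an
# event_id attribute, as its use in cascade/analysis/metrics.py suggests.
import cascade

monitor = cascade.Monitor("training_loop")

# Feed heterogeneous signals; the SymbioticAdapter interprets each one.
ev1 = monitor.observe({"loss": 0.5, "epoch": 10})
ev2 = monitor.observe("ERROR: gradient exploded at layer 5")

# Ask the causation graph what led to the error event...
causes = monitor.trace_backwards(ev2.event_id)

# ...and what it is likely to set in motion downstream.
prediction = monitor.predict_cascade(ev2.event_id)

print(monitor)  # <Cascade Monitor 'training_loop' | 2 events>
```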
cascade/analysis/__init__.py
ADDED
@@ -0,0 +1,37 @@
+"""Cascade Analysis module - tracing, prediction, and intervention."""
+
+from cascade.analysis.tracer import (
+    Tracer,
+    RootCauseAnalysis,
+    ImpactAnalysis,
+    CascadePrediction,
+)
+from cascade.analysis.metrics import (
+    MetricsEngine,
+    MetricSeries,
+    MetricCategory,
+    MetricHealthSpec,
+    Anomaly,
+    Correlation,
+    ThresholdCrossing,
+    classify_metric,
+    METRIC_TAXONOMY,
+    HEALTH_SPECS,
+)
+
+__all__ = [
+    "Tracer",
+    "RootCauseAnalysis",
+    "ImpactAnalysis",
+    "CascadePrediction",
+    "MetricsEngine",
+    "MetricSeries",
+    "MetricCategory",
+    "MetricHealthSpec",
+    "Anomaly",
+    "Correlation",
+    "ThresholdCrossing",
+    "classify_metric",
+    "METRIC_TAXONOMY",
+    "HEALTH_SPECS",
+]
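
Because classify_metric, MetricCategory, and METRIC_TAXONOMY are re-exported here, the taxonomy defined in cascade/analysis/metrics.py below can be used on its own. A small sketch of its lookup-then-regex-fallback behavior:

```python
# Sketch of classify_metric, based on the taxonomy and patterns defined
# in cascade/analysis/metrics.py (shown later in this diff).
from cascade.analysis import classify_metric, MetricCategory

print(classify_metric("val_loss"))        # TRAINING_DYNAMICS (direct taxonomy hit)
print(classify_metric("grad_norm"))       # GRADIENT_HEALTH   (direct taxonomy hit)
print(classify_metric("my_custom_loss"))  # TRAINING_DYNAMICS (regex ".*loss.*")
print(classify_metric("frobnicate"))      # UNKNOWN           (no lookup or pattern match)
```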
cascade/analysis/metrics.py
ADDED
@@ -0,0 +1,1168 @@
+"""
+Cascade Analysis - Metrics Engine.
+
+The quantification layer. Extracts, tracks, and correlates numeric data
+from the event stream. Provides the WHAT with enough depth that the WHY
+becomes self-evident to the observer.
+
+This module does NOT interpret or explain. It quantifies.
+
+Industry-Standard Neural Network Observability Taxonomy:
+=========================================================
+
+CATEGORY 1: TRAINING_DYNAMICS
+    Core training loop metrics - loss, accuracy, learning rate, throughput
+
+CATEGORY 2: GRADIENT_HEALTH
+    Gradient flow diagnostics - norms, clipping, vanishing/exploding
+
+CATEGORY 3: WEIGHT_DYNAMICS
+    Parameter evolution - norms, update ratios, dead neurons
+
+CATEGORY 4: ACTIVATION_FLOW
+    Forward pass health - magnitudes, saturation, dead ReLUs
+
+CATEGORY 5: ATTENTION_MECHANICS
+    Transformer-specific - entropy, sparsity, head importance
+
+CATEGORY 6: MEMORY_COMPUTE
+    Resource utilization - GPU/CPU memory, MFU, throughput
+
+CATEGORY 7: OPTIMIZATION_STATE
+    Optimizer internals - Adam moments, momentum, weight decay
+
+CATEGORY 8: CONVERGENCE_SIGNALS
+    Training health indicators - plateau, overfitting, noise scale
+
+CATEGORY 9: DATA_PIPELINE
+    Data loading metrics - batch time, queue depth, prefetch
+
+CATEGORY 10: REGULARIZATION
+    Regularization effects - dropout, batch norm, layer norm stats
+"""
+
+from typing import Dict, List, Any, Optional, Tuple, Set
+from dataclasses import dataclass, field
+from collections import defaultdict
+from enum import Enum, auto
+import math
+import re
+
+from cascade.core.event import Event
+from cascade.core.graph import CausationGraph
+
+
+# =============================================================================
+# METRIC CATEGORY TAXONOMY
+# =============================================================================
+
+class MetricCategory(Enum):
+    """Industry-standard neural network metric categories."""
+    TRAINING_DYNAMICS = auto()     # Loss, accuracy, LR, throughput
+    GRADIENT_HEALTH = auto()       # Grad norms, clipping, flow
+    WEIGHT_DYNAMICS = auto()       # Weight norms, updates, dead neurons
+    ACTIVATION_FLOW = auto()       # Activation stats, saturation
+    ATTENTION_MECHANICS = auto()   # Attention entropy, sparsity, heads
+    MEMORY_COMPUTE = auto()        # GPU/CPU mem, MFU, FLOPS
+    OPTIMIZATION_STATE = auto()    # Adam moments, momentum, decay
+    CONVERGENCE_SIGNALS = auto()   # Plateau, overfit, noise scale
+    DATA_PIPELINE = auto()         # Batch time, queue, prefetch
+    REGULARIZATION = auto()        # Dropout, norm layer stats
+    SYSTEM = auto()                # Iteration, epoch, timestamps
+    UNKNOWN = auto()               # Uncategorized metrics
+
+
+# Comprehensive metric-to-category mapping
+# This is the "knowledge base" of neural network metric taxonomy
+METRIC_TAXONOMY: Dict[str, MetricCategory] = {
+    # TRAINING_DYNAMICS
+    "loss": MetricCategory.TRAINING_DYNAMICS,
+    "train_loss": MetricCategory.TRAINING_DYNAMICS,
+    "val_loss": MetricCategory.TRAINING_DYNAMICS,
+    "test_loss": MetricCategory.TRAINING_DYNAMICS,
+    "eval_loss": MetricCategory.TRAINING_DYNAMICS,
+    "nll_loss": MetricCategory.TRAINING_DYNAMICS,
+    "ce_loss": MetricCategory.TRAINING_DYNAMICS,
+    "cross_entropy": MetricCategory.TRAINING_DYNAMICS,
+    "mse_loss": MetricCategory.TRAINING_DYNAMICS,
+    "mae_loss": MetricCategory.TRAINING_DYNAMICS,
+    "perplexity": MetricCategory.TRAINING_DYNAMICS,
+    "ppl": MetricCategory.TRAINING_DYNAMICS,
+    "accuracy": MetricCategory.TRAINING_DYNAMICS,
+    "acc": MetricCategory.TRAINING_DYNAMICS,
+    "top1_acc": MetricCategory.TRAINING_DYNAMICS,
+    "top5_acc": MetricCategory.TRAINING_DYNAMICS,
+    "precision": MetricCategory.TRAINING_DYNAMICS,
+    "recall": MetricCategory.TRAINING_DYNAMICS,
+    "f1": MetricCategory.TRAINING_DYNAMICS,
+    "f1_score": MetricCategory.TRAINING_DYNAMICS,
+    "auc": MetricCategory.TRAINING_DYNAMICS,
+    "auroc": MetricCategory.TRAINING_DYNAMICS,
+    "bleu": MetricCategory.TRAINING_DYNAMICS,
+    "rouge": MetricCategory.TRAINING_DYNAMICS,
+    "lr": MetricCategory.TRAINING_DYNAMICS,
+    "learning_rate": MetricCategory.TRAINING_DYNAMICS,
+    "samples_per_sec": MetricCategory.TRAINING_DYNAMICS,
+    "tokens_per_sec": MetricCategory.TRAINING_DYNAMICS,
+    "throughput": MetricCategory.TRAINING_DYNAMICS,
+    "steps_per_sec": MetricCategory.TRAINING_DYNAMICS,
+
+    # GRADIENT_HEALTH
+    "grad_norm": MetricCategory.GRADIENT_HEALTH,
+    "gradient_norm": MetricCategory.GRADIENT_HEALTH,
+    "global_grad_norm": MetricCategory.GRADIENT_HEALTH,
+    "grad_norm_clipped": MetricCategory.GRADIENT_HEALTH,
+    "grad_clip_rate": MetricCategory.GRADIENT_HEALTH,
+    "grad_scale": MetricCategory.GRADIENT_HEALTH,
+    "grad_mean": MetricCategory.GRADIENT_HEALTH,
+    "grad_std": MetricCategory.GRADIENT_HEALTH,
+    "grad_max": MetricCategory.GRADIENT_HEALTH,
+    "grad_min": MetricCategory.GRADIENT_HEALTH,
+    "grad_sparsity": MetricCategory.GRADIENT_HEALTH,
+    "vanishing_grad": MetricCategory.GRADIENT_HEALTH,
+    "exploding_grad": MetricCategory.GRADIENT_HEALTH,
+
+    # WEIGHT_DYNAMICS
+    "weight_norm": MetricCategory.WEIGHT_DYNAMICS,
+    "param_norm": MetricCategory.WEIGHT_DYNAMICS,
+    "weight_mean": MetricCategory.WEIGHT_DYNAMICS,
+    "weight_std": MetricCategory.WEIGHT_DYNAMICS,
+    "update_ratio": MetricCategory.WEIGHT_DYNAMICS,
+    "weight_update": MetricCategory.WEIGHT_DYNAMICS,
+    "dead_neurons": MetricCategory.WEIGHT_DYNAMICS,
+    "dead_neuron_pct": MetricCategory.WEIGHT_DYNAMICS,
+    "param_count": MetricCategory.WEIGHT_DYNAMICS,
+    "num_params": MetricCategory.WEIGHT_DYNAMICS,
+    "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
+
+    # ACTIVATION_FLOW
+    "activation_mean": MetricCategory.ACTIVATION_FLOW,
+    "activation_std": MetricCategory.ACTIVATION_FLOW,
+    "activation_norm": MetricCategory.ACTIVATION_FLOW,
+    "activation_max": MetricCategory.ACTIVATION_FLOW,
+    "saturation": MetricCategory.ACTIVATION_FLOW,
+    "saturation_pct": MetricCategory.ACTIVATION_FLOW,
+    "dead_relu": MetricCategory.ACTIVATION_FLOW,
+    "dead_relu_pct": MetricCategory.ACTIVATION_FLOW,
+    "activation_sparsity": MetricCategory.ACTIVATION_FLOW,
+    # Generic activation stats from layer hooks
+    "mean": MetricCategory.ACTIVATION_FLOW,
+    "std": MetricCategory.ACTIVATION_FLOW,
+    "min": MetricCategory.ACTIVATION_FLOW,
+    "max": MetricCategory.ACTIVATION_FLOW,
+    "sparsity": MetricCategory.ACTIVATION_FLOW,
+    "layer_idx": MetricCategory.SYSTEM,
+
+    # ATTENTION_MECHANICS
+    "attention_entropy": MetricCategory.ATTENTION_MECHANICS,
+    "attn_entropy": MetricCategory.ATTENTION_MECHANICS,
+    "attention_sparsity": MetricCategory.ATTENTION_MECHANICS,
+    "head_importance": MetricCategory.ATTENTION_MECHANICS,
+    "attention_weight_norm": MetricCategory.ATTENTION_MECHANICS,
+    "position_bias": MetricCategory.ATTENTION_MECHANICS,
+    "attention_score_mean": MetricCategory.ATTENTION_MECHANICS,
+    "attention_score_std": MetricCategory.ATTENTION_MECHANICS,
+
+    # MEMORY_COMPUTE
+    "gpu_memory": MetricCategory.MEMORY_COMPUTE,
+    "gpu_mem": MetricCategory.MEMORY_COMPUTE,
+    "gpu_memory_allocated": MetricCategory.MEMORY_COMPUTE,
+    "gpu_memory_cached": MetricCategory.MEMORY_COMPUTE,
+    "gpu_memory_peak": MetricCategory.MEMORY_COMPUTE,
+    "cpu_memory": MetricCategory.MEMORY_COMPUTE,
+    "memory_usage": MetricCategory.MEMORY_COMPUTE,
+    "mfu": MetricCategory.MEMORY_COMPUTE,
+    "model_flops_utilization": MetricCategory.MEMORY_COMPUTE,
+    "flops": MetricCategory.MEMORY_COMPUTE,
+    "tflops": MetricCategory.MEMORY_COMPUTE,
+    "gpu_utilization": MetricCategory.MEMORY_COMPUTE,
+    "gpu_util": MetricCategory.MEMORY_COMPUTE,
+
+    # OPTIMIZATION_STATE
+    "adam_m_norm": MetricCategory.OPTIMIZATION_STATE,
+    "adam_v_norm": MetricCategory.OPTIMIZATION_STATE,
+    "momentum": MetricCategory.OPTIMIZATION_STATE,
+    "beta1": MetricCategory.OPTIMIZATION_STATE,
+    "beta2": MetricCategory.OPTIMIZATION_STATE,
+    "weight_decay": MetricCategory.OPTIMIZATION_STATE,
+    "effective_weight_decay": MetricCategory.OPTIMIZATION_STATE,
+    "warmup_progress": MetricCategory.OPTIMIZATION_STATE,
+    "lr_schedule_progress": MetricCategory.OPTIMIZATION_STATE,
+
+    # CONVERGENCE_SIGNALS
+    "train_val_gap": MetricCategory.CONVERGENCE_SIGNALS,
+    "overfit_ratio": MetricCategory.CONVERGENCE_SIGNALS,
+    "loss_plateau": MetricCategory.CONVERGENCE_SIGNALS,
+    "gradient_noise_scale": MetricCategory.CONVERGENCE_SIGNALS,
+    "critical_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
+    "effective_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
+    "early_stop_score": MetricCategory.CONVERGENCE_SIGNALS,
+    "best_val_loss": MetricCategory.CONVERGENCE_SIGNALS,
+    "improvement_rate": MetricCategory.CONVERGENCE_SIGNALS,
+
+    # DATA_PIPELINE
+    "data_time": MetricCategory.DATA_PIPELINE,
+    "batch_time": MetricCategory.DATA_PIPELINE,
+    "load_time": MetricCategory.DATA_PIPELINE,
+    "preprocessing_time": MetricCategory.DATA_PIPELINE,
+    "augmentation_time": MetricCategory.DATA_PIPELINE,
+    "queue_depth": MetricCategory.DATA_PIPELINE,
+    "prefetch_factor": MetricCategory.DATA_PIPELINE,
+    "num_workers": MetricCategory.DATA_PIPELINE,
+
+    # REGULARIZATION
+    "dropout_rate": MetricCategory.REGULARIZATION,
+    "dropout": MetricCategory.REGULARIZATION,
+    "bn_mean": MetricCategory.REGULARIZATION,
+    "bn_var": MetricCategory.REGULARIZATION,
+    "bn_running_mean": MetricCategory.REGULARIZATION,
+    "bn_running_var": MetricCategory.REGULARIZATION,
+    "ln_mean": MetricCategory.REGULARIZATION,
+    "ln_var": MetricCategory.REGULARIZATION,
+    "l1_penalty": MetricCategory.REGULARIZATION,
+    "l2_penalty": MetricCategory.REGULARIZATION,
+
+    # SYSTEM
+    "iter": MetricCategory.SYSTEM,
+    "iteration": MetricCategory.SYSTEM,
+    "step": MetricCategory.SYSTEM,
+    "total": MetricCategory.SYSTEM,
+    "epoch": MetricCategory.SYSTEM,
+    "batch": MetricCategory.SYSTEM,
+    "batch_idx": MetricCategory.SYSTEM,
+    "global_step": MetricCategory.SYSTEM,
+    "time": MetricCategory.SYSTEM,
+    "dt": MetricCategory.SYSTEM,
+    "elapsed": MetricCategory.SYSTEM,
+    "wall_time": MetricCategory.SYSTEM,
+    "timestamp": MetricCategory.SYSTEM,
+    "hooked_layers": MetricCategory.SYSTEM,
+    "input_tokens": MetricCategory.SYSTEM,
+    "predicted_class": MetricCategory.TRAINING_DYNAMICS,
+
+    # MODEL INFO
+    "params": MetricCategory.WEIGHT_DYNAMICS,
+    "num_params": MetricCategory.WEIGHT_DYNAMICS,
+    "total_params": MetricCategory.WEIGHT_DYNAMICS,
+    "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
+    "parameters": MetricCategory.WEIGHT_DYNAMICS,
+    "model_size": MetricCategory.WEIGHT_DYNAMICS,
+
+    # INFERENCE METRICS
+    "confidence": MetricCategory.TRAINING_DYNAMICS,
+    "similarity": MetricCategory.TRAINING_DYNAMICS,
+    "score": MetricCategory.TRAINING_DYNAMICS,
+    "prob": MetricCategory.TRAINING_DYNAMICS,
+    "probability": MetricCategory.TRAINING_DYNAMICS,
+    "entropy": MetricCategory.ATTENTION_MECHANICS,
+    "latency": MetricCategory.MEMORY_COMPUTE,
+    "inference_time": MetricCategory.MEMORY_COMPUTE,
+    "input_len": MetricCategory.DATA_PIPELINE,
+    "output_len": MetricCategory.DATA_PIPELINE,
+
+    # OBSERVATION SYSTEM METRICS
+    "hooked_modules": MetricCategory.SYSTEM,
+    "total_layers": MetricCategory.SYSTEM,
+    "sample_rate": MetricCategory.SYSTEM,
+    "layer_num": MetricCategory.SYSTEM,
+    "max_depth": MetricCategory.SYSTEM,
+    "return_code": MetricCategory.SYSTEM,
+    "pid": MetricCategory.SYSTEM,
+    "max_iterations": MetricCategory.SYSTEM,
+    "total_iterations": MetricCategory.SYSTEM,
+    "iterations": MetricCategory.SYSTEM,
+
+    # GPU/VRAM
+    "vram_gb": MetricCategory.MEMORY_COMPUTE,
+    "gpu_count": MetricCategory.MEMORY_COMPUTE,
+    "gpu_memory_gb": MetricCategory.MEMORY_COMPUTE,
+}
+
+# Patterns for dynamic metric name matching
+METRIC_PATTERNS: List[Tuple[str, MetricCategory]] = [
+    (r".*loss.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*acc.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*accuracy.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*perplexity.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*lr.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*learning_rate.*", MetricCategory.TRAINING_DYNAMICS),
+    (r".*grad.*norm.*", MetricCategory.GRADIENT_HEALTH),
+    (r".*gradient.*", MetricCategory.GRADIENT_HEALTH),
+    (r".*weight.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
+    (r".*param.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
+    (r".*activation.*", MetricCategory.ACTIVATION_FLOW),
+    (r".*attention.*", MetricCategory.ATTENTION_MECHANICS),
+    (r".*attn.*", MetricCategory.ATTENTION_MECHANICS),
+    (r".*memory.*", MetricCategory.MEMORY_COMPUTE),
+    (r".*gpu.*", MetricCategory.MEMORY_COMPUTE),
+    (r".*mfu.*", MetricCategory.MEMORY_COMPUTE),
+    (r".*adam.*", MetricCategory.OPTIMIZATION_STATE),
+    (r".*momentum.*", MetricCategory.OPTIMIZATION_STATE),
+    (r".*overfit.*", MetricCategory.CONVERGENCE_SIGNALS),
+    (r".*plateau.*", MetricCategory.CONVERGENCE_SIGNALS),
+    (r".*data.*time.*", MetricCategory.DATA_PIPELINE),
+    (r".*batch.*time.*", MetricCategory.DATA_PIPELINE),
+    (r".*dropout.*", MetricCategory.REGULARIZATION),
+    (r".*bn_.*", MetricCategory.REGULARIZATION),
+    (r".*ln_.*", MetricCategory.REGULARIZATION),
+    (r".*iter.*", MetricCategory.SYSTEM),
+    (r".*epoch.*", MetricCategory.SYSTEM),
+    (r".*step.*", MetricCategory.SYSTEM),
+    (r".*time.*", MetricCategory.SYSTEM),
+    (r".*_ms$", MetricCategory.SYSTEM),
+    (r".*duration.*", MetricCategory.SYSTEM),
+]
+
+
+def classify_metric(name: str) -> MetricCategory:
+    """Classify a metric name into its category."""
+    name_lower = name.lower()
+
+    # Direct lookup
+    if name_lower in METRIC_TAXONOMY:
+        return METRIC_TAXONOMY[name_lower]
+
+    # Pattern matching
+    for pattern, category in METRIC_PATTERNS:
+        if re.match(pattern, name_lower):
+            return category
+
+    return MetricCategory.UNKNOWN
+
+
+# =============================================================================
+# METRIC HEALTH THRESHOLDS (Industry Standards)
+# =============================================================================
+
+@dataclass
+class MetricHealthSpec:
+    """Specification for healthy metric ranges."""
+    name: str
+    category: MetricCategory
+    healthy_min: Optional[float] = None
+    healthy_max: Optional[float] = None
+    critical_min: Optional[float] = None
+    critical_max: Optional[float] = None
+    expected_trend: Optional[str] = None  # 'falling', 'rising', 'stable'
+
+    def is_healthy(self, value: float) -> bool:
+        if self.healthy_min is not None and value < self.healthy_min:
+            return False
+        if self.healthy_max is not None and value > self.healthy_max:
+            return False
+        return True
+
+    def is_critical(self, value: float) -> bool:
+        if self.critical_min is not None and value < self.critical_min:
+            return True
+        if self.critical_max is not None and value > self.critical_max:
+            return True
+        return False
+
+
+# Industry-standard health thresholds
+HEALTH_SPECS: Dict[str, MetricHealthSpec] = {
+    "loss": MetricHealthSpec(
+        name="loss",
+        category=MetricCategory.TRAINING_DYNAMICS,
+        healthy_max=10.0,
+        critical_max=100.0,
+        expected_trend="falling",
+    ),
+    "grad_norm": MetricHealthSpec(
+        name="grad_norm",
+        category=MetricCategory.GRADIENT_HEALTH,
+        healthy_min=1e-7,
+        healthy_max=10.0,
+        critical_min=1e-10,   # Vanishing
+        critical_max=1000.0,  # Exploding
+    ),
+    "lr": MetricHealthSpec(
+        name="lr",
+        category=MetricCategory.TRAINING_DYNAMICS,
+        healthy_min=1e-8,
+        healthy_max=1.0,
+        critical_max=10.0,
+    ),
+    "mfu": MetricHealthSpec(
+        name="mfu",
+        category=MetricCategory.MEMORY_COMPUTE,
+        healthy_min=0.1,  # 10% utilization minimum
+        healthy_max=1.0,
+    ),
+    "dead_relu_pct": MetricHealthSpec(
+        name="dead_relu_pct",
+        category=MetricCategory.ACTIVATION_FLOW,
+        healthy_max=0.3,   # 30% dead is concerning
+        critical_max=0.7,  # 70% dead is critical
+    ),
+    "train_val_gap": MetricHealthSpec(
+        name="train_val_gap",
+        category=MetricCategory.CONVERGENCE_SIGNALS,
+        healthy_max=0.5,   # Gap shouldn't exceed 50% of train loss
+        critical_max=2.0,  # Severe overfitting
+    ),
+}
+
+
+@dataclass
+class MetricSeries:
+    """A time series of a single metric with category awareness."""
+    name: str
+    category: MetricCategory = field(default=MetricCategory.UNKNOWN)
+    values: List[float] = field(default_factory=list)
+    timestamps: List[float] = field(default_factory=list)
+    event_ids: List[str] = field(default_factory=list)
+
+    def __post_init__(self):
+        if self.category == MetricCategory.UNKNOWN:
+            self.category = classify_metric(self.name)
+
+    @property
+    def count(self) -> int:
+        return len(self.values)
+
+    @property
+    def current(self) -> Optional[float]:
+        return self.values[-1] if self.values else None
+
+    @property
+    def previous(self) -> Optional[float]:
+        return self.values[-2] if len(self.values) >= 2 else None
+
+    @property
+    def delta(self) -> Optional[float]:
+        """Change from previous to current."""
+        if len(self.values) >= 2:
+            return self.values[-1] - self.values[-2]
+        return None
+
+    @property
+    def delta_pct(self) -> Optional[float]:
+        """Percentage change from previous to current."""
+        if len(self.values) >= 2 and self.values[-2] != 0:
+            return (self.values[-1] - self.values[-2]) / abs(self.values[-2])
+        return None
+
+    @property
+    def mean(self) -> Optional[float]:
+        return sum(self.values) / len(self.values) if self.values else None
+
+    @property
+    def std(self) -> Optional[float]:
+        if len(self.values) < 2:
+            return None
+        mean = self.mean
+        variance = sum((x - mean) ** 2 for x in self.values) / len(self.values)
+        return math.sqrt(variance)
+
+    @property
+    def min(self) -> Optional[float]:
+        return min(self.values) if self.values else None
+
+    @property
+    def max(self) -> Optional[float]:
+        return max(self.values) if self.values else None
+
+    @property
+    def range(self) -> Optional[float]:
+        if self.values:
+            return self.max - self.min
+        return None
+
+    def moving_average(self, window: int = 5) -> Optional[float]:
+        """Compute moving average over last N values."""
+        if len(self.values) < window:
+            return self.mean
+        return sum(self.values[-window:]) / window
+
+    def rate_of_change(self, window: int = 5) -> Optional[float]:
+        """Average rate of change over last N values."""
+        if len(self.values) < 2:
+            return None
+        window = min(window, len(self.values))
+        recent = self.values[-window:]
+        deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
+        return sum(deltas) / len(deltas) if deltas else None
+
+    def is_anomaly(self, threshold_std: float = 2.0) -> bool:
+        """Is current value anomalous (outside N standard deviations)?"""
+        if len(self.values) < 5 or self.std is None or self.std == 0:
+            return False
+        return abs(self.values[-1] - self.mean) > threshold_std * self.std
+
+    def trend(self, window: int = 10) -> str:
+        """Determine trend: 'rising', 'falling', 'stable', 'volatile'."""
+        if len(self.values) < 3:
+            return "unknown"
+
+        window = min(window, len(self.values))
+        recent = self.values[-window:]
+        deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
+
+        positive = sum(1 for d in deltas if d > 0)
+        negative = sum(1 for d in deltas if d < 0)
+
+        if positive > 0.7 * len(deltas):
+            return "rising"
+        elif negative > 0.7 * len(deltas):
+            return "falling"
+        elif self.std and self.mean and self.std > 0.1 * abs(self.mean):
+            return "volatile"
+        else:
+            return "stable"
+
+    def health_status(self) -> str:
+        """Check health against industry standards. Returns 'healthy', 'warning', 'critical', 'unknown'."""
+        if self.current is None:
+            return "unknown"
+
+        name_lower = self.name.lower()
+        if name_lower in HEALTH_SPECS:
+            spec = HEALTH_SPECS[name_lower]
+            if spec.is_critical(self.current):
+                return "critical"
+            if not spec.is_healthy(self.current):
+                return "warning"
+            return "healthy"
+
+        # Default heuristics for unknown metrics
+        if self.is_anomaly(threshold_std=3.0):
+            return "critical"
+        if self.is_anomaly(threshold_std=2.0):
+            return "warning"
+        return "healthy"
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "name": self.name,
+            "category": self.category.name,
+            "count": self.count,
+            "current": self.current,
+            "delta": self.delta,
+            "delta_pct": self.delta_pct,
+            "mean": self.mean,
+            "std": self.std,
+            "min": self.min,
+            "max": self.max,
+            "trend": self.trend(),
+            "health": self.health_status(),
+            "is_anomaly": self.is_anomaly(),
+            "rate_of_change": self.rate_of_change(),
+        }
+
+
+@dataclass
+class Anomaly:
+    """A detected anomaly in the metric stream."""
+    metric_name: str
+    category: MetricCategory
+    event_id: str
+    timestamp: float
+    value: float
+    expected_range: Tuple[float, float]  # (low, high)
+    deviation_std: float
+    severity: str  # 'minor', 'major', 'critical'
+
+
+@dataclass
+class Correlation:
+    """A detected correlation between two metrics."""
+    metric_a: str
+    metric_b: str
+    category_a: MetricCategory
+    category_b: MetricCategory
+    coefficient: float  # -1 to 1
+    strength: str       # 'weak', 'moderate', 'strong'
+    direction: str      # 'positive', 'negative'
+
+
+@dataclass
+class ThresholdCrossing:
+    """A metric crossing a significant threshold."""
+    metric_name: str
+    category: MetricCategory
+    event_id: str
+    timestamp: float
+    old_value: float
+    new_value: float
+    threshold: float
+    direction: str  # 'above', 'below'
+
+
+class MetricsEngine:
+    """
+    Quantification engine for the event stream.
+
+    Extracts numeric metrics from events, tracks them over time,
+    detects anomalies, correlations, and threshold crossings.
+
+    Does NOT interpret or explain. Provides raw quantified data
+    for human or AI observers to divine meaning from.
+
+    Example:
+        >>> engine = MetricsEngine(graph)
+        >>> engine.ingest(event)
+        >>>
+        >>> # Get metric statistics
+        >>> loss = engine.get_metric("loss")
+        >>> print(f"Loss: {loss.current} (delta: {loss.delta}, trend: {loss.trend()})")
+        >>>
+        >>> # Get anomalies
+        >>> for anomaly in engine.anomalies:
+        ...     print(f"ANOMALY: {anomaly.metric_name} = {anomaly.value}")
+        >>>
+        >>> # Get correlations
+        >>> for corr in engine.get_correlations():
+        ...     print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")
+    """
+
+    def __init__(self, graph: Optional[CausationGraph] = None):
+        self.graph = graph
+        self._metrics: Dict[str, MetricSeries] = {}
+        self._anomalies: List[Anomaly] = []
+        self._threshold_crossings: List[ThresholdCrossing] = []
+        self._event_count = 0
+
+        # Configurable thresholds
+        self.anomaly_std_threshold = 2.5
+        self.correlation_min_samples = 10
+
+        # Known significant thresholds for ML metrics
+        self._known_thresholds = {
+            "loss": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
+            "accuracy": [0.5, 0.8, 0.9, 0.95, 0.99],
+            "lr": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
+            "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
+            "grad_norm": [0.1, 1.0, 10.0, 100.0],
+            "gradient_norm": [0.1, 1.0, 10.0, 100.0],
+        }
+
+    def ingest(self, event: Event) -> Dict[str, MetricSeries]:
+        """
+        Ingest an event and extract/track all numeric metrics.
+
+        Returns dict of updated metric series.
+        """
+        self._event_count += 1
+        updated = {}
+
+        for key, value in event.data.items():
+            if isinstance(value, (int, float)) and not isinstance(value, bool):
+                category = classify_metric(key)
+
+                if math.isnan(value) or math.isinf(value):
+                    # Track NaN/Inf as anomalies but don't add to series
+                    self._anomalies.append(Anomaly(
+                        metric_name=key,
+                        category=category,
+                        event_id=event.event_id,
+                        timestamp=event.timestamp,
+                        value=value,
+                        expected_range=(0, 0),
+                        deviation_std=float('inf'),
+                        severity='critical',
+                    ))
+                    continue
+
+                # Get or create metric series with proper category
+                if key not in self._metrics:
+                    self._metrics[key] = MetricSeries(name=key, category=category)
+
+                series = self._metrics[key]
+                old_value = series.current
+
+                # Add new value
+                series.values.append(float(value))
+                series.timestamps.append(event.timestamp)
+                series.event_ids.append(event.event_id)
+
+                # Check for anomaly
+                if series.is_anomaly(self.anomaly_std_threshold):
+                    deviation = abs(value - series.mean) / series.std if series.std else 0
+                    severity = 'critical' if deviation > 4 else 'major' if deviation > 3 else 'minor'
+                    self._anomalies.append(Anomaly(
+                        metric_name=key,
+                        category=category,
+                        event_id=event.event_id,
+                        timestamp=event.timestamp,
+                        value=value,
+                        expected_range=(
+                            series.mean - 2*series.std,
+                            series.mean + 2*series.std
+                        ),
+                        deviation_std=deviation,
+                        severity=severity,
+                    ))
+
+                # Check for threshold crossing
+                if old_value is not None:
+                    self._check_threshold_crossing(
+                        key, event.event_id, event.timestamp, old_value, value
+                    )
+
+                updated[key] = series
+
+        return updated
+
+    def _check_threshold_crossing(
+        self,
+        metric: str,
+        event_id: str,
+        timestamp: float,
+        old_value: float,
+        new_value: float
+    ):
+        """Check if a metric crossed a known threshold."""
+        thresholds = self._known_thresholds.get(metric, [])
+        category = classify_metric(metric)
+
+        for threshold in thresholds:
+            # Crossed upward
+            if old_value < threshold <= new_value:
+                self._threshold_crossings.append(ThresholdCrossing(
+                    metric_name=metric,
+                    category=category,
|
| 467 |
+
@property
|
| 468 |
+
def range(self) -> Optional[float]:
|
| 469 |
+
if self.values:
|
| 470 |
+
return self.max - self.min
|
| 471 |
+
return None
|
| 472 |
+
|
| 473 |
+
def moving_average(self, window: int = 5) -> Optional[float]:
|
| 474 |
+
"""Compute moving average over last N values."""
|
| 475 |
+
if len(self.values) < window:
|
| 476 |
+
return self.mean
|
| 477 |
+
return sum(self.values[-window:]) / window
|
| 478 |
+
|
| 479 |
+
def rate_of_change(self, window: int = 5) -> Optional[float]:
|
| 480 |
+
"""Average rate of change over last N values."""
|
| 481 |
+
if len(self.values) < 2:
|
| 482 |
+
return None
|
| 483 |
+
window = min(window, len(self.values))
|
| 484 |
+
recent = self.values[-window:]
|
| 485 |
+
deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
|
| 486 |
+
return sum(deltas) / len(deltas) if deltas else None
|
| 487 |
+
|
| 488 |
+
def is_anomaly(self, threshold_std: float = 2.0) -> bool:
|
| 489 |
+
"""Is current value anomalous (outside N standard deviations)?"""
|
| 490 |
+
if len(self.values) < 5 or self.std is None or self.std == 0:
|
| 491 |
+
return False
|
| 492 |
+
return abs(self.values[-1] - self.mean) > threshold_std * self.std
|
| 493 |
+
|
| 494 |
+
def trend(self, window: int = 10) -> str:
|
| 495 |
+
"""Determine trend: 'rising', 'falling', 'stable', 'volatile'."""
|
| 496 |
+
if len(self.values) < 3:
|
| 497 |
+
return "unknown"
|
| 498 |
+
|
| 499 |
+
window = min(window, len(self.values))
|
| 500 |
+
recent = self.values[-window:]
|
| 501 |
+
deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
|
| 502 |
+
|
| 503 |
+
positive = sum(1 for d in deltas if d > 0)
|
| 504 |
+
negative = sum(1 for d in deltas if d < 0)
|
| 505 |
+
|
| 506 |
+
if positive > 0.7 * len(deltas):
|
| 507 |
+
return "rising"
|
| 508 |
+
elif negative > 0.7 * len(deltas):
|
| 509 |
+
return "falling"
|
| 510 |
+
elif self.std and self.mean and self.std > 0.1 * abs(self.mean):
|
| 511 |
+
return "volatile"
|
| 512 |
+
else:
|
| 513 |
+
return "stable"
|
| 514 |
+
|
| 515 |
+
def health_status(self) -> str:
|
| 516 |
+
"""Check health against industry standards. Returns 'healthy', 'warning', 'critical', 'unknown'."""
|
| 517 |
+
if self.current is None:
|
| 518 |
+
return "unknown"
|
| 519 |
+
|
| 520 |
+
name_lower = self.name.lower()
|
| 521 |
+
if name_lower in HEALTH_SPECS:
|
| 522 |
+
spec = HEALTH_SPECS[name_lower]
|
| 523 |
+
if spec.is_critical(self.current):
|
| 524 |
+
return "critical"
|
| 525 |
+
if not spec.is_healthy(self.current):
|
| 526 |
+
return "warning"
|
| 527 |
+
return "healthy"
|
| 528 |
+
|
| 529 |
+
# Default heuristics for unknown metrics
|
| 530 |
+
if self.is_anomaly(threshold_std=3.0):
|
| 531 |
+
return "critical"
|
| 532 |
+
if self.is_anomaly(threshold_std=2.0):
|
| 533 |
+
return "warning"
|
| 534 |
+
return "healthy"
|
| 535 |
+
|
| 536 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 537 |
+
return {
|
| 538 |
+
"name": self.name,
|
| 539 |
+
"category": self.category.name,
|
| 540 |
+
"count": self.count,
|
| 541 |
+
"current": self.current,
|
| 542 |
+
"delta": self.delta,
|
| 543 |
+
"delta_pct": self.delta_pct,
|
| 544 |
+
"mean": self.mean,
|
| 545 |
+
"std": self.std,
|
| 546 |
+
"min": self.min,
|
| 547 |
+
"max": self.max,
|
| 548 |
+
"trend": self.trend(),
|
| 549 |
+
"health": self.health_status(),
|
| 550 |
+
"is_anomaly": self.is_anomaly(),
|
| 551 |
+
"rate_of_change": self.rate_of_change(),
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
@dataclass
|
| 556 |
+
class Anomaly:
|
| 557 |
+
"""A detected anomaly in the metric stream."""
|
| 558 |
+
metric_name: str
|
| 559 |
+
category: MetricCategory
|
| 560 |
+
event_id: str
|
| 561 |
+
timestamp: float
|
| 562 |
+
value: float
|
| 563 |
+
expected_range: Tuple[float, float] # (low, high)
|
| 564 |
+
deviation_std: float
|
| 565 |
+
severity: str # 'minor', 'major', 'critical'
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
@dataclass
|
| 569 |
+
class Correlation:
|
| 570 |
+
"""A detected correlation between two metrics."""
|
| 571 |
+
metric_a: str
|
| 572 |
+
metric_b: str
|
| 573 |
+
category_a: MetricCategory
|
| 574 |
+
category_b: MetricCategory
|
| 575 |
+
coefficient: float # -1 to 1
|
| 576 |
+
strength: str # 'weak', 'moderate', 'strong'
|
| 577 |
+
direction: str # 'positive', 'negative'
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
@dataclass
|
| 581 |
+
class ThresholdCrossing:
|
| 582 |
+
"""A metric crossing a significant threshold."""
|
| 583 |
+
metric_name: str
|
| 584 |
+
category: MetricCategory
|
| 585 |
+
event_id: str
|
| 586 |
+
timestamp: float
|
| 587 |
+
old_value: float
|
| 588 |
+
new_value: float
|
| 589 |
+
threshold: float
|
| 590 |
+
direction: str # 'above', 'below'
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
class MetricsEngine:
|
| 594 |
+
"""
|
| 595 |
+
Quantification engine for the event stream.
|
| 596 |
+
|
| 597 |
+
Extracts numeric metrics from events, tracks them over time,
|
| 598 |
+
detects anomalies, correlations, and threshold crossings.
|
| 599 |
+
|
| 600 |
+
Does NOT interpret or explain. Provides raw quantified data
|
| 601 |
+
for human or AI observers to divine meaning from.
|
| 602 |
+
|
| 603 |
+
Example:
|
| 604 |
+
>>> engine = MetricsEngine(graph)
|
| 605 |
+
>>> engine.ingest(event)
|
| 606 |
+
>>>
|
| 607 |
+
>>> # Get metric statistics
|
| 608 |
+
>>> loss = engine.get_metric("loss")
|
| 609 |
+
>>> print(f"Loss: {loss.current} (delta: {loss.delta}, trend: {loss.trend()})")
|
| 610 |
+
>>>
|
| 611 |
+
>>> # Get anomalies
|
| 612 |
+
>>> for anomaly in engine.anomalies:
|
| 613 |
+
... print(f"ANOMALY: {anomaly.metric_name} = {anomaly.value}")
|
| 614 |
+
>>>
|
| 615 |
+
>>> # Get correlations
|
| 616 |
+
>>> for corr in engine.get_correlations():
|
| 617 |
+
... print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")
|
| 618 |
+
"""
|
| 619 |
+
|
| 620 |
+
def __init__(self, graph: Optional[CausationGraph] = None):
|
| 621 |
+
self.graph = graph
|
| 622 |
+
self._metrics: Dict[str, MetricSeries] = {}
|
| 623 |
+
self._anomalies: List[Anomaly] = []
|
| 624 |
+
self._threshold_crossings: List[ThresholdCrossing] = []
|
| 625 |
+
self._event_count = 0
|
| 626 |
+
|
| 627 |
+
# Configurable thresholds
|
| 628 |
+
self.anomaly_std_threshold = 2.5
|
| 629 |
+
self.correlation_min_samples = 10
|
| 630 |
+
|
| 631 |
+
# Known significant thresholds for ML metrics
|
| 632 |
+
self._known_thresholds = {
|
| 633 |
+
"loss": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
|
| 634 |
+
"accuracy": [0.5, 0.8, 0.9, 0.95, 0.99],
|
| 635 |
+
"lr": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
|
| 636 |
+
"learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
|
| 637 |
+
"grad_norm": [0.1, 1.0, 10.0, 100.0],
|
| 638 |
+
"gradient_norm": [0.1, 1.0, 10.0, 100.0],
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
def ingest(self, event: Event) -> Dict[str, MetricSeries]:
|
| 642 |
+
"""
|
| 643 |
+
Ingest an event and extract/track all numeric metrics.
|
| 644 |
+
|
| 645 |
+
Returns dict of updated metric series.
|
| 646 |
+
"""
|
| 647 |
+
self._event_count += 1
|
| 648 |
+
updated = {}
|
| 649 |
+
|
| 650 |
+
for key, value in event.data.items():
|
| 651 |
+
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
| 652 |
+
category = classify_metric(key)
|
| 653 |
+
|
| 654 |
+
if math.isnan(value) or math.isinf(value):
|
| 655 |
+
# Track NaN/Inf as anomalies but don't add to series
|
| 656 |
+
self._anomalies.append(Anomaly(
|
| 657 |
+
metric_name=key,
|
| 658 |
+
category=category,
|
| 659 |
+
event_id=event.event_id,
|
| 660 |
+
timestamp=event.timestamp,
|
| 661 |
+
value=value,
|
| 662 |
+
expected_range=(0, 0),
|
| 663 |
+
deviation_std=float('inf'),
|
| 664 |
+
severity='critical',
|
| 665 |
+
))
|
| 666 |
+
continue
|
| 667 |
+
|
| 668 |
+
# Get or create metric series with proper category
|
| 669 |
+
if key not in self._metrics:
|
| 670 |
+
self._metrics[key] = MetricSeries(name=key, category=category)
|
| 671 |
+
|
| 672 |
+
series = self._metrics[key]
|
| 673 |
+
old_value = series.current
|
| 674 |
+
|
| 675 |
+
# Add new value
|
| 676 |
+
series.values.append(float(value))
|
| 677 |
+
series.timestamps.append(event.timestamp)
|
| 678 |
+
series.event_ids.append(event.event_id)
|
| 679 |
+
|
| 680 |
+
# Check for anomaly
|
| 681 |
+
if series.is_anomaly(self.anomaly_std_threshold):
|
| 682 |
+
deviation = abs(value - series.mean) / series.std if series.std else 0
|
| 683 |
+
severity = 'critical' if deviation > 4 else 'major' if deviation > 3 else 'minor'
|
| 684 |
+
self._anomalies.append(Anomaly(
|
| 685 |
+
metric_name=key,
|
| 686 |
+
category=category,
|
| 687 |
+
event_id=event.event_id,
|
| 688 |
+
timestamp=event.timestamp,
|
| 689 |
+
value=value,
|
| 690 |
+
expected_range=(
|
| 691 |
+
series.mean - 2*series.std,
|
| 692 |
+
series.mean + 2*series.std
|
| 693 |
+
),
|
| 694 |
+
deviation_std=deviation,
|
| 695 |
+
severity=severity,
|
| 696 |
+
))
|
| 697 |
+
|
| 698 |
+
# Check for threshold crossing
|
| 699 |
+
if old_value is not None:
|
| 700 |
+
self._check_threshold_crossing(
|
| 701 |
+
key, event.event_id, event.timestamp, old_value, value
|
| 702 |
+
)
|
| 703 |
+
|
| 704 |
+
updated[key] = series
|
| 705 |
+
|
| 706 |
+
return updated
|
| 707 |
+
|
| 708 |
+
def _check_threshold_crossing(
|
| 709 |
+
self,
|
| 710 |
+
metric: str,
|
| 711 |
+
event_id: str,
|
| 712 |
+
timestamp: float,
|
| 713 |
+
old_value: float,
|
| 714 |
+
new_value: float
|
| 715 |
+
):
|
| 716 |
+
"""Check if a metric crossed a known threshold."""
|
| 717 |
+
thresholds = self._known_thresholds.get(metric, [])
|
| 718 |
+
category = classify_metric(metric)
|
| 719 |
+
|
| 720 |
+
for threshold in thresholds:
|
| 721 |
+
# Crossed upward
|
| 722 |
+
if old_value < threshold <= new_value:
|
| 723 |
+
self._threshold_crossings.append(ThresholdCrossing(
|
| 724 |
+
metric_name=metric,
|
| 725 |
+
category=category,
|
| 726 |
+
event_id=event_id,
|
| 727 |
+
timestamp=timestamp,
|
| 728 |
+
old_value=old_value,
|
| 729 |
+
new_value=new_value,
|
| 730 |
+
threshold=threshold,
|
| 731 |
+
direction='above',
|
| 732 |
+
))
|
| 733 |
+
# Crossed downward
|
| 734 |
+
elif old_value > threshold >= new_value:
|
| 735 |
+
self._threshold_crossings.append(ThresholdCrossing(
|
| 736 |
+
metric_name=metric,
|
| 737 |
+
category=category,
|
| 738 |
+
event_id=event_id,
|
| 739 |
+
timestamp=timestamp,
|
| 740 |
+
old_value=old_value,
|
| 741 |
+
new_value=new_value,
|
| 742 |
+
threshold=threshold,
|
| 743 |
+
direction='below',
|
| 744 |
+
))
|
| 745 |
+
|
| 746 |
+
def get_metric(self, name: str) -> Optional[MetricSeries]:
|
| 747 |
+
"""Get a metric series by name."""
|
| 748 |
+
return self._metrics.get(name)
|
| 749 |
+
|
| 750 |
+
@property
|
| 751 |
+
def metrics(self) -> Dict[str, MetricSeries]:
|
| 752 |
+
"""All tracked metrics."""
|
| 753 |
+
return self._metrics
|
| 754 |
+
|
| 755 |
+
@property
|
| 756 |
+
def metric_names(self) -> List[str]:
|
| 757 |
+
"""Names of all tracked metrics."""
|
| 758 |
+
return list(self._metrics.keys())
|
| 759 |
+
|
| 760 |
+
@property
|
| 761 |
+
def anomalies(self) -> List[Anomaly]:
|
| 762 |
+
"""All detected anomalies."""
|
| 763 |
+
return self._anomalies
|
| 764 |
+
|
| 765 |
+
@property
|
| 766 |
+
def recent_anomalies(self) -> List[Anomaly]:
|
| 767 |
+
"""Anomalies from last 10 events."""
|
| 768 |
+
if not self._anomalies:
|
| 769 |
+
return []
|
| 770 |
+
recent_ids = set()
|
| 771 |
+
for series in self._metrics.values():
|
| 772 |
+
recent_ids.update(series.event_ids[-10:])
|
| 773 |
+
return [a for a in self._anomalies if a.event_id in recent_ids]
|
| 774 |
+
|
| 775 |
+
@property
|
| 776 |
+
def threshold_crossings(self) -> List[ThresholdCrossing]:
|
| 777 |
+
"""All threshold crossings."""
|
| 778 |
+
return self._threshold_crossings
|
| 779 |
+
|
| 780 |
+
def get_correlations(self, min_coefficient: float = 0.5) -> List[Correlation]:
|
| 781 |
+
"""
|
| 782 |
+
Compute correlations between all metric pairs.
|
| 783 |
+
|
| 784 |
+
Returns correlations with |coefficient| >= min_coefficient.
|
| 785 |
+
"""
|
| 786 |
+
correlations = []
|
| 787 |
+
metric_names = list(self._metrics.keys())
|
| 788 |
+
|
| 789 |
+
for i, name_a in enumerate(metric_names):
|
| 790 |
+
series_a = self._metrics[name_a]
|
| 791 |
+
for name_b in metric_names[i+1:]:
|
| 792 |
+
series_b = self._metrics[name_b]
|
| 793 |
+
coef = self._pearson_correlation(name_a, name_b)
|
| 794 |
+
if coef is not None and abs(coef) >= min_coefficient:
|
| 795 |
+
strength = 'strong' if abs(coef) > 0.8 else 'moderate' if abs(coef) > 0.5 else 'weak'
|
| 796 |
+
direction = 'positive' if coef > 0 else 'negative'
|
| 797 |
+
correlations.append(Correlation(
|
| 798 |
+
metric_a=name_a,
|
| 799 |
+
metric_b=name_b,
|
| 800 |
+
category_a=series_a.category,
|
| 801 |
+
category_b=series_b.category,
|
| 802 |
+
coefficient=coef,
|
| 803 |
+
strength=strength,
|
| 804 |
+
direction=direction,
|
| 805 |
+
))
|
| 806 |
+
|
| 807 |
+
return sorted(correlations, key=lambda c: abs(c.coefficient), reverse=True)
|
| 808 |
+
|
| 809 |
+
def _pearson_correlation(self, name_a: str, name_b: str) -> Optional[float]:
|
| 810 |
+
"""Compute Pearson correlation between two metrics."""
|
| 811 |
+
series_a = self._metrics.get(name_a)
|
| 812 |
+
series_b = self._metrics.get(name_b)
|
| 813 |
+
|
| 814 |
+
if not series_a or not series_b:
|
| 815 |
+
return None
|
| 816 |
+
|
| 817 |
+
# Need enough samples
|
| 818 |
+
if series_a.count < self.correlation_min_samples or series_b.count < self.correlation_min_samples:
|
| 819 |
+
return None
|
| 820 |
+
|
| 821 |
+
# Align by taking min length
|
| 822 |
+
n = min(series_a.count, series_b.count)
|
| 823 |
+
a = series_a.values[-n:]
|
| 824 |
+
b = series_b.values[-n:]
|
| 825 |
+
|
| 826 |
+
# Compute correlation
|
| 827 |
+
mean_a = sum(a) / n
|
| 828 |
+
mean_b = sum(b) / n
|
| 829 |
+
|
| 830 |
+
numerator = sum((a[i] - mean_a) * (b[i] - mean_b) for i in range(n))
|
| 831 |
+
|
| 832 |
+
var_a = sum((x - mean_a) ** 2 for x in a)
|
| 833 |
+
var_b = sum((x - mean_b) ** 2 for x in b)
|
| 834 |
+
|
| 835 |
+
denominator = math.sqrt(var_a * var_b)
|
| 836 |
+
|
| 837 |
+
if denominator == 0:
|
| 838 |
+
return None
|
| 839 |
+
|
| 840 |
+
return numerator / denominator
|
| 841 |
+
|
| 842 |
+
def summary(self) -> Dict[str, Any]:
|
| 843 |
+
"""Get a summary of all metrics and detections."""
|
| 844 |
+
return {
|
| 845 |
+
"event_count": self._event_count,
|
| 846 |
+
"metric_count": len(self._metrics),
|
| 847 |
+
"metrics": {name: series.to_dict() for name, series in self._metrics.items()},
|
| 848 |
+
"metrics_by_category": self.metrics_by_category_summary(),
|
| 849 |
+
"anomaly_count": len(self._anomalies),
|
| 850 |
+
"recent_anomalies": [
|
| 851 |
+
{"metric": a.metric_name, "category": a.category.name, "value": a.value, "severity": a.severity}
|
| 852 |
+
for a in self.recent_anomalies
|
| 853 |
+
],
|
| 854 |
+
"threshold_crossings": len(self._threshold_crossings),
|
| 855 |
+
"correlations": [
|
| 856 |
+
{"a": c.metric_a, "b": c.metric_b, "r": c.coefficient,
|
| 857 |
+
"cat_a": c.category_a.name, "cat_b": c.category_b.name}
|
| 858 |
+
for c in self.get_correlations()[:5] # Top 5
|
| 859 |
+
],
|
| 860 |
+
"health_status": self.health_summary(),
|
| 861 |
+
}
|
| 862 |
+
|
| 863 |
+
# =========================================================================
|
| 864 |
+
# CATEGORY-AWARE QUERIES
|
| 865 |
+
# =========================================================================
|
| 866 |
+
|
| 867 |
+
def get_metrics_by_category(self, category: MetricCategory) -> Dict[str, MetricSeries]:
|
| 868 |
+
"""Get all metrics in a specific category."""
|
| 869 |
+
return {
|
| 870 |
+
name: series for name, series in self._metrics.items()
|
| 871 |
+
if series.category == category
|
| 872 |
+
}
|
| 873 |
+
|
| 874 |
+
def metrics_by_category_summary(self) -> Dict[str, Dict[str, Any]]:
|
| 875 |
+
"""Get metric count and names grouped by category."""
|
| 876 |
+
by_cat: Dict[str, Dict[str, Any]] = {}
|
| 877 |
+
for name, series in self._metrics.items():
|
| 878 |
+
cat_name = series.category.name
|
| 879 |
+
if cat_name not in by_cat:
|
| 880 |
+
by_cat[cat_name] = {"count": 0, "metrics": [], "health": []}
|
| 881 |
+
by_cat[cat_name]["count"] += 1
|
| 882 |
+
by_cat[cat_name]["metrics"].append(name)
|
| 883 |
+
by_cat[cat_name]["health"].append(series.health_status())
|
| 884 |
+
return by_cat
|
| 885 |
+
|
| 886 |
+
def get_training_metrics(self) -> Dict[str, MetricSeries]:
|
| 887 |
+
"""Convenience: get all TRAINING_DYNAMICS metrics."""
|
| 888 |
+
return self.get_metrics_by_category(MetricCategory.TRAINING_DYNAMICS)
|
| 889 |
+
|
| 890 |
+
def get_gradient_metrics(self) -> Dict[str, MetricSeries]:
|
| 891 |
+
"""Convenience: get all GRADIENT_HEALTH metrics."""
|
| 892 |
+
return self.get_metrics_by_category(MetricCategory.GRADIENT_HEALTH)
|
| 893 |
+
|
| 894 |
+
def get_memory_metrics(self) -> Dict[str, MetricSeries]:
|
| 895 |
+
"""Convenience: get all MEMORY_COMPUTE metrics."""
|
| 896 |
+
return self.get_metrics_by_category(MetricCategory.MEMORY_COMPUTE)
|
| 897 |
+
|
| 898 |
+
def get_convergence_metrics(self) -> Dict[str, MetricSeries]:
|
| 899 |
+
"""Convenience: get all CONVERGENCE_SIGNALS metrics."""
|
| 900 |
+
return self.get_metrics_by_category(MetricCategory.CONVERGENCE_SIGNALS)
|
| 901 |
+
|
| 902 |
+
def health_summary(self) -> Dict[str, Any]:
|
| 903 |
+
"""Get overall health status of all metrics."""
|
| 904 |
+
statuses = {"healthy": 0, "warning": 0, "critical": 0, "unknown": 0}
|
| 905 |
+
issues = []
|
| 906 |
+
|
| 907 |
+
for name, series in self._metrics.items():
|
| 908 |
+
status = series.health_status()
|
| 909 |
+
statuses[status] += 1
|
| 910 |
+
if status in ("warning", "critical"):
|
| 911 |
+
issues.append({
|
| 912 |
+
"metric": name,
|
| 913 |
+
"category": series.category.name,
|
| 914 |
+
"status": status,
|
| 915 |
+
"value": series.current,
|
| 916 |
+
"trend": series.trend(),
|
| 917 |
+
})
|
| 918 |
+
|
| 919 |
+
overall = "critical" if statuses["critical"] > 0 else \
|
| 920 |
+
"warning" if statuses["warning"] > 0 else "healthy"
|
| 921 |
+
|
| 922 |
+
return {
|
| 923 |
+
"overall": overall,
|
| 924 |
+
"counts": statuses,
|
| 925 |
+
"issues": issues,
|
| 926 |
+
}
|
| 927 |
+
|
| 928 |
+
def get_cross_category_correlations(self) -> List[Correlation]:
|
| 929 |
+
"""Get correlations between metrics in different categories."""
|
| 930 |
+
all_corr = self.get_correlations(min_coefficient=0.3)
|
| 931 |
+
return [c for c in all_corr if c.category_a != c.category_b]
|
| 932 |
+
|
| 933 |
+
def get_category_coverage(self) -> Dict[str, bool]:
|
| 934 |
+
"""Check which metric categories are being tracked."""
|
| 935 |
+
tracked = {series.category for series in self._metrics.values()}
|
| 936 |
+
return {cat.name: cat in tracked for cat in MetricCategory}
|
| 937 |
+
|
| 938 |
+
# =========================================================================
|
| 939 |
+
# TRIAGE SYSTEM - Common Sense Diagnostics (Occam's Razor)
|
| 940 |
+
# =========================================================================
|
| 941 |
+
#
|
| 942 |
+
# Five questions that matter:
|
| 943 |
+
# 1. Is training working? (loss trend)
|
| 944 |
+
# 2. Is it about to explode? (gradient health)
|
| 945 |
+
# 3. Am I wasting compute? (efficiency)
|
| 946 |
+
# 4. Am I overfitting? (generalization gap)
|
| 947 |
+
# 5. What broke and why? (anomaly + correlation)
|
| 948 |
+
#
|
| 949 |
+
|
| 950 |
+
def triage(self) -> Dict[str, Any]:
|
| 951 |
+
"""
|
| 952 |
+
Quick diagnostic: Is training healthy? What's wrong?
|
| 953 |
+
|
| 954 |
+
Returns a simple, actionable assessment.
|
| 955 |
+
Occam's Razor: simplest useful answer.
|
| 956 |
+
"""
|
| 957 |
+
diagnosis = {
|
| 958 |
+
"status": "LISTENING", # Not UNKNOWN - we're actively waiting
|
| 959 |
+
"confidence": 0.0,
|
| 960 |
+
"checks": {},
|
| 961 |
+
"action": "Collecting initial metrics...",
|
| 962 |
+
"details": [],
|
| 963 |
+
}
|
| 964 |
+
|
| 965 |
+
checks_passed = 0
|
| 966 |
+
checks_total = 0
|
| 967 |
+
|
| 968 |
+
# CHECK 1: Is loss going down?
|
| 969 |
+
loss_check = self._check_loss_progress()
|
| 970 |
+
diagnosis["checks"]["loss_progress"] = loss_check
|
| 971 |
+
checks_total += 1
|
| 972 |
+
if loss_check["ok"]:
|
| 973 |
+
checks_passed += 1
|
| 974 |
+
|
| 975 |
+
# CHECK 2: Are gradients healthy?
|
| 976 |
+
grad_check = self._check_gradient_health()
|
| 977 |
+
diagnosis["checks"]["gradient_health"] = grad_check
|
| 978 |
+
checks_total += 1
|
| 979 |
+
if grad_check["ok"]:
|
| 980 |
+
checks_passed += 1
|
| 981 |
+
|
| 982 |
+
# CHECK 3: Am I using compute efficiently?
|
| 983 |
+
efficiency_check = self._check_efficiency()
|
| 984 |
+
diagnosis["checks"]["efficiency"] = efficiency_check
|
| 985 |
+
checks_total += 1
|
| 986 |
+
if efficiency_check["ok"]:
|
| 987 |
+
checks_passed += 1
|
| 988 |
+
|
| 989 |
+
# CHECK 4: Am I overfitting?
|
| 990 |
+
overfit_check = self._check_overfitting()
|
| 991 |
+
diagnosis["checks"]["overfitting"] = overfit_check
|
| 992 |
+
checks_total += 1
|
| 993 |
+
if overfit_check["ok"]:
|
| 994 |
+
checks_passed += 1
|
| 995 |
+
|
| 996 |
+
# CHECK 5: Any anomalies pointing to root cause?
|
| 997 |
+
anomaly_check = self._check_anomalies()
|
| 998 |
+
diagnosis["checks"]["anomalies"] = anomaly_check
|
| 999 |
+
checks_total += 1
|
| 1000 |
+
if anomaly_check["ok"]:
|
| 1001 |
+
checks_passed += 1
|
| 1002 |
+
|
| 1003 |
+
# Overall status
|
| 1004 |
+
diagnosis["confidence"] = checks_passed / checks_total if checks_total > 0 else 0
|
| 1005 |
+
|
| 1006 |
+
if checks_passed == checks_total:
|
| 1007 |
+
diagnosis["status"] = "HEALTHY"
|
| 1008 |
+
diagnosis["action"] = "Training looks good. Continue monitoring."
|
| 1009 |
+
elif checks_passed >= checks_total * 0.6:
|
| 1010 |
+
diagnosis["status"] = "WARNING"
|
| 1011 |
+
# Find what's wrong
|
| 1012 |
+
issues = [k for k, v in diagnosis["checks"].items() if not v["ok"]]
|
| 1013 |
+
diagnosis["action"] = f"Review: {', '.join(issues)}"
|
| 1014 |
+
else:
|
| 1015 |
+
diagnosis["status"] = "CRITICAL"
|
| 1016 |
+
diagnosis["action"] = "Stop and investigate. Multiple issues detected."
|
| 1017 |
+
|
| 1018 |
+
# Collect all details
|
| 1019 |
+
for check_name, check_result in diagnosis["checks"].items():
|
| 1020 |
+
if check_result.get("detail"):
|
| 1021 |
+
diagnosis["details"].append(f"{check_name}: {check_result['detail']}")
|
| 1022 |
+
|
| 1023 |
+
return diagnosis
|
| 1024 |
+
|
| 1025 |
+
def _check_loss_progress(self) -> Dict[str, Any]:
|
| 1026 |
+
"""Is loss decreasing as expected?"""
|
| 1027 |
+
# Find loss metric (try common names)
|
| 1028 |
+
loss_series = None
|
| 1029 |
+
for name in ["loss", "train_loss", "nll_loss", "ce_loss"]:
|
| 1030 |
+
if name in self._metrics:
|
| 1031 |
+
loss_series = self._metrics[name]
|
| 1032 |
+
break
|
| 1033 |
+
|
| 1034 |
+
if loss_series is None or loss_series.count < 3:
|
| 1035 |
+
return {"ok": True, "detail": "Waiting for loss metrics (need 3+)", "status": "waiting"}
|
| 1036 |
+
|
| 1037 |
+
trend = loss_series.trend()
|
| 1038 |
+
roc = loss_series.rate_of_change()
|
| 1039 |
+
|
| 1040 |
+
if trend == "falling":
|
| 1041 |
+
return {"ok": True, "detail": f"Loss falling (Δ={roc:.4f}/step)", "status": "good"}
|
| 1042 |
+
elif trend == "stable" and loss_series.current < 1.0:
|
| 1043 |
+
return {"ok": True, "detail": f"Loss stable at {loss_series.current:.4f}", "status": "converged"}
|
| 1044 |
+
elif trend == "rising":
|
| 1045 |
+
return {"ok": False, "detail": f"Loss RISING! Current: {loss_series.current:.4f}", "status": "diverging"}
|
| 1046 |
+
elif trend == "volatile":
|
| 1047 |
+
return {"ok": False, "detail": f"Loss unstable (std={loss_series.std:.4f})", "status": "unstable"}
|
| 1048 |
+
else:
|
| 1049 |
+
return {"ok": True, "detail": f"Loss: {loss_series.current:.4f} (trend unclear)", "status": "stable"}
|
| 1050 |
+
|
| 1051 |
+
def _check_gradient_health(self) -> Dict[str, Any]:
|
| 1052 |
+
"""Are gradients in a healthy range?"""
|
| 1053 |
+
grad_series = None
|
| 1054 |
+
for name in ["grad_norm", "gradient_norm", "global_grad_norm"]:
|
| 1055 |
+
if name in self._metrics:
|
| 1056 |
+
grad_series = self._metrics[name]
|
| 1057 |
+
break
|
| 1058 |
+
|
| 1059 |
+
if grad_series is None or grad_series.count < 2:
|
| 1060 |
+
return {"ok": True, "detail": "Waiting for grad_norm metrics", "status": "waiting"}
|
| 1061 |
+
|
| 1062 |
+
current = grad_series.current
|
| 1063 |
+
|
| 1064 |
+
# Vanishing gradients
|
| 1065 |
+
if current < 1e-7:
|
| 1066 |
+
return {"ok": False, "detail": f"VANISHING gradients: {current:.2e}", "status": "vanishing"}
|
| 1067 |
+
|
| 1068 |
+
# Exploding gradients
|
| 1069 |
+
if current > 100:
|
| 1070 |
+
return {"ok": False, "detail": f"EXPLODING gradients: {current:.2f}", "status": "exploding"}
|
| 1071 |
+
|
| 1072 |
+
# Healthy range
|
| 1073 |
+
if 1e-5 < current < 10:
|
| 1074 |
+
return {"ok": True, "detail": f"Gradients healthy: {current:.4f}", "status": "healthy"}
|
| 1075 |
+
|
| 1076 |
+
# Warning zone
|
| 1077 |
+
return {"ok": True, "detail": f"Gradients marginal: {current:.4f}", "status": "marginal"}
|
| 1078 |
+
|
| 1079 |
+
def _check_efficiency(self) -> Dict[str, Any]:
|
| 1080 |
+
"""Am I using compute efficiently?"""
|
| 1081 |
+
# Check MFU (Model FLOP Utilization)
|
| 1082 |
+
mfu_series = self._metrics.get("mfu")
|
| 1083 |
+
if mfu_series and mfu_series.count > 0:
|
| 1084 |
+
mfu = mfu_series.current
|
| 1085 |
+
if mfu < 0.1:
|
| 1086 |
+
return {"ok": False, "detail": f"Low GPU utilization: {mfu*100:.1f}%", "status": "inefficient"}
|
| 1087 |
+
elif mfu < 0.3:
|
| 1088 |
+
return {"ok": True, "detail": f"Moderate efficiency: {mfu*100:.1f}%", "status": "moderate"}
|
| 1089 |
+
else:
|
| 1090 |
+
return {"ok": True, "detail": f"Good efficiency: {mfu*100:.1f}%", "status": "efficient"}
|
| 1091 |
+
|
| 1092 |
+
# Fallback: check timing
|
| 1093 |
+
time_series = self._metrics.get("dt") or self._metrics.get("time") or self._metrics.get("batch_time")
|
| 1094 |
+
if time_series and time_series.count > 2:
|
| 1095 |
+
trend = time_series.trend()
|
| 1096 |
+
if trend == "rising":
|
| 1097 |
+
return {"ok": False, "detail": "Step time increasing (slowdown)", "status": "degrading"}
|
| 1098 |
+
return {"ok": True, "detail": f"Step time: {time_series.current:.3f}s", "status": "stable"}
|
| 1099 |
+
|
| 1100 |
+
return {"ok": True, "detail": "Need mfu or dt/time metrics", "status": "waiting"}
|
| 1101 |
+
|
| 1102 |
+
def _check_overfitting(self) -> Dict[str, Any]:
|
| 1103 |
+
"""Is model overfitting?"""
|
| 1104 |
+
train_loss = None
|
| 1105 |
+
val_loss = None
|
| 1106 |
+
|
| 1107 |
+
# Find train and val loss
|
| 1108 |
+
for name in ["loss", "train_loss"]:
|
| 1109 |
+
if name in self._metrics:
|
| 1110 |
+
train_loss = self._metrics[name]
|
| 1111 |
+
break
|
| 1112 |
+
|
| 1113 |
+
for name in ["val_loss", "eval_loss", "test_loss"]:
|
| 1114 |
+
if name in self._metrics:
|
| 1115 |
+
val_loss = self._metrics[name]
|
| 1116 |
+
break
|
| 1117 |
+
|
| 1118 |
+
if train_loss is None or val_loss is None:
|
| 1119 |
+
return {"ok": True, "detail": "Need train_loss + val_loss to check", "status": "waiting"}
|
| 1120 |
+
|
| 1121 |
+
if train_loss.count < 3 or val_loss.count < 3:
|
| 1122 |
+
return {"ok": True, "detail": f"Collecting ({train_loss.count}/3 train, {val_loss.count}/3 val)", "status": "waiting"}
|
| 1123 |
+
|
| 1124 |
+
gap = val_loss.current - train_loss.current
|
| 1125 |
+
gap_pct = gap / train_loss.current if train_loss.current > 0 else 0
|
| 1126 |
+
|
| 1127 |
+
# Check if gap is widening
|
| 1128 |
+
train_trend = train_loss.trend()
|
| 1129 |
+
val_trend = val_loss.trend()
|
| 1130 |
+
|
| 1131 |
+
if train_trend == "falling" and val_trend == "rising":
|
| 1132 |
+
return {"ok": False, "detail": f"OVERFITTING: train↓ val↑ (gap={gap:.4f})", "status": "overfitting"}
|
| 1133 |
+
|
| 1134 |
+
if gap_pct > 0.5: # Val loss 50% higher than train
|
| 1135 |
+
return {"ok": False, "detail": f"Large generalization gap: {gap_pct*100:.1f}%", "status": "high_gap"}
|
| 1136 |
+
|
| 1137 |
+
if gap_pct > 0.2:
|
| 1138 |
+
return {"ok": True, "detail": f"Moderate gap: {gap_pct*100:.1f}%", "status": "moderate_gap"}
|
| 1139 |
+
|
| 1140 |
+
return {"ok": True, "detail": f"Good generalization (gap={gap:.4f})", "status": "healthy"}
|
| 1141 |
+
|
| 1142 |
+
def _check_anomalies(self) -> Dict[str, Any]:
|
| 1143 |
+
"""Any recent anomalies that need attention?"""
|
| 1144 |
+
recent = self.recent_anomalies
|
| 1145 |
+
|
| 1146 |
+
if not recent:
|
| 1147 |
+
return {"ok": True, "detail": "No anomalies", "status": "clean"}
|
| 1148 |
+
|
| 1149 |
+
critical = [a for a in recent if a.severity == "critical"]
|
| 1150 |
+
major = [a for a in recent if a.severity == "major"]
|
| 1151 |
+
|
| 1152 |
+
if critical:
|
| 1153 |
+
names = list(set(a.metric_name for a in critical))
|
| 1154 |
+
return {"ok": False, "detail": f"CRITICAL anomalies in: {', '.join(names)}", "status": "critical"}
|
| 1155 |
+
|
| 1156 |
+
if major:
|
| 1157 |
+
names = list(set(a.metric_name for a in major))
|
| 1158 |
+
return {"ok": False, "detail": f"Major anomalies in: {', '.join(names)}", "status": "major"}
|
| 1159 |
+
|
| 1160 |
+
return {"ok": True, "detail": f"{len(recent)} minor anomalies", "status": "minor"}
|
| 1161 |
+
|
| 1162 |
+
def quick_status(self) -> str:
|
| 1163 |
+
"""One-line status for dashboards."""
|
| 1164 |
+
t = self.triage()
|
| 1165 |
+
return f"[{t['status']}] {t['action']} (confidence: {t['confidence']*100:.0f}%)"
|
| 1166 |
+
|
| 1167 |
+
def __repr__(self) -> str:
|
| 1168 |
+
return f"<MetricsEngine | {len(self._metrics)} metrics, {len(self._anomalies)} anomalies>"
|
cascade/analysis/tracer.py
ADDED
@@ -0,0 +1,487 @@
"""
Cascade Analysis - Bidirectional Causation Tracer.

Trace cause-effect chains forwards and backwards through time.
Find root causes. Predict cascading effects.
"""

from typing import List, Dict, Any, Optional, Set
from collections import deque
from dataclasses import dataclass, field

from cascade.core.event import Event, CausationLink, CausationChain
from cascade.core.graph import CausationGraph


@dataclass
class RootCauseAnalysis:
    """Results of a root cause analysis."""
    target_event: Event
    root_causes: List[Event]
    chains: List[CausationChain]
    deepest_depth: int = 0
    narrative: str = ""


@dataclass
class ImpactAnalysis:
    """Results of an impact/forward analysis."""
    source_event: Event
    effects: List[Event]
    chains: List[CausationChain]
    total_impact_count: int = 0
    severity_score: float = 0.0
    narrative: str = ""


@dataclass
class CascadePrediction:
    """Prediction of likely cascade from an event."""
    source_event: Event
    predicted_effects: List[Dict[str, Any]]  # [{event_type, probability, time_estimate}, ...]
    risk_score: float = 0.0
    intervention_points: List[str] = field(default_factory=list)
    narrative: str = ""


class Tracer:
    """
    Bidirectional causation tracer.

    Traces cause-effect chains through the causation graph:
    - Backwards: "What caused this?" → find root causes
    - Forwards: "What will this cause?" → predict cascades

    Example:
        >>> tracer = Tracer(graph)
        >>>
        >>> # What caused this gradient explosion?
        >>> causes = tracer.trace_backwards("evt_123")
        >>>
        >>> # What will this learning rate change cause?
        >>> effects = tracer.trace_forwards("evt_456")
        >>>
        >>> # Deep root cause analysis
        >>> roots = tracer.find_root_causes("evt_789")
    """

    def __init__(self, graph: CausationGraph):
        """
        Initialize tracer with a causation graph.

        Args:
            graph: The causation graph to trace through
        """
        self.graph = graph
        self._prediction_model = None  # Future: ML model for predictions

    def trace_backwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
        """
        Trace causation backwards: what caused this event?

        Args:
            event_id: ID of the event to trace from
            max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)

        Returns:
            List of CausationChain objects, one per causal path found
        """
        target = self.graph.get_event(event_id)
        if not target:
            return []

        chains = []
        self._trace_backwards_recursive(event_id, [], [], max_depth, chains)

        # Sort by depth (longest chain first for root cause analysis)
        chains.sort(key=lambda c: c.depth, reverse=True)
        return chains

    def _trace_backwards_recursive(
        self,
        current_id: str,
        current_events: List[Event],
        current_links: List[CausationLink],
        depth_remaining: int,
        results: List[CausationChain],
        visited: Optional[Set[str]] = None
    ) -> None:
        """Recursive helper for backwards tracing."""
        if visited is None:
            visited = set()

        if current_id in visited:
            return  # Avoid cycles
        visited.add(current_id)

        current_event = self.graph.get_event(current_id)
        if not current_event:
            return

        current_events = [current_event] + current_events

        if depth_remaining <= 0:
            # Max depth reached, record this chain
            if len(current_events) > 1:
                results.append(self._build_chain(current_events, current_links))
            return

        causes = self.graph.get_causes(current_id)

        if not causes:
            # This is a root - record the chain
            if len(current_events) >= 1:
                results.append(self._build_chain(current_events, current_links))
            return

        for cause in causes:
            link = self.graph.get_link(cause.event_id, current_id)
            new_links = [link] + current_links if link else current_links

            self._trace_backwards_recursive(
                cause.event_id,
                current_events,
                new_links,
                depth_remaining - 1,
                results,
                visited.copy()
            )

    def trace_forwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
        """
        Trace causation forwards: what will this event cause?

        Args:
            event_id: ID of the event to trace from
            max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)

        Returns:
            List of CausationChain objects, one per effect path found
        """
        source = self.graph.get_event(event_id)
        if not source:
            return []

        chains = []
        self._trace_forwards_recursive(event_id, [], [], max_depth, chains)

        # Sort by depth
        chains.sort(key=lambda c: c.depth, reverse=True)
        return chains

    def _trace_forwards_recursive(
        self,
        current_id: str,
        current_events: List[Event],
        current_links: List[CausationLink],
        depth_remaining: int,
        results: List[CausationChain],
        visited: Optional[Set[str]] = None
    ) -> None:
        """Recursive helper for forwards tracing."""
        if visited is None:
            visited = set()

        if current_id in visited:
            return
        visited.add(current_id)

        current_event = self.graph.get_event(current_id)
        if not current_event:
            return

        current_events = current_events + [current_event]

        if depth_remaining <= 0:
            if len(current_events) > 1:
                results.append(self._build_chain(current_events, current_links))
            return

        effects = self.graph.get_effects(current_id)

        if not effects:
            # This is a leaf - record the chain
            if len(current_events) >= 1:
                results.append(self._build_chain(current_events, current_links))
            return

        for effect in effects:
            link = self.graph.get_link(current_id, effect.event_id)
            new_links = current_links + [link] if link else current_links

            self._trace_forwards_recursive(
                effect.event_id,
                current_events,
                new_links,
                depth_remaining - 1,
                results,
                visited.copy()
            )

    def find_root_causes(self, event_id: str, max_depth: int = 1000) -> RootCauseAnalysis:
        """
        Deep root cause analysis: find the ultimate origins.

        Traces all the way back to find events with no causes.

        Args:
            event_id: ID of the event to analyze
            max_depth: Maximum depth to search (default: 1000 - effectively unlimited)

        Returns:
            RootCauseAnalysis with root causes and narrative
        """
        target = self.graph.get_event(event_id)
        if not target:
            return RootCauseAnalysis(
                target_event=None,
                root_causes=[],
                chains=[],
            )

        chains = self.trace_backwards(event_id, max_depth)

        # Extract root causes (events at the start of chains)
        root_causes = []
        seen = set()
        for chain in chains:
            if chain.events:
                root = chain.events[0]
                if root.event_id not in seen:
                    root_causes.append(root)
                    seen.add(root.event_id)

        # Build narrative
        narrative = self._build_root_cause_narrative(target, root_causes, chains)

        return RootCauseAnalysis(
            target_event=target,
            root_causes=root_causes,
            chains=chains,
            deepest_depth=max(c.depth for c in chains) if chains else 0,
            narrative=narrative,
        )

    def analyze_impact(self, event_id: str, max_depth: int = 1000) -> ImpactAnalysis:
        """
        Impact analysis: what were ALL downstream effects?

        Traces forward to find everything this event set in motion.

        Args:
            event_id: ID of the event to analyze
            max_depth: Maximum depth to search (default: 1000 - effectively unlimited)

        Returns:
            ImpactAnalysis with effects and severity score
        """
        source = self.graph.get_event(event_id)
        if not source:
            return ImpactAnalysis(
                source_event=None,
                effects=[],
                chains=[],
            )

        chains = self.trace_forwards(event_id, max_depth)

        # Extract all effects
        effects = []
        seen = set()
        for chain in chains:
            for event in chain.events[1:]:  # Skip source
                if event.event_id not in seen:
                    effects.append(event)
                    seen.add(event.event_id)

        # Calculate severity
        severity = self._calculate_impact_severity(source, effects)

        # Build narrative
        narrative = self._build_impact_narrative(source, effects, chains)

        return ImpactAnalysis(
            source_event=source,
            effects=effects,
            chains=chains,
            total_impact_count=len(effects),
            severity_score=severity,
            narrative=narrative,
        )

    def predict_cascade(self, event_id: str) -> CascadePrediction:
        """
        Predict likely cascade from this event.

        Uses learned patterns to forecast effects BEFORE they happen.
        This is the "Minority Report" capability.

        Args:
            event_id: ID of the event to predict from

        Returns:
            CascadePrediction with risk scores and intervention points
        """
        source = self.graph.get_event(event_id)
        if not source:
            return CascadePrediction(
                source_event=None,
                predicted_effects=[],
            )

        # Get historical patterns for this event type
        similar_events = self.graph.get_events_by_type(source.event_type)

        # Count what typically follows - use all available history for better predictions
        # No artificial cap - system learns from full history
        effect_counts: Dict[str, int] = {}
        analysis_window = similar_events  # Full history, no slice
        for similar in analysis_window:
            effects = self.graph.get_effects(similar.event_id)
            for effect in effects:
                key = effect.event_type
                effect_counts[key] = effect_counts.get(key, 0) + 1

        # Convert to predictions
        total = len(analysis_window)
        predictions = []
        for event_type, count in sorted(effect_counts.items(), key=lambda x: -x[1]):
            predictions.append({
                "event_type": event_type,
                "probability": count / total if total > 0 else 0,
                "historical_count": count,
            })

        # Calculate risk score
        risk_score = self._calculate_risk_score(source, predictions)

        # Identify intervention points
        intervention_points = self._find_intervention_points(source, predictions)

        return CascadePrediction(
            source_event=source,
            predicted_effects=predictions[:10],  # Top 10
            risk_score=risk_score,
            intervention_points=intervention_points,
            narrative=f"Based on {total} similar events, predicting {len(predictions)} likely effects.",
        )

    def _build_chain(self, events: List[Event], links: List[CausationLink]) -> CausationChain:
        """Build a CausationChain from events and links."""
        total_strength = 1.0
        for link in links:
            total_strength *= link.strength

        return CausationChain(
            events=events,
            links=links,
            total_strength=total_strength,
            depth=len(links),
        )

    def _build_root_cause_narrative(
        self,
        target: Event,
        roots: List[Event],
        chains: List[CausationChain]
    ) -> str:
        """Build human-readable narrative for root cause analysis."""
        if not roots:
            return f"No root causes found for {target.event_type}"

        lines = [f"Root cause analysis for {target.event_type}:"]
        lines.append(f"Found {len(roots)} root cause(s) across {len(chains)} causal chain(s).")
        lines.append("")

        for i, root in enumerate(roots[:5], 1):  # Top 5
            lines.append(f"{i}. {root.component}/{root.event_type}")
            if root.data:
                key_data = list(root.data.items())[:3]
                lines.append(f"   Data: {dict(key_data)}")

        return "\n".join(lines)

    def _build_impact_narrative(
        self,
        source: Event,
        effects: List[Event],
        chains: List[CausationChain]
    ) -> str:
        """Build human-readable narrative for impact analysis."""
        if not effects:
            return f"No downstream effects found for {source.event_type}"

        lines = [f"Impact analysis for {source.event_type}:"]
        lines.append(f"Found {len(effects)} downstream effect(s).")
        lines.append("")

        # Group by event type
        by_type: Dict[str, int] = {}
        for effect in effects:
            by_type[effect.event_type] = by_type.get(effect.event_type, 0) + 1

        for event_type, count in sorted(by_type.items(), key=lambda x: -x[1]):
            lines.append(f"  • {event_type}: {count} occurrence(s)")

        return "\n".join(lines)

    def _calculate_impact_severity(self, source: Event, effects: List[Event]) -> float:
        """Calculate severity score for an impact (0.0 to 1.0)."""
        if not effects:
            return 0.0

        # Factors: number of effects, types of effects
        count_score = min(1.0, len(effects) / 20)  # 20+ effects = max

        # High-severity event types
        severe_types = {'error', 'anomaly', 'crash', 'failure', 'explosion'}
        severe_count = sum(1 for e in effects if e.event_type in severe_types)
        severity_score = min(1.0, severe_count / 5)

        return (count_score + severity_score) / 2

    def _calculate_risk_score(
        self,
        source: Event,
        predictions: List[Dict[str, Any]]
    ) -> float:
        """Calculate risk score for a cascade prediction."""
        if not predictions:
            return 0.0

        # High-risk event types
        risky_types = {'error', 'anomaly', 'crash', 'failure', 'explosion', 'nan', 'overflow'}

        risk = 0.0
        for pred in predictions:
            if pred["event_type"] in risky_types:
                risk += pred["probability"] * 2  # Double weight for risky
            else:
                risk += pred["probability"] * 0.5

        return min(1.0, risk)

    def _find_intervention_points(
        self,
        source: Event,
        predictions: List[Dict[str, Any]]
    ) -> List[str]:
        """Identify points where intervention could prevent bad cascades."""
        points = []

        # Look at source event data for intervention hints
        if 'learning_rate' in source.data:
            points.append("Reduce learning rate")
        if 'gradient' in source.event_type.lower():
            points.append("Apply gradient clipping")
        if source.data.get('loss', 0) > 10:
            points.append("Check loss function / data")

        # Check predictions for severe outcomes
        for pred in predictions:
            if pred["event_type"] == "nan" and pred["probability"] > 0.3:
                points.append("Enable NaN detection early stopping")
            if pred["event_type"] == "overflow" and pred["probability"] > 0.3:
                points.append("Apply gradient scaling")

        return points
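
To make the traversal concrete, here is a hand-rolled sketch: a stub graph exposing only the five methods Tracer actually calls (get_event, get_causes, get_effects, get_link, get_events_by_type), with SimpleNamespace standing in for the real Event and CausationLink classes. A three-event chain a → b → c resolves to root cause a at depth 2.

# Sketch only: the stub graph and stand-in events are illustrative, not the
# real cascade.core.graph.CausationGraph API.
from types import SimpleNamespace
from cascade.analysis.tracer import Tracer  # path assumed from this commit

evts = {
    "a": SimpleNamespace(event_id="a", event_type="lr_change", component="opt", data={}),
    "b": SimpleNamespace(event_id="b", event_type="grad_spike", component="opt", data={}),
    "c": SimpleNamespace(event_id="c", event_type="loss_spike", component="train", data={}),
}

class StubGraph:
    def get_event(self, eid): return evts.get(eid)
    def get_causes(self, eid): return {"b": [evts["a"]], "c": [evts["b"]]}.get(eid, [])
    def get_effects(self, eid): return {"a": [evts["b"]], "b": [evts["c"]]}.get(eid, [])
    def get_link(self, cause_id, effect_id): return SimpleNamespace(strength=0.9)
    def get_events_by_type(self, t): return [e for e in evts.values() if e.event_type == t]

tracer = Tracer(StubGraph())
analysis = tracer.find_root_causes("c")
print([r.event_id for r in analysis.root_causes])  # ['a']
print(analysis.deepest_depth)                      # 2

The visited.copy() in each recursive call is what lets diamond-shaped graphs yield one chain per distinct path rather than pruning shared ancestors globally.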
cascade/bridge.py
ADDED
@@ -0,0 +1,265 @@
"""
HuggingFace → IPFS Bridge

Makes every CASCADE instance a node in the IPFS network.
Serves lattice content to DHT without running a full daemon.

Uses js-ipfs HTTP API compatible endpoints via ipfs-http-client.
For HF Spaces, we use Helia (browser/Node IPFS) style serving.
"""

import json
import hashlib
from pathlib import Path
from typing import Optional, Dict, Any
import threading
import time

# Optional: for full IPFS integration
try:
    import ipfshttpclient
    HAS_IPFS_CLIENT = True
except ImportError:
    HAS_IPFS_CLIENT = False

from cascade.ipld import chain_to_ipld, chain_to_cid, encode_to_dag_cbor


class LatticeServer:
    """
    Serves lattice content over IPFS-compatible protocols.

    Can run in multiple modes:
    1. Gateway mode: HTTP endpoints that mirror IPFS gateway API
    2. DHT mode: Announce content to IPFS DHT (needs daemon)
    3. Hybrid: Both
    """

    def __init__(self, lattice_dir: Optional[Path] = None):
        if lattice_dir is None:
            # Try relative to this file first, then cwd
            candidate = Path(__file__).resolve().parent.parent / "lattice"
            if not candidate.exists():
                candidate = Path.cwd() / "lattice"
            self.lattice_dir = candidate
        else:
            self.lattice_dir = lattice_dir
        self.ipld_dir = self.lattice_dir / "ipld"
        self._index: Dict[str, Path] = {}  # CID -> file path
        self._build_index()

    def _build_index(self):
        """Index all known CIDs to their local files."""
        # Index CBOR files
        if self.ipld_dir.exists():
            for cbor_file in self.ipld_dir.glob("*.cbor"):
                ipld_json = cbor_file.with_suffix(".ipld.json")
                if ipld_json.exists():
                    meta = json.loads(ipld_json.read_text())
                    # Try both 'cid' and '_cid' keys
                    cid = meta.get("cid") or meta.get("_cid")
                    if cid:
                        self._index[cid] = cbor_file

        # Index JSON chain files (compute CID on the fly)
        for json_file in self.lattice_dir.glob("*.json"):
            if json_file.name == "README.md":
                continue
            try:
                chain_data = json.loads(json_file.read_text())
                cid = chain_to_cid(chain_data)
                self._index[cid] = json_file
            except Exception:
                pass

        print(f"Indexed {len(self._index)} CIDs")

    def resolve(self, cid: str) -> Optional[bytes]:
        """Resolve a CID to its content."""
        if cid in self._index:
            filepath = self._index[cid]
            if filepath.suffix == ".cbor":
                return filepath.read_bytes()
            else:
                # JSON file - return as CBOR for consistency
                chain_data = json.loads(filepath.read_text())
                ipld_data = chain_to_ipld(chain_data)
                return encode_to_dag_cbor(ipld_data)
        return None

    def list_cids(self) -> list:
        """List all available CIDs."""
        return list(self._index.keys())

    def get_gateway_response(self, cid: str) -> tuple:
        """
        Return (content, content_type, status_code) for gateway-style serving.
        """
        content = self.resolve(cid)
        if content:
            return (content, "application/cbor", 200)
        return (b"CID not found", "text/plain", 404)

    def announce_to_dht(self, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001"):
        """
        Announce all CIDs to IPFS DHT.
        Requires running IPFS daemon.
        """
        if not HAS_IPFS_CLIENT:
            print("ipfshttpclient not installed. Run: pip install ipfshttpclient")
            return

        try:
            client = ipfshttpclient.connect(ipfs_api)
        except Exception as e:
            print(f"Could not connect to IPFS daemon: {e}")
            print("Start daemon with: ipfs daemon")
            return

        for cid, filepath in self._index.items():
            try:
                # Add file to local IPFS node
                if filepath.suffix == ".cbor":
                    result = client.add(str(filepath))
                    print(f"Announced {filepath.name}: {result['Hash']}")
            except Exception as e:
                print(f"Failed to announce {cid}: {e}")

    def start_gateway(self, host: str = "0.0.0.0", port: int = 8080):
        """
        Start a simple HTTP gateway for serving lattice content.

        Compatible with IPFS gateway URL format:
            GET /ipfs/{cid}
        """
        from http.server import HTTPServer, BaseHTTPRequestHandler

        server = self

        class GatewayHandler(BaseHTTPRequestHandler):
            def do_GET(self):
                # Parse /ipfs/{cid} or just /{cid}
                path = self.path.strip("/")
                if path.startswith("ipfs/"):
                    cid = path[5:]
                else:
                    cid = path

                content, content_type, status = server.get_gateway_response(cid)

                self.send_response(status)
                self.send_header("Content-Type", content_type)
                self.send_header("Content-Length", len(content))
                self.send_header("Access-Control-Allow-Origin", "*")
                self.end_headers()
                self.wfile.write(content)

            def do_HEAD(self):
                path = self.path.strip("/")
                if path.startswith("ipfs/"):
                    cid = path[5:]
                else:
                    cid = path

                _, content_type, status = server.get_gateway_response(cid)

                self.send_response(status)
                self.send_header("Content-Type", content_type)
                self.send_header("Access-Control-Allow-Origin", "*")
                self.end_headers()

            def log_message(self, format, *args):
                print(f"[Gateway] {args[0]}")

        httpd = HTTPServer((host, port), GatewayHandler)
        print(f"Lattice gateway running at http://{host}:{port}")
        print(f"Serving {len(self._index)} CIDs")
        print(f"\nTry: http://localhost:{port}/ipfs/bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei")
        httpd.serve_forever()


def create_gradio_gateway():
    """
    Create a Gradio interface that serves as IPFS gateway.
    Suitable for HuggingFace Spaces deployment.
    """
    try:
        import gradio as gr
    except ImportError:
        print("Gradio not installed. Run: pip install gradio")
        return None

    server = LatticeServer()

    def resolve_cid(cid: str) -> str:
        """Resolve CID and return content as hex + JSON decode attempt."""
        content = server.resolve(cid.strip())
        if content is None:
            return f"❌ CID not found: {cid}\n\nAvailable CIDs:\n" + "\n".join(server.list_cids())

        # Try to decode as CBOR → JSON for display
        try:
            import dag_cbor
            decoded = dag_cbor.decode(content)
            return f"✓ Found! ({len(content)} bytes)\n\n{json.dumps(decoded, indent=2, default=str)}"
        except Exception:
            return f"✓ Found! ({len(content)} bytes)\n\nRaw hex: {content.hex()[:200]}..."

    def list_all() -> str:
        """List all available CIDs."""
        cids = server.list_cids()
        lines = [f"=== Lattice Index ({len(cids)} chains) ===\n"]
        for cid in cids:
            filepath = server._index[cid]
            lines.append(f"• {filepath.stem}")
            lines.append(f"  {cid}\n")
        return "\n".join(lines)

    with gr.Blocks(title="CASCADE Lattice Gateway") as app:
        gr.Markdown("# 🌐 CASCADE Lattice Gateway")
        gr.Markdown("*The neural internetwork, content-addressed.*")

        with gr.Tab("Resolve CID"):
            cid_input = gr.Textbox(
                label="CID",
                placeholder="bafyrei...",
                value="bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"
            )
            resolve_btn = gr.Button("Resolve")
            output = gr.Textbox(label="Content", lines=20)
            resolve_btn.click(resolve_cid, inputs=cid_input, outputs=output)

        with gr.Tab("Browse Lattice"):
            list_btn = gr.Button("List All CIDs")
            list_output = gr.Textbox(label="Available Chains", lines=20)
            list_btn.click(list_all, outputs=list_output)

        gr.Markdown("""
        ---
        **What is this?**

        This gateway serves the CASCADE lattice — a cryptographic provenance network for AI agents.

        Every chain has a CID (Content IDentifier). Same content = same CID. Forever.

        - **Genesis**: `bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei`
        - Protocol: [IPLD](https://ipld.io/) (InterPlanetary Linked Data)
        """)

    return app


if __name__ == "__main__":
    import sys

    if "--gradio" in sys.argv:
        app = create_gradio_gateway()
        if app:
            app.launch()
    elif "--announce" in sys.argv:
        server = LatticeServer()
        server.announce_to_dht()
    else:
        # Default: run HTTP gateway
        server = LatticeServer()
        server.start_gateway(port=8080)
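
For completeness, a client-side sketch against a locally running gateway, assuming python -m cascade.bridge is serving on its default port 8080; the CID is the genesis CID quoted in the module above, and dag_cbor is the same optional decoder the Gradio "Resolve CID" tab uses.

# Sketch only: requires the gateway to be running locally first.
import urllib.request

CID = "bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"
with urllib.request.urlopen(f"http://localhost:8080/ipfs/{CID}") as resp:
    raw = resp.read()
    print(f"{len(raw)} bytes, {resp.headers['Content-Type']}")  # application/cbor

try:
    import dag_cbor  # optional dependency
    print(dag_cbor.decode(raw))  # CBOR → Python dict
except ImportError:
    print(raw[:32].hex())  # fall back to a hex preview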
cascade/cli_main.py
ADDED
@@ -0,0 +1,851 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
CASCADE CLI - Full-featured Rich TUI for cascade-ai.

Exposes all CASCADE capabilities:
- Lattice: stats, list, inspect, chains, pin, export, watch
- Model: observe, fingerprint
- Data: entities, provenance, pii scan
- System: logs, analyze, ingest
- Proxy: start intercepting proxy
"""

import argparse
import sys
import json
from pathlib import Path
from datetime import datetime

# Rich imports with fallback
try:
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich.tree import Tree
    from rich.progress import Progress, SpinnerColumn, TextColumn
    from rich.text import Text
    from rich.markdown import Markdown
    from rich.syntax import Syntax
    from rich import box
    HAS_RICH = True
except ImportError:
    HAS_RICH = False

console = Console() if HAS_RICH else None


# ═══════════════════════════════════════════════════════════════════════════════
# LATTICE COMMANDS
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_stats(args):
    """Show lattice statistics with Rich panels."""
    from cascade.observation import ObservationManager

    manager = ObservationManager()
    stats = manager.get_stats()

    if HAS_RICH:
        stats_table = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
        stats_table.add_column("Key", style="cyan")
        stats_table.add_column("Value", style="green")

        stats_table.add_row("Genesis Root", f"[bold magenta]{stats['genesis_root']}[/]")
        stats_table.add_row("", "")
        stats_table.add_row("Total Observations", str(stats['total_observations']))
        stats_table.add_row("  └─ Model", str(stats['model_observations']))
        stats_table.add_row("  └─ Data", str(stats['data_observations']))
        stats_table.add_row("  └─ System", str(stats['system_observations']))
        stats_table.add_row("", "")
        stats_table.add_row("Registered Models", str(stats['registered_models']))
        stats_table.add_row("Unique Models Observed", str(stats['unique_models']))

        panel = Panel(
            stats_table,
            title="[bold cyan]CASCADE LATTICE[/]",
            subtitle="[dim]The Neural Internetwork[/]",
            border_style="cyan",
        )
        console.print(panel)
    else:
        print(f"""
CASCADE LATTICE STATS
═════════════════════
Genesis Root: {stats['genesis_root']}

Observations:
  Total:  {stats['total_observations']}
  Model:  {stats['model_observations']}
  Data:   {stats['data_observations']}
  System: {stats['system_observations']}

Models:
  Registered: {stats['registered_models']}
  Observed:   {stats['unique_models']}
""")


def cmd_list(args):
    """List recent observations."""
    from cascade.observation import ObservationManager

    manager = ObservationManager()
    observations = manager.list_observations(limit=args.limit)

    if not observations:
        if HAS_RICH:
            console.print("[yellow]No observations yet.[/]")
        else:
            print("No observations yet.")
        return

    if HAS_RICH:
        table = Table(title="Recent Observations", box=box.ROUNDED)
        table.add_column("Type", style="cyan", width=8)
        table.add_column("Source", style="white", max_width=40)
        table.add_column("Merkle Root", style="magenta")
        table.add_column("Time", style="dim")

        for obs in observations:
            obs_type = obs.get('observation_type', '?')[:7]
            source = obs.get('source_id', 'unknown')[:39]
            merkle = obs.get('merkle_root', '?')[:16]
            timestamp = obs.get('timestamp', '')
            if timestamp:
                try:
                    if isinstance(timestamp, (int, float)):
                        timestamp = datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')
                    else:
                        timestamp = str(timestamp)[:8]
                except Exception:
                    timestamp = '?'

            table.add_row(obs_type, source, merkle, timestamp)

        console.print(table)
        console.print(f"[dim]Showing {len(observations)} of {manager.get_stats()['total_observations']}[/]")
    else:
        print(f"\n{'TYPE':<8} {'SOURCE':<40} {'MERKLE ROOT':<20}")
        print("─" * 70)
        for obs in observations:
            print(f"{obs.get('observation_type', '?')[:7]:<8} {obs.get('source_id', '?')[:39]:<40} {obs.get('merkle_root', '?')[:19]:<20}")


def cmd_inspect(args):
    """Inspect a specific observation by merkle root."""
    from cascade.observation import ObservationManager

    manager = ObservationManager()
    obs = manager.get_observation(args.root)

    if not obs:
        if HAS_RICH:
            console.print(f"[red]Observation not found:[/] {args.root}")
        else:
            print(f"Observation not found: {args.root}")
        return

    if HAS_RICH:
        tree = Tree(f"[bold magenta]{args.root}[/]")

        for key, value in obs.items():
            if isinstance(value, dict):
                branch = tree.add(f"[cyan]{key}[/]")
                for k, v in value.items():
                    branch.add(f"[dim]{k}:[/] {v}")
            elif isinstance(value, list):
                branch = tree.add(f"[cyan]{key}[/] ({len(value)} items)")
                for item in value[:5]:
                    branch.add(str(item)[:60])
                if len(value) > 5:
                    branch.add(f"[dim]... and {len(value) - 5} more[/]")
            else:
                tree.add(f"[cyan]{key}:[/] {value}")

        console.print(Panel(tree, title="Observation Details", border_style="magenta"))
    else:
        print(json.dumps(obs, indent=2, default=str))


def cmd_chains(args):
    """List all chains in the lattice."""
    from cascade.viz.lattice_gateway import load_lattice_data

    data = load_lattice_data()
    chains = data.get('chains', [])

    if HAS_RICH:
        table = Table(title="Lattice Chains", box=box.ROUNDED)
        table.add_column("Name", style="cyan")
        table.add_column("Merkle Root", style="magenta")
        table.add_column("Records", justify="right")
        table.add_column("CID", style="dim")

        for chain in chains:
            name = chain.get('name', '?')
            root = chain.get('merkle_root', '?')[:16]
            records = len(chain.get('records', {}))
            cid = chain.get('cid', 'Not pinned')
            if cid and cid != 'Not pinned':
                cid = cid[:20] + '...'

            style = "bold green" if name == 'genesis' else None
            table.add_row(name, root, str(records), cid, style=style)

        console.print(table)
        console.print(f"\n[dim]Genesis: {data.get('genesis_root', 'N/A')}[/]")
    else:
        print(f"Chains in lattice: {len(chains)}")
        for chain in chains:
            print(f"  {chain.get('name')}: {chain.get('merkle_root', '?')[:16]} ({len(chain.get('records', {}))} records)")


def cmd_pin(args):
    """Pin observation to IPFS."""
    from cascade.observation import ObservationManager

    manager = ObservationManager()
    obs = manager.get_observation(args.root)

    if not obs:
        if HAS_RICH:
            console.print(f"[red]Observation not found:[/] {args.root}")
        else:
            print(f"Observation not found: {args.root}")
        return

    if HAS_RICH:
        with console.status("[cyan]Pinning to IPFS...[/]"):
            cid = manager.pin_to_ipfs(obs)

        if cid:
            console.print("[green]✓ Pinned to IPFS[/]")
            console.print(f"  CID: [magenta]{cid}[/]")
            console.print(f"  URL: https://storacha.link/ipfs/{cid}")
        else:
            console.print("[red]✗ Failed to pin[/]")
    else:
        print(f"Pinning {args.root}...")
        cid = manager.pin_to_ipfs(obs)
        if cid:
            print(f"✓ Pinned: {cid}")
        else:
            print("✗ Failed")


def cmd_export(args):
    """Export lattice or chain to file."""
    from cascade.viz.lattice_gateway import load_lattice_data

    data = load_lattice_data()

    if args.chain:
        chains = [c for c in data.get('chains', []) if c['name'] == args.chain]
        if not chains:
            msg = f"Chain not found: {args.chain}"
            console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
            return
        export_data = chains[0]
    else:
        export_data = data

    output = Path(args.output)
    output.write_text(json.dumps(export_data, indent=2, default=str))

    msg = f"✓ Exported to {output}"
    console.print(f"[green]{msg}[/]") if HAS_RICH else print(msg)


def cmd_watch(args):
    """Watch live observations in real-time."""
    from cascade.observation import ObservationManager
    import time

    manager = ObservationManager()
    last_count = 0

    if HAS_RICH:
        console.print("[cyan]Watching for observations... (Ctrl+C to stop)[/]\n")
    else:
        print("Watching... (Ctrl+C to stop)")

    try:
        while True:
            stats = manager.get_stats()
            current = stats['total_observations']

            if current > last_count:
                new_obs = manager.list_observations(limit=current - last_count)
                for obs in reversed(new_obs):
                    if HAS_RICH:
                        console.print(
                            f"[green]●[/] [{datetime.now().strftime('%H:%M:%S')}] "
                            f"[cyan]{obs.get('observation_type', '?')}[/] "
                            f"[white]{obs.get('source_id', '?')[:40]}[/] "
                            f"[magenta]{obs.get('merkle_root', '?')[:16]}[/]"
                        )
                    else:
                        print(f"● {obs.get('observation_type', '?')} {obs.get('merkle_root', '?')[:16]}")
                last_count = current

            time.sleep(1)
    except KeyboardInterrupt:
        msg = "\nStopped watching."
        console.print(f"[yellow]{msg}[/]") if HAS_RICH else print(msg)


# ═══════════════════════════════════════════════════════════════════════════════
# MODEL COMMANDS
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_observe(args):
    """Manually observe a model interaction."""
    from cascade import observe

    result = observe(
        model_id=args.model,
        input_data=args.input,
        output_data=args.output,
        observation_type='model',
    )

    if HAS_RICH:
        console.print("[green]✓ Observed[/]")
        console.print(f"  Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
    else:
        print(f"Observed: {result.get('merkle_root', 'N/A')}")


def cmd_fingerprint(args):
    """Generate model fingerprint."""
    try:
        from cascade.forensics.fingerprints import ModelFingerprinter

        if HAS_RICH:
            with console.status(f"[cyan]Fingerprinting {args.model}...[/]"):
                fp = ModelFingerprinter()
                result = fp.fingerprint(args.model)

            if result:
                table = Table(title=f"Fingerprint: {args.model}", box=box.ROUNDED)
                table.add_column("Property", style="cyan")
                table.add_column("Value", style="white")

                for key, value in result.items():
                    if isinstance(value, dict):
                        value = json.dumps(value)[:50] + '...'
                    table.add_row(str(key), str(value)[:60])

                console.print(table)
            else:
                console.print("[yellow]Could not fingerprint model[/]")
        else:
            fp = ModelFingerprinter()
            result = fp.fingerprint(args.model)
            print(json.dumps(result, indent=2, default=str))
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


# ═══════════════════════════════════════════════════════════════════════════════
# DATA COMMANDS
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_entities(args):
    """Run entity resolution on a file."""
    try:
        from cascade.data.entities import EntityResolver

        if HAS_RICH:
            with console.status(f"[cyan]Resolving entities in {args.file}...[/]"):
                resolver = EntityResolver()
                result = resolver.resolve_file(args.file)

            if result:
                console.print(f"[green]✓ Found {len(result)} entities[/]")

                table = Table(box=box.SIMPLE)
                table.add_column("Entity", style="cyan")
                table.add_column("Type", style="magenta")
                table.add_column("Count", justify="right")

                for entity in result[:20]:
                    table.add_row(
                        str(entity.get('name', '?'))[:30],
                        entity.get('type', '?'),
                        str(entity.get('count', 1))
                    )

                console.print(table)
                if len(result) > 20:
                    console.print(f"[dim]... and {len(result) - 20} more[/]")
        else:
            resolver = EntityResolver()
            result = resolver.resolve_file(args.file)
            print(f"Found {len(result)} entities")
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


def cmd_pii(args):
    """Scan for PII in a file."""
    try:
        from cascade.data.pii import PIIScanner

        if HAS_RICH:
            with console.status(f"[cyan]Scanning {args.file} for PII...[/]"):
                scanner = PIIScanner()
                results = scanner.scan_file(args.file)

            if results:
                console.print(f"[yellow]⚠ Found {len(results)} potential PII instances[/]")

                table = Table(box=box.ROUNDED)
                table.add_column("Type", style="red")
                table.add_column("Value", style="yellow")
                table.add_column("Location", style="dim")

                for pii in results[:20]:
                    val = pii.get('value', '?')
                    table.add_row(
                        pii.get('type', '?'),
                        val[:30] + '...' if len(val) > 30 else val,
                        str(pii.get('location', '?'))
                    )

                console.print(table)
            else:
                console.print("[green]✓ No PII detected[/]")
        else:
            scanner = PIIScanner()
            results = scanner.scan_file(args.file)
            print(f"Found {len(results)} PII instances")
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


def cmd_provenance(args):
    """Show data provenance for a file/dataset."""
    try:
        from cascade.data.provenance import DataProvenance

        if HAS_RICH:
            with console.status("[cyan]Analyzing provenance...[/]"):
                prov = DataProvenance()
                result = prov.analyze(args.path)

            if result:
                tree = Tree(f"[bold cyan]{args.path}[/]")

                if 'hash' in result:
                    tree.add(f"[magenta]Hash:[/] {result['hash']}")
                if 'sources' in result:
                    sources = tree.add("[cyan]Sources[/]")
                    for src in result['sources']:
                        sources.add(str(src))
                if 'transformations' in result:
                    transforms = tree.add("[cyan]Transformations[/]")
                    for t in result['transformations']:
                        transforms.add(str(t))

                console.print(Panel(tree, title="Data Provenance", border_style="cyan"))
        else:
            prov = DataProvenance()
            result = prov.analyze(args.path)
            print(json.dumps(result, indent=2, default=str))
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


# ═══════════════════════════════════════════════════════════════════════════════
# SYSTEM COMMANDS
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_ingest(args):
    """Ingest logs/files into the lattice."""
    try:
        from cascade.system.repo_ingester import RepoIngester

        if HAS_RICH:
            with console.status(f"[cyan]Ingesting {args.path}...[/]"):
                ingester = RepoIngester()
                result = ingester.ingest(args.path)

            console.print("[green]✓ Ingested[/]")
            console.print(f"  Files: {result.get('files', 0)}")
            console.print(f"  Observations: {result.get('observations', 0)}")
            console.print(f"  Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
        else:
            ingester = RepoIngester()
            result = ingester.ingest(args.path)
            print(f"Ingested: {result}")
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


def cmd_analyze(args):
    """Analyze a log file or folder."""
    try:
        from cascade.system.omnidirectional_analyzer import OmnidirectionalAnalyzer

        if HAS_RICH:
            with console.status(f"[cyan]Analyzing {args.path}...[/]"):
                analyzer = OmnidirectionalAnalyzer()
                result = analyzer.analyze(args.path)

            if result:
                console.print(Panel(
                    Syntax(json.dumps(result, indent=2, default=str), "json"),
                    title="Analysis Result",
                    border_style="cyan"
                ))
        else:
            analyzer = OmnidirectionalAnalyzer()
            result = analyzer.analyze(args.path)
            print(json.dumps(result, indent=2, default=str))
    except Exception as e:
        msg = f"Error: {e}"
        console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)


# ═══════════════════════════════════════════════════════════════════════════════
# PROXY & INIT
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_proxy(args):
    """Start the CASCADE proxy server."""
    if HAS_RICH:
        console.print(Panel(
            f"""[cyan]CASCADE Proxy Server[/]

Listening on [bold]{args.host}:{args.port}[/]

Set these environment variables in your app:
[green]
  OPENAI_BASE_URL=http://localhost:{args.port}/v1
  ANTHROPIC_BASE_URL=http://localhost:{args.port}/anthropic
[/]
Press Ctrl+C to stop.""",
            title="🌐 Proxy Mode",
            border_style="cyan",
        ))
    else:
        print(f"CASCADE Proxy on {args.host}:{args.port}")

    from cascade.proxy import run_proxy
    run_proxy(host=args.host, port=args.port, verbose=not args.quiet)


def cmd_init(args):
    """Show initialization instructions."""
    if HAS_RICH:
        md = """
# CASCADE Setup

## Option 1: Auto-Patch (Python)
```python
import cascade
cascade.init()

# Now every call emits a receipt
from openai import OpenAI
client = OpenAI()
client.chat.completions.create(...)  # ← automatically observed
```

## Option 2: Proxy Mode (Any Language)
```bash
cascade proxy --port 7777
```
Then set environment variables:
```bash
export OPENAI_BASE_URL=http://localhost:7777/v1
export ANTHROPIC_BASE_URL=http://localhost:7777/anthropic
```

## Option 3: Manual Observation
```python
from cascade import observe
observe(model_id="my-model", input_data="prompt", output_data="response")
```

---
**Genesis Root:** `89f940c1a4b7aa65`
"""
        console.print(Panel(Markdown(md), title="[bold cyan]CASCADE[/]", border_style="cyan"))
    else:
        print("""
CASCADE - Universal AI Provenance Layer

OPTION 1: Auto-Patch (Python)
    import cascade
    cascade.init()

OPTION 2: Proxy Mode (Any Language)
    cascade proxy
    export OPENAI_BASE_URL=http://localhost:7777/v1

OPTION 3: Manual
    from cascade import observe
    observe(model_id="...", input_data="...", output_data="...")
""")


def cmd_version(args):
    """Show version."""
    try:
        from cascade import __version__
        version = __version__
    except Exception:
        version = "0.1.1"

    if HAS_RICH:
        console.print(f"[cyan]cascade-ai[/] [bold]{version}[/]")
        console.print("[dim]Genesis: 89f940c1a4b7aa65[/]")
    else:
        print(f"cascade-ai {version}")


# ═══════════════════════════════════════════════════════════════════════════════
# HOLD COMMANDS - Inference-Level Halt Protocol
# ═══════════════════════════════════════════════════════════════════════════════

def cmd_hold_status(args):
    """Show HOLD system status."""
    try:
        from cascade.hold import Hold
        hold = Hold.get()

        if HAS_RICH:
            from rich.table import Table

            table = Table(title="🛑 HOLD System Status", box=box.SIMPLE)
            table.add_column("Property", style="cyan")
            table.add_column("Value", style="green")

            table.add_row("Hold Count", str(hold._hold_count))
            table.add_row("Override Count", str(hold._override_count))
            table.add_row("Timeout", f"{hold.timeout}s")
            table.add_row("Auto Accept", str(hold.auto_accept))
            table.add_row("Listeners", str(len(hold._listeners)))
            table.add_row("Last Merkle", hold._last_merkle or "None")
            table.add_row("Current Hold", "Active" if hold._current_hold else "None")

            console.print(table)
        else:
            print(f"HOLD Count: {hold._hold_count}")
            print(f"Override Count: {hold._override_count}")
            print(f"Timeout: {hold.timeout}s")
            print(f"Listeners: {len(hold._listeners)}")
    except Exception as e:
        if HAS_RICH:
            console.print(f"[red]Error: {e}[/]")
        else:
            print(f"Error: {e}")


def cmd_hold_info(args):
    """Show HOLD usage information."""
    info = """
🛑 HOLD - Inference-Level Halt Protocol

HOLD pauses AI inference so humans can observe and intervene.

USAGE IN YOUR CODE:
    from cascade.hold import Hold

    hold = Hold.get()

    # In your inference loop:
    probs = model.predict(observation)

    resolution = hold.yield_point(
        action_probs=probs,
        value=value_estimate,
        observation=obs,
        brain_id="my_model",
        # Optional informational wealth:
        action_labels=["up", "down", "left", "right"],
        latent=model.latent,
        attention=model.attention,
        features=model.features,
        imagination=model.imagine(),
    )

    action = resolution.action              # Final action (AI or override)
    was_override = resolution.was_override  # True if human intervened

REGISTERING LISTENERS:
    def my_handler(hold_point):
        print(f"HOLD: {hold_point.action_probs}")
        # Send to UI, game engine, logger, etc.

    hold.register_listener(my_handler)

RESOLVING HOLDS:
    hold.resolve(action=3, source="human")  # Override with action 3
    hold.accept()                           # Accept AI's choice
"""
    if HAS_RICH:
        console.print(Panel(info, title="[bold red]HOLD[/]", border_style="red"))
    else:
        print(info)


# ═══════════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════════

def main():
    """Main CLI entry point."""
    parser = argparse.ArgumentParser(
        prog="cascade",
        description="CASCADE - Universal AI Provenance Layer",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  cascade stats                Show lattice statistics
  cascade list -n 20           List recent observations
  cascade chains               List all chains
  cascade inspect <root>       Inspect an observation
  cascade watch                Live observation feed
  cascade proxy                Start proxy server
  cascade fingerprint <model>  Fingerprint a model
  cascade pii <file>           Scan file for PII
  cascade ingest <path>        Ingest logs/files
"""
    )
    parser.add_argument("--version", "-v", action="store_true", help="Show version")

    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # ─── Lattice commands ───
    subparsers.add_parser("stats", help="Show lattice statistics").set_defaults(func=cmd_stats)
    subparsers.add_parser("chains", help="List all chains").set_defaults(func=cmd_chains)
    subparsers.add_parser("init", help="Show setup instructions").set_defaults(func=cmd_init)
    subparsers.add_parser("watch", help="Watch live observations").set_defaults(func=cmd_watch)

    list_p = subparsers.add_parser("list", help="List recent observations")
    list_p.add_argument("--limit", "-n", type=int, default=10, help="Number to show")
    list_p.set_defaults(func=cmd_list)

    inspect_p = subparsers.add_parser("inspect", help="Inspect an observation")
    inspect_p.add_argument("root", help="Merkle root to inspect")
    inspect_p.set_defaults(func=cmd_inspect)

    pin_p = subparsers.add_parser("pin", help="Pin observation to IPFS")
    pin_p.add_argument("root", help="Merkle root to pin")
    pin_p.set_defaults(func=cmd_pin)

    export_p = subparsers.add_parser("export", help="Export lattice/chain to JSON")
    export_p.add_argument("--chain", "-c", help="Export specific chain")
    export_p.add_argument("--output", "-o", default="cascade_export.json", help="Output file")
    export_p.set_defaults(func=cmd_export)

    # ─── Model commands ───
    observe_p = subparsers.add_parser("observe", help="Manual observation")
    observe_p.add_argument("--model", "-m", required=True, help="Model ID")
    observe_p.add_argument("--input", "-i", required=True, help="Input data")
    observe_p.add_argument("--output", "-o", required=True, help="Output data")
    observe_p.set_defaults(func=cmd_observe)

    fp_p = subparsers.add_parser("fingerprint", help="Fingerprint a model")
    fp_p.add_argument("model", help="Model name/path")
    fp_p.set_defaults(func=cmd_fingerprint)

    # ─── Data commands ───
    entities_p = subparsers.add_parser("entities", help="Entity resolution")
    entities_p.add_argument("file", help="File to analyze")
    entities_p.set_defaults(func=cmd_entities)

    pii_p = subparsers.add_parser("pii", help="Scan for PII")
    pii_p.add_argument("file", help="File to scan")
    pii_p.set_defaults(func=cmd_pii)

    prov_p = subparsers.add_parser("provenance", help="Data provenance")
    prov_p.add_argument("path", help="File or dataset path")
    prov_p.set_defaults(func=cmd_provenance)

    # ─── System commands ───
    ingest_p = subparsers.add_parser("ingest", help="Ingest logs/files")
    ingest_p.add_argument("path", help="Path to ingest")
    ingest_p.set_defaults(func=cmd_ingest)

    analyze_p = subparsers.add_parser("analyze", help="Analyze logs/files")
    analyze_p.add_argument("path", help="Path to analyze")
    analyze_p.set_defaults(func=cmd_analyze)

    # ─── Proxy ───
    proxy_p = subparsers.add_parser("proxy", help="Start proxy server")
    proxy_p.add_argument("--host", default="0.0.0.0", help="Host to bind")
    proxy_p.add_argument("--port", "-p", type=int, default=7777, help="Port")
    proxy_p.add_argument("--quiet", "-q", action="store_true", help="Quiet mode")
    proxy_p.set_defaults(func=cmd_proxy)

    # ─── HOLD - Inference-Level Halt Protocol ───
    hold_p = subparsers.add_parser("hold", help="Show HOLD usage and API info")
    hold_p.set_defaults(func=cmd_hold_info)

    hold_status_p = subparsers.add_parser("hold-status", help="Show HOLD system status")
    hold_status_p.set_defaults(func=cmd_hold_status)

    # Parse
    args = parser.parse_args()

    if args.version:
        cmd_version(args)
        return

    if not args.command:
        if HAS_RICH:
            console.print(Panel(
                """[cyan]CASCADE[/] - Universal AI Provenance Layer

[bold]Lattice Commands:[/]
  [green]stats[/]        Show lattice statistics
  [green]chains[/]       List all chains
  [green]list[/]         List recent observations
  [green]inspect[/]      Inspect an observation
  [green]watch[/]        Live observation feed
  [green]pin[/]          Pin to IPFS
  [green]export[/]       Export to JSON

[bold]Model Commands:[/]
  [green]observe[/]      Manual observation
  [green]fingerprint[/]  Fingerprint a model

[bold]Data Commands:[/]
  [green]entities[/]     Entity resolution
  [green]pii[/]          PII scanner
  [green]provenance[/]   Data provenance

[bold]System Commands:[/]
  [green]ingest[/]       Ingest files/logs
  [green]analyze[/]      Analyze files

[bold]HOLD (Inference Halt):[/]
  [green]hold[/]         Show HOLD usage and API info
  [green]hold-status[/]  Show HOLD system status

[bold]Other:[/]
  [green]proxy[/]        Start proxy server
  [green]init[/]         Setup instructions

Use [cyan]cascade <command> --help[/] for details.""",
                title="[bold magenta]🌀 CASCADE[/]",
                subtitle="[dim]pip install cascade-ai[/]",
                border_style="magenta",
            ))
        else:
            parser.print_help()
        return

    args.func(args)


if __name__ == "__main__":
    main()
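Because `main()` parses `sys.argv` with argparse, the CLI can also be exercised in-process, e.g. in a smoke test; a minimal sketch, assuming only the subcommand wiring above:

```python
# Minimal sketch: drives the argparse CLI defined above without a subprocess.
import sys
from cascade.cli_main import main

sys.argv = ["cascade", "list", "-n", "5"]  # same tokens as `cascade list -n 5`
main()
```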
cascade/core/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""Cascade Core module - fundamental data structures and algorithms."""

from cascade.core.event import Event, CausationLink, CausationChain
from cascade.core.graph import CausationGraph
from cascade.core.adapter import SymbioticAdapter

__all__ = [
    "Event",
    "CausationLink",
    "CausationChain",
    "CausationGraph",
    "SymbioticAdapter",
]
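A minimal sketch of the re-exported core API in use; the names come from `__all__` above, and the `Event` attributes are assumed from how `adapter.py` below itself reads them (`event.component`, `event.event_type`):

```python
# Minimal sketch: SymbioticAdapter.interpret() returns an Event (see adapter.py).
from cascade.core import SymbioticAdapter

adapter = SymbioticAdapter()
event = adapter.interpret({"loss": 0.5, "epoch": 10})
# With no 'component'/'type' keys, _interpret_dict falls back to defaults:
print(event.component, event.event_type, event.data)
# -> unknown state_change {'loss': 0.5, 'epoch': 10}
```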
cascade/core/adapter.py
ADDED
@@ -0,0 +1,470 @@
"""
Cascade Core - Symbiotic Adapter.

The heart of Cascade's system-agnostic design. The adapter uses Kleene fixed-point
convergence to interpret ANY signal format and convert it to Events.

"It doesn't hook into your system — it becomes part of it."
"""

import time
import json
import re
from typing import Any, Dict, List, Optional, Callable, Type
from dataclasses import dataclass

from cascade.core.event import Event


@dataclass
class SignalPattern:
    """A learned pattern for interpreting signals."""
    pattern_type: str  # 'dict', 'string', 'tensor', 'protobuf', etc.
    component: str
    event_type: str
    extractor: Optional[Callable[[Any], Dict[str, Any]]] = None
    confidence: float = 0.0
    match_count: int = 0


class SymbioticAdapter:
    """
    Self-interpreting adapter that converges to any signal format.

    The adapter observes signals from the host system and learns how to
    interpret them through fixed-point iteration. It starts with naive
    interpretations and refines them until stable.

    This is the key to Cascade's system-agnostic design:
    - No framework-specific hooks required
    - No configuration needed
    - Feed it ANY signal format, it adapts

    Example:
        >>> adapter = SymbioticAdapter()
        >>>
        >>> # Feed it different signal formats
        >>> adapter.interpret({"loss": 0.5, "epoch": 10})
        >>> adapter.interpret("2024-01-01 12:00:00 ERROR training failed")
        >>> adapter.interpret(torch.tensor([0.1, 0.2, 0.3]))
        >>>
        >>> # It learns patterns and gets better at interpretation
        >>> print(adapter.learned_patterns)
    """

    def __init__(self):
        """Initialize the symbiotic adapter."""
        self._patterns: List[SignalPattern] = []
        self._signal_count = 0
        self._interpretation_cache: Dict[str, SignalPattern] = {}

        # Built-in interpreters for common formats
        self._builtin_interpreters = {
            dict: self._interpret_dict,
            str: self._interpret_string,
            list: self._interpret_list,
        }

        # Regex patterns for log line parsing
        self._log_patterns = [
            # ISO timestamp with level: "2024-01-01 12:00:00 ERROR message"
            re.compile(r'^(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
            # Simple timestamp: "12:00:00.123 component message"
            re.compile(r'^(\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
            # Pipe-delimited: "timestamp|level|component|key:value"
            re.compile(r'^([^|]+)\|(\w+)\|(\w+)\|(.*)$'),
        ]

        # Metric extraction patterns - ONLY extract real training metrics
        # Be strict to avoid extracting garbage from config lines
        self._metric_patterns = [
            # Standard training metrics with = or :
            re.compile(r'\b(loss|val_loss|train_loss|accuracy|acc|val_acc|lr|learning_rate|epoch|step|iter|iteration|mfu|tokens_per_sec|samples_per_sec|grad_norm|perplexity|ppl)[=:]\s*([+-]?\d+\.?\d*(?:e[+-]?\d+)?)', re.I),
            # "iter X: loss=Y" format from nanoGPT
            re.compile(r'iter\s+(\d+).*loss[=:]?\s*([+-]?\d+\.?\d*)', re.I),
            # "step X loss Y" format
            re.compile(r'step\s+(\d+).*loss\s*[=:]?\s*([+-]?\d+\.?\d*)', re.I),
        ]

    def interpret(self, signal: Any) -> Event:
        """
        Interpret any signal into a Cascade Event.

        Uses Kleene fixed-point iteration to converge on the best interpretation.

        Args:
            signal: Any signal from the host system

        Returns:
            Event: The interpreted event
        """
        self._signal_count += 1

        # Get signal type
        signal_type = type(signal)

        # Try cached pattern first
        cache_key = self._get_cache_key(signal)
        if cache_key in self._interpretation_cache:
            pattern = self._interpretation_cache[cache_key]
            pattern.match_count += 1
            return self._apply_pattern(signal, pattern)

        # Try built-in interpreter
        if signal_type in self._builtin_interpreters:
            event = self._builtin_interpreters[signal_type](signal)
            self._learn_pattern(signal, event)
            return event

        # Try tensor-like objects (duck typing)
        if hasattr(signal, 'numpy') or hasattr(signal, 'detach'):
            event = self._interpret_tensor(signal)
            self._learn_pattern(signal, event)
            return event

        # Try protobuf-like objects
        if hasattr(signal, 'SerializeToString'):
            event = self._interpret_protobuf(signal)
            self._learn_pattern(signal, event)
            return event

        # Fallback: convert to string and interpret
        event = self._interpret_string(str(signal))
        return event

    def _interpret_dict(self, signal: Dict[str, Any]) -> Event:
        """Interpret a dictionary signal."""
        # Extract common fields
        timestamp = signal.get('timestamp', signal.get('time', time.time()))
        if isinstance(timestamp, str):
            try:
                from datetime import datetime
                timestamp = datetime.fromisoformat(timestamp).timestamp()
            except Exception:
                timestamp = time.time()

        component = signal.get('component', signal.get('source', 'unknown'))
        event_type = signal.get('event_type', signal.get('type', 'state_change'))

        # Everything else goes in data
        reserved = {'timestamp', 'time', 'component', 'source', 'event_type', 'type'}
        data = {k: v for k, v in signal.items() if k not in reserved}

        return Event(
            timestamp=timestamp,
            component=component,
            event_type=event_type,
            data=data,
            source_signal=signal,
        )

    def _interpret_string(self, signal: str) -> Event:
        """Interpret a string signal (log line, message, etc.)."""
        signal = signal.strip()

        # Try each log pattern
        for pattern in self._log_patterns:
            match = pattern.match(signal)
            if match:
                groups = match.groups()
                if len(groups) >= 3:
                    timestamp_str, level_or_component, rest = groups[0], groups[1], groups[-1]

                    # Parse timestamp
                    try:
                        from datetime import datetime
                        timestamp = datetime.fromisoformat(timestamp_str.replace(' ', 'T')).timestamp()
                    except Exception:
                        timestamp = time.time()

                    # Extract metrics from the rest
                    data = self._extract_metrics(rest)
                    data['raw_message'] = rest

                    # Determine event type from keywords
                    event_type = self._infer_event_type(signal)

                    return Event(
                        timestamp=timestamp,
                        component=level_or_component.lower(),
                        event_type=event_type,
                        data=data,
                        source_signal=signal,
                    )

        # Fallback: extract what we can with smarter component detection
        data = self._extract_metrics(signal)
        data['raw_message'] = signal

        # Infer component from content
        component = self._infer_component(signal)

        return Event(
            timestamp=time.time(),
            component=component,
            event_type=self._infer_event_type(signal),
            data=data,
            source_signal=signal,
        )

    def _interpret_list(self, signal: List[Any]) -> Event:
        """Interpret a list signal."""
        # Convert to dict with indices
        data = {f'item_{i}': v for i, v in enumerate(signal)}
        data['length'] = len(signal)

        # Check if it looks like numeric data
        if all(isinstance(x, (int, float)) for x in signal):
            data['mean'] = sum(signal) / len(signal) if signal else 0
            data['min'] = min(signal) if signal else 0
            data['max'] = max(signal) if signal else 0

        return Event(
            timestamp=time.time(),
            component='data',
            event_type='list_signal',
            data=data,
            source_signal=signal,
        )

    def _interpret_tensor(self, signal: Any) -> Event:
        """Interpret a tensor-like signal (PyTorch, NumPy, etc.)."""
        # Try to get numpy array
        try:
            if hasattr(signal, 'detach'):
                arr = signal.detach().cpu().numpy()
            elif hasattr(signal, 'numpy'):
                arr = signal.numpy()
            else:
                arr = signal

            data = {
                'shape': list(arr.shape) if hasattr(arr, 'shape') else [],
                'dtype': str(arr.dtype) if hasattr(arr, 'dtype') else 'unknown',
                'mean': float(arr.mean()) if hasattr(arr, 'mean') else 0,
                'std': float(arr.std()) if hasattr(arr, 'std') else 0,
                'min': float(arr.min()) if hasattr(arr, 'min') else 0,
                'max': float(arr.max()) if hasattr(arr, 'max') else 0,
            }

            # Check for NaN/Inf (common in gradient explosions)
            if hasattr(arr, 'isnan'):
                data['has_nan'] = bool(arr.isnan().any())
            if hasattr(arr, 'isinf'):
                data['has_inf'] = bool(arr.isinf().any())

        except Exception as e:
            data = {'error': str(e), 'type': str(type(signal))}

        return Event(
            timestamp=time.time(),
            component='tensor',
            event_type='tensor_signal',
            data=data,
            source_signal=None,  # Don't store tensor to save memory
        )

    def _interpret_protobuf(self, signal: Any) -> Event:
        """Interpret a protobuf-like signal."""
        try:
            # Try to convert to dict
            if hasattr(signal, 'DESCRIPTOR'):
                from google.protobuf.json_format import MessageToDict
                data = MessageToDict(signal)
            else:
                data = {'raw': str(signal)}
        except Exception:
            data = {'raw': str(signal)}

        return Event(
            timestamp=time.time(),
            component='protobuf',
            event_type='protobuf_signal',
            data=data,
            source_signal=None,
        )

    def _extract_metrics(self, text: str) -> Dict[str, Any]:
        """Extract numeric metrics from text - STRICT, only real training metrics."""
        metrics = {}

        # nanoGPT format: "iter 0: loss=4.2176, time 46.76ms, mfu 0.62%"
        nano_match = re.search(r'iter\s+(\d+).*loss[=:]?\s*([\d.]+)', text, re.I)
        if nano_match:
            metrics['iter'] = int(nano_match.group(1))
            metrics['loss'] = float(nano_match.group(2))

        # Diffusers/tqdm format: "step_loss=0.1234" or "step_loss: 0.1234"
        step_loss_match = re.search(r'step_loss[=:]\s*([\d.e+-]+)', text, re.I)
        if step_loss_match:
            metrics['loss'] = float(step_loss_match.group(1))

        # train_loss format from accelerator.log
        train_loss_match = re.search(r'train_loss[=:]\s*([\d.e+-]+)', text, re.I)
        if train_loss_match:
            metrics['loss'] = float(train_loss_match.group(1))

        # tqdm progress format: " 5%|█    | 5/100 [00:30<09:30, step_loss=0.234, lr=1e-5]"
        tqdm_match = re.search(r'(\d+)%\|.*\|\s*(\d+)/(\d+)', text)
        if tqdm_match:
            metrics['progress_pct'] = int(tqdm_match.group(1))
            metrics['step'] = int(tqdm_match.group(2))
            metrics['total_steps'] = int(tqdm_match.group(3))

        # Generic loss patterns
        generic_loss = re.search(r'\bloss[=:]\s*([\d.e+-]+)', text, re.I)
        if generic_loss and 'loss' not in metrics:
            metrics['loss'] = float(generic_loss.group(1))

        # mfu extraction
        mfu_match = re.search(r'mfu\s*[=:]?\s*([\d.]+)%?', text, re.I)
        if mfu_match:
            metrics['mfu'] = float(mfu_match.group(1))

        # time extraction (ms)
        time_match = re.search(r'time\s*[=:]?\s*([\d.]+)\s*ms', text, re.I)
        if time_match:
            metrics['time_ms'] = float(time_match.group(1))

        # learning rate - multiple formats
        lr_match = re.search(r'\b(?:lr|learning_rate)\s*[=:]\s*([\d.e+-]+)', text, re.I)
        if lr_match:
            metrics['lr'] = float(lr_match.group(1))

        # epoch/step for other frameworks
        epoch_match = re.search(r'\bepoch\s*[=:]\s*(\d+)', text, re.I)
        if epoch_match:
            metrics['epoch'] = int(epoch_match.group(1))

        step_match = re.search(r'\bstep\s*[=:]\s*(\d+)', text, re.I)
        if step_match and 'step' not in metrics:
            metrics['step'] = int(step_match.group(1))

        # global_step from diffusers
        global_step_match = re.search(r'global_step[=:]\s*(\d+)', text, re.I)
        if global_step_match:
            metrics['step'] = int(global_step_match.group(1))

        return metrics
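
    # A hedged illustration of the extraction above on a nanoGPT-style line
    # (the numbers are invented for the example):
    #
    #     _extract_metrics("iter 40: loss=3.1058, time 45.20ms, mfu 0.71%")
    #     -> {'iter': 40, 'loss': 3.1058, 'mfu': 0.71, 'time_ms': 45.2}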
    def _infer_event_type(self, text: str) -> str:
        """Infer event type from text content."""
        text_lower = text.lower()

        # Training iteration logs (highest priority)
        if re.search(r'iter\s+\d+.*loss', text_lower):
            return 'training_step'
        if re.search(r'step\s+\d+.*loss', text_lower):
            return 'training_step'

        if any(kw in text_lower for kw in ['error', 'exception', 'failed', 'crash']):
            return 'error'
        if any(kw in text_lower for kw in ['warning', 'warn']):
            return 'warning'
        if any(kw in text_lower for kw in ['gradient', 'backward']):
            return 'training'
        if 'loss' in text_lower and 'val' in text_lower:
            return 'validation'
        if any(kw in text_lower for kw in ['inference', 'predict', 'forward']):
            return 'inference'
        if any(kw in text_lower for kw in ['epoch', 'step', 'iteration', 'iter']):
            return 'progress'
        if any(kw in text_lower for kw in ['nan', 'inf', 'explode', 'overflow']):
            return 'anomaly'
        if any(kw in text_lower for kw in ['save', 'checkpoint', 'load', 'saving']):
            return 'checkpoint'
        if any(kw in text_lower for kw in ['config', 'setting', 'parameter', 'device', 'gpu', 'cuda']):
            return 'config'
        if any(kw in text_lower for kw in ['initializ', 'loading model', 'compiling']):
            return 'init'

        return 'state_change'

    def _infer_component(self, text: str) -> str:
        """Infer component from text content - NO MORE 'unknown'."""
        text_lower = text.lower()

        # Training/optimizer related
        if any(kw in text_lower for kw in ['iter', 'step', 'epoch', 'batch']):
            return 'trainer'
        if any(kw in text_lower for kw in ['loss', 'backward', 'gradient']):
            return 'loss'
        if any(kw in text_lower for kw in ['optim', 'adam', 'sgd', 'lr', 'learning']):
            return 'optimizer'
        if any(kw in text_lower for kw in ['model', 'layer', 'param', 'weight']):
            return 'model'
        if any(kw in text_lower for kw in ['data', 'batch', 'loader', 'dataset']):
            return 'data'
        if any(kw in text_lower for kw in ['cuda', 'gpu', 'device', 'memory']):
            return 'device'
        if any(kw in text_lower for kw in ['checkpoint', 'save', 'load']):
            return 'checkpoint'
        if any(kw in text_lower for kw in ['config', 'setting', 'override']):
            return 'config'
        if any(kw in text_lower for kw in ['eval', 'valid', 'test']):
            return 'evaluator'
        if any(kw in text_lower for kw in ['token', 'vocab', 'embed']):
            return 'tokenizer'

        return 'system'  # Generic fallback, not "unknown"

    def _get_cache_key(self, signal: Any) -> str:
        """Generate a cache key for a signal's structure."""
        if isinstance(signal, dict):
            # Key based on dict keys
            return f"dict:{':'.join(sorted(signal.keys()))}"
        elif isinstance(signal, str):
            # Key based on first word
            first_word = signal.split()[0] if signal.split() else ''
            return f"str:{first_word[:20]}"
        else:
            return f"type:{type(signal).__name__}"

    def _learn_pattern(self, signal: Any, event: Event) -> None:
        """Learn a pattern from a successful interpretation."""
        cache_key = self._get_cache_key(signal)
        pattern = SignalPattern(
            pattern_type=type(signal).__name__,
            component=event.component,
            event_type=event.event_type,
            confidence=0.5,
            match_count=1,
        )
        self._interpretation_cache[cache_key] = pattern
self._patterns.append(pattern)
|
| 435 |
+
|
| 436 |
+
def _apply_pattern(self, signal: Any, pattern: SignalPattern) -> Event:
|
| 437 |
+
"""Apply a learned pattern to interpret a signal."""
|
| 438 |
+
# Re-interpret with learned hints - use direct interpreters to avoid recursion
|
| 439 |
+
if isinstance(signal, dict):
|
| 440 |
+
event = self._interpret_dict(signal)
|
| 441 |
+
# Apply learned component/type if more confident
|
| 442 |
+
if pattern.confidence > 0.7:
|
| 443 |
+
return Event(
|
| 444 |
+
timestamp=event.timestamp,
|
| 445 |
+
component=pattern.component,
|
| 446 |
+
event_type=pattern.event_type,
|
| 447 |
+
data=event.data,
|
| 448 |
+
source_signal=signal,
|
| 449 |
+
)
|
| 450 |
+
return event
|
| 451 |
+
elif isinstance(signal, str):
|
| 452 |
+
return self._interpret_string(signal)
|
| 453 |
+
elif isinstance(signal, list):
|
| 454 |
+
return self._interpret_list(signal)
|
| 455 |
+
else:
|
| 456 |
+
# Fallback: interpret as string without recursion
|
| 457 |
+
return self._interpret_string(str(signal))
|
| 458 |
+
|
| 459 |
+
@property
|
| 460 |
+
def learned_patterns(self) -> List[SignalPattern]:
|
| 461 |
+
"""Get all learned signal patterns."""
|
| 462 |
+
return sorted(self._patterns, key=lambda p: p.match_count, reverse=True)
|
| 463 |
+
|
| 464 |
+
@property
|
| 465 |
+
def signal_count(self) -> int:
|
| 466 |
+
"""Total number of signals interpreted."""
|
| 467 |
+
return self._signal_count
|
| 468 |
+
|
| 469 |
+
def __repr__(self) -> str:
|
| 470 |
+
return f"<SymbioticAdapter | {self._signal_count} signals, {len(self._patterns)} patterns>"
|
cascade/core/event.py
ADDED
@@ -0,0 +1,177 @@
"""
Cascade Core - Event and CausationLink primitives.

These are the fundamental data structures that represent causation.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Any, Optional
from datetime import datetime
import time
import uuid


def _generate_event_id() -> str:
    """Generate a unique event ID with timestamp prefix for ordering."""
    timestamp = int(time.time() * 1000000)
    unique = uuid.uuid4().hex[:8]
    return f"evt_{timestamp}_{unique}"


@dataclass
class Event:
    """
    A discrete event in the causation graph.

    Events are the nodes in your causation graph. Each event represents
    something that happened in your system at a point in time.

    Attributes:
        event_id: Unique identifier (auto-generated if not provided)
        timestamp: Unix timestamp when event occurred
        component: Which system component generated this event
        event_type: Category of event (e.g., 'training', 'inference', 'error')
        data: Arbitrary key-value data associated with the event
        source_signal: The original signal that created this event (for debugging)

    Example:
        >>> event = Event(
        ...     timestamp=time.time(),
        ...     component="neural_network",
        ...     event_type="gradient_explosion",
        ...     data={"layer": "fc3", "magnitude": 1e12}
        ... )
    """
    timestamp: float
    component: str
    event_type: str
    data: Dict[str, Any] = field(default_factory=dict)
    event_id: str = field(default_factory=_generate_event_id)
    source_signal: Optional[Any] = field(default=None, repr=False)

    def __post_init__(self):
        """Ensure timestamp is float."""
        if isinstance(self.timestamp, datetime):
            self.timestamp = self.timestamp.timestamp()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize event to dictionary."""
        return {
            "event_id": self.event_id,
            "timestamp": self.timestamp,
            "component": self.component,
            "event_type": self.event_type,
            "data": self.data,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "Event":
        """Deserialize event from dictionary."""
        return cls(
            event_id=d.get("event_id", _generate_event_id()),
            timestamp=d["timestamp"],
            component=d["component"],
            event_type=d["event_type"],
            data=d.get("data", {}),
        )

    def __hash__(self):
        return hash(self.event_id)

    def __eq__(self, other):
        if isinstance(other, Event):
            return self.event_id == other.event_id
        return False
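

# A minimal sketch (illustrative, not part of the commit) of the serialization
# round-trip above; equality and hashing are by event_id, so the restored copy
# compares equal to the original.
def _example_event_roundtrip() -> Event:
    evt = Event(timestamp=time.time(), component="trainer",
                event_type="training_step", data={"loss": 2.31, "step": 40})
    clone = Event.from_dict(evt.to_dict())
    assert clone == evt and hash(clone) == hash(evt)
    return clone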


@dataclass
class CausationLink:
    """
    A causal relationship between two events.

    Links are the edges in your causation graph. Each link represents
    a cause-effect relationship: event A caused event B.

    Attributes:
        from_event: ID of the causing event
        to_event: ID of the caused event
        causation_type: How the causation was detected
            - 'temporal': A happened shortly before B
            - 'correlation': A and B metrics moved together
            - 'threshold': A crossed a threshold triggering B
            - 'direct': Explicit causation declared in code
        strength: Confidence in the causal relationship (0.0 to 1.0)
        explanation: Human-readable explanation of the link
        metrics_involved: Which metrics connect these events

    Example:
        >>> link = CausationLink(
        ...     from_event="evt_123",
        ...     to_event="evt_456",
        ...     causation_type="threshold",
        ...     strength=0.95,
        ...     explanation="Loss exceeded 10.0, triggering gradient clipping"
        ... )
    """
    from_event: str
    to_event: str
    causation_type: str  # 'temporal', 'correlation', 'threshold', 'direct'
    strength: float = 1.0
    explanation: str = ""
    metrics_involved: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Validate strength is in range."""
        self.strength = max(0.0, min(1.0, self.strength))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize link to dictionary."""
        return {
            "from_event": self.from_event,
            "to_event": self.to_event,
            "causation_type": self.causation_type,
            "strength": self.strength,
            "explanation": self.explanation,
            "metrics_involved": self.metrics_involved,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "CausationLink":
        """Deserialize link from dictionary."""
        return cls(
            from_event=d["from_event"],
            to_event=d["to_event"],
            causation_type=d["causation_type"],
            strength=d.get("strength", 1.0),
            explanation=d.get("explanation", ""),
            metrics_involved=d.get("metrics_involved", []),
        )


@dataclass
class CausationChain:
    """
    A chain of causal events from origin to destination.

    Represents a full causal path through the graph.

    Attributes:
        events: List of events in causal order
        links: List of links connecting the events
        total_strength: Combined strength of all links
        depth: Number of hops in the chain
        narrative: Human-readable story of what happened
    """
    events: List[Event]
    links: List[CausationLink]
    total_strength: float = 1.0
    depth: int = 0
    narrative: str = ""

    def __post_init__(self):
        self.depth = len(self.links)
        if not self.total_strength and self.links:
            # Calculate combined strength
            self.total_strength = 1.0
            for link in self.links:
                self.total_strength *= link.strength
cascade/core/graph.py
ADDED
@@ -0,0 +1,292 @@
"""
Cascade Core - Causation Graph Engine.

The graph stores events and their causal relationships, enabling
bidirectional traversal through time.
"""

import threading
from typing import Dict, List, Optional, Set, Any, Iterator
from collections import defaultdict
from datetime import datetime

try:
    import networkx as nx
    HAS_NETWORKX = True
except ImportError:
    HAS_NETWORKX = False

from cascade.core.event import Event, CausationLink


class CausationGraph:
    """
    A directed graph of causal relationships between events.

    The graph enables bidirectional traversal:
    - Backwards: "What caused this event?"
    - Forwards: "What did this event cause?"

    Thread-safe for concurrent event ingestion.

    Example:
        >>> graph = CausationGraph()
        >>> graph.add_event(event1)
        >>> graph.add_event(event2)
        >>> graph.add_link(CausationLink(
        ...     from_event=event1.event_id,
        ...     to_event=event2.event_id,
        ...     causation_type="temporal",
        ...     strength=0.9
        ... ))
        >>>
        >>> # Find what caused event2
        >>> causes = graph.get_causes(event2.event_id)
    """

    def __init__(self):
        """Initialize an empty causation graph."""
        self._lock = threading.RLock()

        # Event storage
        self._events: Dict[str, Event] = {}
        self._events_by_component: Dict[str, List[str]] = defaultdict(list)
        self._events_by_type: Dict[str, List[str]] = defaultdict(list)
        self._events_by_time: List[str] = []  # Ordered by timestamp

        # Link storage
        self._links: Dict[str, CausationLink] = {}  # link_id -> link
        self._causes: Dict[str, Set[str]] = defaultdict(set)  # event_id -> set of cause event_ids
        self._effects: Dict[str, Set[str]] = defaultdict(set)  # event_id -> set of effect event_ids

        # NetworkX graph for advanced algorithms (optional)
        if HAS_NETWORKX:
            self._nx_graph = nx.DiGraph()
        else:
            self._nx_graph = None

        # Statistics
        self._event_count = 0
        self._link_count = 0

    def add_event(self, event: Event) -> None:
        """
        Add an event to the graph.

        Thread-safe. Automatically detects potential causations with recent events.

        Args:
            event: The event to add
        """
        with self._lock:
            if event.event_id in self._events:
                return  # Already exists

            self._events[event.event_id] = event
            self._events_by_component[event.component].append(event.event_id)
            self._events_by_type[event.event_type].append(event.event_id)
            self._events_by_time.append(event.event_id)
            self._event_count += 1

            if self._nx_graph is not None:
                self._nx_graph.add_node(event.event_id, **event.to_dict())

    def add_link(self, link: CausationLink) -> None:
        """
        Add a causal link between two events.

        Thread-safe.

        Args:
            link: The causation link to add
        """
        with self._lock:
            link_id = f"{link.from_event}->{link.to_event}"

            if link_id in self._links:
                # Update existing link if new one is stronger
                if link.strength > self._links[link_id].strength:
                    self._links[link_id] = link
                return

            self._links[link_id] = link
            self._causes[link.to_event].add(link.from_event)
            self._effects[link.from_event].add(link.to_event)
            self._link_count += 1

            if self._nx_graph is not None:
                self._nx_graph.add_edge(
                    link.from_event,
                    link.to_event,
                    **link.to_dict()
                )

    def get_event(self, event_id: str) -> Optional[Event]:
        """Get an event by ID."""
        with self._lock:
            return self._events.get(event_id)

    def get_causes(self, event_id: str) -> List[Event]:
        """
        Get all events that directly caused this event.

        Args:
            event_id: ID of the effect event

        Returns:
            List of causing events
        """
        with self._lock:
            cause_ids = self._causes.get(event_id, set())
            return [self._events[cid] for cid in cause_ids if cid in self._events]

    def get_effects(self, event_id: str) -> List[Event]:
        """
        Get all events that were directly caused by this event.

        Args:
            event_id: ID of the cause event

        Returns:
            List of effect events
        """
        with self._lock:
            effect_ids = self._effects.get(event_id, set())
            return [self._events[eid] for eid in effect_ids if eid in self._events]

    def get_link(self, from_event: str, to_event: str) -> Optional[CausationLink]:
        """Get the causation link between two events."""
        with self._lock:
            link_id = f"{from_event}->{to_event}"
            return self._links.get(link_id)

    def get_all_links(self) -> List[CausationLink]:
        """Get all causal links in the graph."""
        with self._lock:
            return list(self._links.values())

    def get_component_connections(self) -> Dict[str, Dict[str, float]]:
        """
        Aggregate causal links into component-to-component connections.

        Returns:
            Dict mapping (from_component, to_component) -> total strength
        """
        with self._lock:
            connections: Dict[tuple, float] = {}

            for link in self._links.values():
                from_event = self._events.get(link.from_event)
                to_event = self._events.get(link.to_event)

                if from_event and to_event:
                    from_comp = from_event.component
                    to_comp = to_event.component

                    if from_comp != to_comp:  # Skip self-links
                        key = (from_comp, to_comp)
                        connections[key] = connections.get(key, 0) + link.strength

            return connections

    def get_recent_events(self, count: int = 100) -> List[Event]:
        """Get the most recent events by timestamp."""
        with self._lock:
            ids = self._events_by_time[-count:]
            return [self._events[eid] for eid in reversed(ids)]

    def get_events_by_component(self, component: str) -> List[Event]:
        """Get all events from a specific component."""
        with self._lock:
            ids = self._events_by_component.get(component, [])
            return [self._events[eid] for eid in ids]

    def get_events_by_type(self, event_type: str) -> List[Event]:
        """Get all events of a specific type."""
        with self._lock:
            ids = self._events_by_type.get(event_type, [])
            return [self._events[eid] for eid in ids]

    def find_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
        """
        Find the shortest causal path between two events.

        Uses NetworkX if available, otherwise falls back to BFS.

        Args:
            from_event: Starting event ID
            to_event: Target event ID

        Returns:
            List of event IDs in the path, or None if no path exists
        """
        with self._lock:
            if self._nx_graph is not None:
                try:
                    return nx.shortest_path(self._nx_graph, from_event, to_event)
                except nx.NetworkXNoPath:
                    return None
                except nx.NodeNotFound:
                    return None
            else:
                # BFS fallback
                return self._bfs_path(from_event, to_event)

    def _bfs_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
        """BFS path finding without NetworkX."""
        from collections import deque

        if from_event not in self._events or to_event not in self._events:
            return None

        queue = deque([(from_event, [from_event])])
        visited = {from_event}

        while queue:
            current, path = queue.popleft()

            if current == to_event:
                return path

            for effect_id in self._effects.get(current, set()):
                if effect_id not in visited:
                    visited.add(effect_id)
                    queue.append((effect_id, path + [effect_id]))

        return None

    def get_root_events(self) -> List[Event]:
        """Get events with no causes (entry points)."""
        with self._lock:
            roots = []
            for event_id, event in self._events.items():
                if not self._causes.get(event_id):
                    roots.append(event)
            return sorted(roots, key=lambda e: e.timestamp)

    def get_leaf_events(self) -> List[Event]:
        """Get events with no effects (endpoints)."""
        with self._lock:
            leaves = []
            for event_id, event in self._events.items():
                if not self._effects.get(event_id):
                    leaves.append(event)
            return sorted(leaves, key=lambda e: e.timestamp, reverse=True)

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the graph."""
        with self._lock:
            return {
                "event_count": self._event_count,
                "link_count": self._link_count,
                "components": list(self._events_by_component.keys()),
                "event_types": list(self._events_by_type.keys()),
                "root_count": len(self.get_root_events()),
                "leaf_count": len(self.get_leaf_events()),
            }

    def __len__(self) -> int:
        return self._event_count

    def __repr__(self) -> str:
        return f"<CausationGraph | {self._event_count} events, {self._link_count} links>"
cascade/core/provenance.py
ADDED
@@ -0,0 +1,601 @@
"""
CASCADE // PROVENANCE ENGINE
Cryptographic lineage tracking for neural network activations.

Due process infrastructure for AI - immutable evidence chains
that enable governance without prescribing decisions.

Architecture:
    Input → [Layer₀] → [Layer₁] → ... → [Layerₙ] → Output
              │           │                 │
              ▼           ▼                 ▼
            Hash₀ ──►   Hash₁ ──► ... ──► Hashₙ
              │                             │
              └──────── Merkle Root ────────┘

Each hash includes:
- Tensor state (sampled for efficiency)
- Parent hashes (inputs to this layer)
- Layer identity (name, params hash)
- Execution context (order, timestamp)

This creates verifiable, tamper-evident records of
what happened inside the network.
"""

import hashlib
import json
import time
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional, Any, Tuple
from collections import OrderedDict
import numpy as np


@dataclass
class ProvenanceRecord:
    """Immutable record of a single layer's activation state."""

    # Identity
    layer_name: str
    layer_idx: int

    # Lineage
    state_hash: str  # Hash of this layer's output
    parent_hashes: List[str]  # Hashes of inputs (usually 1, but attention has multiple)
    params_hash: Optional[str] = None  # Hash of layer weights (frozen reference)

    # Tensor metadata
    shape: List[int] = field(default_factory=list)
    dtype: str = "float32"

    # Statistics (for visualization, not hashed)
    stats: Dict[str, float] = field(default_factory=dict)

    # Execution context
    execution_order: int = 0
    timestamp: float = field(default_factory=time.time)

    # Merkle tree position
    merkle_depth: int = 0
    merkle_path: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for JSON export."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceRecord':
        """Deserialize from JSON."""
        return cls(**data)
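

# A small sketch (illustrative, not part of the commit): records round-trip
# cleanly through asdict/from_dict; the hashes here are placeholder strings.
def _example_record_roundtrip() -> bool:
    rec = ProvenanceRecord(layer_name="fc1", layer_idx=0,
                           state_hash="ab12" * 4, parent_hashes=["cd34" * 4],
                           shape=[1, 128], stats={"mean": 0.0, "std": 1.0})
    return ProvenanceRecord.from_dict(rec.to_dict()) == rec  # True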


@dataclass
class ProvenanceChain:
    """Complete provenance chain for a forward pass."""

    # Session identity
    session_id: str
    model_id: str
    model_hash: str

    # Input/output
    input_hash: str
    output_hash: Optional[str] = None

    # The chain itself
    records: Dict[str, ProvenanceRecord] = field(default_factory=OrderedDict)

    # External system roots (for inter-system linking)
    # When this chain depends on another system's computation,
    # include their merkle_root here. This creates the lattice.
    external_roots: List[str] = field(default_factory=list)

    # Merkle root (computed after chain complete)
    merkle_root: Optional[str] = None

    # Metadata
    created_at: float = field(default_factory=time.time)
    finalized: bool = False

    def add_record(self, record: ProvenanceRecord) -> None:
        """Add a record to the chain. Chain must not be finalized."""
        if self.finalized:
            raise ValueError("Cannot add to finalized chain")
        self.records[record.layer_name] = record

    def finalize(self) -> str:
        """Compute Merkle root and lock the chain."""
        if self.finalized:
            return self.merkle_root

        # Build Merkle tree from record hashes + external roots
        # External roots create cryptographic proof of inter-system dependency
        hashes = [r.state_hash for r in self.records.values()]
        hashes.extend(self.external_roots)  # Include external system roots
        self.merkle_root = compute_merkle_root(hashes)
        self.finalized = True
        return self.merkle_root

    def verify(self) -> Tuple[bool, Optional[str]]:
        """Verify chain integrity."""
        if not self.finalized:
            return False, "Chain not finalized"

        # Recompute Merkle root (including external roots)
        hashes = [r.state_hash for r in self.records.values()]
        hashes.extend(self.external_roots)  # Must include external roots
        computed_root = compute_merkle_root(hashes)

        if computed_root != self.merkle_root:
            return False, f"Merkle root mismatch: {computed_root} != {self.merkle_root}"

        return True, None

    def link_external(self, external_merkle_root: str, source_id: str = None) -> None:
        """
        Link this chain to another system's merkle root.

        This creates the neural internetwork - cryptographic proof
        that this computation depended on another system's output.

        Args:
            external_merkle_root: The merkle root from the external system
            source_id: Optional identifier of the source system
        """
        if self.finalized:
            raise ValueError("Cannot link external root to finalized chain")
        self.external_roots.append(external_merkle_root)

    def get_lineage(self, layer_name: str) -> List[ProvenanceRecord]:
        """Trace back from a layer to its ancestors."""
        if layer_name not in self.records:
            return []

        lineage = []
        current = self.records[layer_name]
        visited = set()

        def trace_back(record: ProvenanceRecord):
            if record.layer_name in visited:
                return
            visited.add(record.layer_name)
            lineage.append(record)

            for parent_hash in record.parent_hashes:
                # Find record with this hash
                for r in self.records.values():
                    if r.state_hash == parent_hash:
                        trace_back(r)
                        break

        trace_back(current)
        return lineage

    def to_dict(self) -> Dict[str, Any]:
        """Serialize entire chain."""
        return {
            "session_id": self.session_id,
            "model_id": self.model_id,
            "model_hash": self.model_hash,
            "input_hash": self.input_hash,
            "output_hash": self.output_hash,
            "external_roots": self.external_roots,  # Inter-system links
            "merkle_root": self.merkle_root,
            "created_at": self.created_at,
            "finalized": self.finalized,
            "records": {k: v.to_dict() for k, v in self.records.items()}
        }

    def to_json(self, indent: int = 2) -> str:
        """Export as JSON."""
        return json.dumps(self.to_dict(), indent=indent)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceChain':
        """Deserialize from dict."""
        records = OrderedDict()
        for k, v in data.get("records", {}).items():
            records[k] = ProvenanceRecord.from_dict(v)

        chain = cls(
            session_id=data["session_id"],
            model_id=data["model_id"],
            model_hash=data["model_hash"],
            input_hash=data["input_hash"],
            output_hash=data.get("output_hash"),
            external_roots=data.get("external_roots", []),  # Inter-system links
            merkle_root=data.get("merkle_root"),
            created_at=data.get("created_at", time.time()),
            finalized=data.get("finalized", False),
        )
        chain.records = records
        return chain
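

# A minimal sketch (illustrative, not part of the commit) of the inter-system
# lattice described in link_external above: a downstream chain folds an
# upstream chain's Merkle root in before finalizing. All hashes are
# placeholder strings.
def _example_lattice_link():
    upstream = ProvenanceChain(session_id="up01", model_id="upstream-model",
                               model_hash="m" * 32, input_hash="i" * 16)
    upstream.add_record(ProvenanceRecord(layer_name="out", layer_idx=0,
                                         state_hash="s" * 16, parent_hashes=[]))
    upstream_root = upstream.finalize()

    downstream = ProvenanceChain(session_id="dn01", model_id="downstream-model",
                                 model_hash="n" * 32, input_hash="j" * 16)
    downstream.link_external(upstream_root, source_id="upstream-model")
    downstream.add_record(ProvenanceRecord(layer_name="out", layer_idx=0,
                                           state_hash="t" * 16,
                                           parent_hashes=[upstream_root]))
    downstream.finalize()
    return downstream.verify()  # (True, None)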


# =============================================================================
# HASHING FUNCTIONS
# =============================================================================

def hash_tensor(tensor, sample_size: int = 1000) -> str:
    """
    Compute deterministic hash of tensor state.

    Samples tensor for efficiency - full hash would be too slow
    for large activations. Sample is deterministic (first N elements
    after flatten) so hash is reproducible.

    Args:
        tensor: PyTorch tensor or numpy array
        sample_size: Number of elements to sample

    Returns:
        16-character hex hash
    """
    # Convert to numpy if needed
    if hasattr(tensor, 'detach'):
        # PyTorch tensor
        arr = tensor.detach().cpu().float().numpy()
    elif hasattr(tensor, 'numpy'):
        arr = tensor.numpy()
    else:
        arr = np.array(tensor)

    # Flatten and sample
    flat = arr.flatten()
    sample = flat[:min(sample_size, len(flat))]

    # Hash the bytes
    # Include shape in hash so same values in different shapes hash differently
    shape_bytes = str(arr.shape).encode('utf-8')
    tensor_bytes = sample.astype(np.float32).tobytes()

    combined = shape_bytes + tensor_bytes
    return hashlib.sha256(combined).hexdigest()[:16]


def hash_params(module) -> str:
    """
    Hash a module's parameters (weights, biases).

    This creates a frozen reference to the model state at observation time.
    If weights change, this hash changes.
    """
    param_hashes = []

    for name, param in module.named_parameters(recurse=False):
        if param is not None:
            h = hash_tensor(param.data, sample_size=500)
            param_hashes.append(f"{name}:{h}")

    if not param_hashes:
        return "no_params"

    combined = "|".join(sorted(param_hashes))
    return hashlib.sha256(combined.encode()).hexdigest()[:16]


def hash_model(model) -> str:
    """
    Hash entire model state.

    This is the model's identity hash - changes if any weight changes.
    """
    all_hashes = []

    for name, param in model.named_parameters():
        h = hash_tensor(param.data, sample_size=100)
        all_hashes.append(f"{name}:{h}")

    combined = "|".join(all_hashes)
    return hashlib.sha256(combined.encode()).hexdigest()[:32]


def hash_input(data: Any) -> str:
    """
    Hash input data (text, tokens, images, etc).
    """
    if isinstance(data, str):
        return hashlib.sha256(data.encode('utf-8')).hexdigest()[:16]
    elif hasattr(data, 'detach'):
        return hash_tensor(data)
    elif isinstance(data, dict):
        # Tokenizer output
        combined = json.dumps({k: str(v) for k, v in sorted(data.items())})
        return hashlib.sha256(combined.encode()).hexdigest()[:16]
    else:
        return hashlib.sha256(str(data).encode()).hexdigest()[:16]


def compute_merkle_root(hashes: List[str]) -> str:
    """
    Compute Merkle root from list of hashes.

    Standard Merkle tree construction - pairs hashes bottom-up
    until single root remains.
    """
    if not hashes:
        return hashlib.sha256(b"empty").hexdigest()[:16]

    if len(hashes) == 1:
        return hashes[0]

    # Pad to even length
    if len(hashes) % 2 == 1:
        hashes = hashes + [hashes[-1]]

    # Compute next level
    next_level = []
    for i in range(0, len(hashes), 2):
        combined = hashes[i] + hashes[i + 1]
        next_hash = hashlib.sha256(combined.encode()).hexdigest()[:16]
        next_level.append(next_hash)

    return compute_merkle_root(next_level)
# =============================================================================
|
| 338 |
+
# PROVENANCE TRACKER (attaches to model)
|
| 339 |
+
# =============================================================================
|
| 340 |
+
|
| 341 |
+
class ProvenanceTracker:
|
| 342 |
+
"""
|
| 343 |
+
Tracks provenance during model forward pass.
|
| 344 |
+
|
| 345 |
+
Usage:
|
| 346 |
+
tracker = ProvenanceTracker(model, model_id="gpt2")
|
| 347 |
+
tracker.start_session(input_text)
|
| 348 |
+
|
| 349 |
+
# Run forward pass - hooks capture everything
|
| 350 |
+
output = model(**inputs)
|
| 351 |
+
|
| 352 |
+
chain = tracker.finalize_session()
|
| 353 |
+
print(chain.merkle_root)
|
| 354 |
+
|
| 355 |
+
NEW: Now writes to tape file (JSONL) for redundant logging!
|
| 356 |
+
Correlative with the Live Tracer - both systems log independently.
|
| 357 |
+
"""
|
| 358 |
+
|
| 359 |
+
def __init__(self, model, model_id: str, log_dir: str = "./logs"):
|
| 360 |
+
self.model = model
|
| 361 |
+
self.model_id = model_id
|
| 362 |
+
self.model_hash = hash_model(model)
|
| 363 |
+
|
| 364 |
+
self.hooks = []
|
| 365 |
+
self.current_chain: Optional[ProvenanceChain] = None
|
| 366 |
+
self.execution_counter = 0
|
| 367 |
+
self.last_hash = None # Track for parent linking
|
| 368 |
+
self.layer_hashes: Dict[str, str] = {} # layer_name -> hash
|
| 369 |
+
|
| 370 |
+
# === TAPE FILE FOR REDUNDANT LOGGING ===
|
| 371 |
+
from pathlib import Path
|
| 372 |
+
from threading import Lock
|
| 373 |
+
self._log_dir = Path(log_dir)
|
| 374 |
+
self._log_dir.mkdir(parents=True, exist_ok=True)
|
| 375 |
+
self._session_id = int(time.time())
|
| 376 |
+
self._tape_path = self._log_dir / f"provenance_tape_{self._session_id}.jsonl"
|
| 377 |
+
self._tape_file = None
|
| 378 |
+
self._tape_lock = Lock()
|
| 379 |
+
self._record_count = 0
|
| 380 |
+
|
| 381 |
+
def start_session(self, input_data: Any) -> str:
|
| 382 |
+
"""Start a new provenance tracking session."""
|
| 383 |
+
import uuid
|
| 384 |
+
|
| 385 |
+
session_id = str(uuid.uuid4())[:8]
|
| 386 |
+
input_hash = hash_input(input_data)
|
| 387 |
+
|
| 388 |
+
self.current_chain = ProvenanceChain(
|
| 389 |
+
session_id=session_id,
|
| 390 |
+
model_id=self.model_id,
|
| 391 |
+
model_hash=self.model_hash,
|
| 392 |
+
input_hash=input_hash
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
self.execution_counter = 0
|
| 396 |
+
self.last_hash = input_hash
|
| 397 |
+
self.layer_hashes = {"input": input_hash}
|
| 398 |
+
|
| 399 |
+
# Register hooks
|
| 400 |
+
self._register_hooks()
|
| 401 |
+
|
| 402 |
+
return session_id
|
| 403 |
+
|
| 404 |
+
def _register_hooks(self):
|
| 405 |
+
"""Register forward hooks on all modules."""
|
| 406 |
+
self._remove_hooks() # Clean up any existing
|
| 407 |
+
|
| 408 |
+
for name, module in self.model.named_modules():
|
| 409 |
+
if name: # Skip root
|
| 410 |
+
hook = module.register_forward_hook(
|
| 411 |
+
self._make_hook(name)
|
| 412 |
+
)
|
| 413 |
+
self.hooks.append(hook)
|
| 414 |
+
|
| 415 |
+
def _make_hook(self, layer_name: str):
|
| 416 |
+
"""Create a forward hook for a specific layer."""
|
| 417 |
+
def hook(module, inp, out):
|
| 418 |
+
# Extract tensor
|
| 419 |
+
tensor = None
|
| 420 |
+
if hasattr(out, 'detach'):
|
| 421 |
+
tensor = out
|
| 422 |
+
elif isinstance(out, tuple) and len(out) > 0 and hasattr(out[0], 'detach'):
|
| 423 |
+
tensor = out[0]
|
| 424 |
+
elif hasattr(out, 'last_hidden_state'):
|
| 425 |
+
tensor = out.last_hidden_state
|
| 426 |
+
elif hasattr(out, 'logits'):
|
| 427 |
+
tensor = out.logits
|
| 428 |
+
|
| 429 |
+
if tensor is None or not hasattr(tensor, 'numel') or tensor.numel() == 0:
|
| 430 |
+
return
|
| 431 |
+
|
| 432 |
+
# Compute hashes
|
| 433 |
+
state_hash = hash_tensor(tensor)
|
| 434 |
+
params_hash = hash_params(module)
|
| 435 |
+
|
| 436 |
+
# Determine parent hashes
|
| 437 |
+
# For now, use last layer's hash. More sophisticated: track actual data flow.
|
| 438 |
+
parent_hashes = [self.last_hash] if self.last_hash else []
|
| 439 |
+
|
| 440 |
+
# Compute stats
|
| 441 |
+
t = tensor.float()
|
| 442 |
+
stats = {
|
| 443 |
+
"mean": t.mean().item(),
|
| 444 |
+
"std": t.std().item(),
|
| 445 |
+
"min": t.min().item(),
|
| 446 |
+
"max": t.max().item(),
|
| 447 |
+
"sparsity": (tensor == 0).float().mean().item(),
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
# Create record
|
| 451 |
+
record = ProvenanceRecord(
|
| 452 |
+
layer_name=layer_name,
|
| 453 |
+
layer_idx=self.execution_counter,
|
| 454 |
+
state_hash=state_hash,
|
| 455 |
+
parent_hashes=parent_hashes,
|
| 456 |
+
params_hash=params_hash,
|
| 457 |
+
shape=list(tensor.shape),
|
| 458 |
+
dtype=str(tensor.dtype),
|
| 459 |
+
stats=stats,
|
| 460 |
+
execution_order=self.execution_counter,
|
| 461 |
+
)
|
| 462 |
+
|
| 463 |
+
# Add to chain
|
| 464 |
+
if self.current_chain:
|
| 465 |
+
self.current_chain.add_record(record)
|
| 466 |
+
|
| 467 |
+
# === WRITE TO TAPE (REDUNDANT LOGGING) ===
|
| 468 |
+
self._write_to_tape(record)
|
| 469 |
+
|
| 470 |
+
# Update tracking
|
| 471 |
+
self.last_hash = state_hash
|
| 472 |
+
self.layer_hashes[layer_name] = state_hash
|
| 473 |
+
self.execution_counter += 1
|
| 474 |
+
self._record_count += 1
|
| 475 |
+
|
| 476 |
+
return hook
|
| 477 |
+
|
| 478 |
+
def _write_to_tape(self, record: ProvenanceRecord):
|
| 479 |
+
"""Write provenance record to tape file for redundant logging."""
|
| 480 |
+
import json
|
| 481 |
+
try:
|
| 482 |
+
with self._tape_lock:
|
| 483 |
+
if self._tape_file is None:
|
| 484 |
+
self._tape_file = open(self._tape_path, "a", encoding="utf-8")
|
| 485 |
+
print(f"[CASCADE] 📼 Provenance tape started: {self._tape_path}")
|
| 486 |
+
|
| 487 |
+
tape_record = {
|
| 488 |
+
"seq": self._record_count,
|
| 489 |
+
"record": record.to_dict(),
|
| 490 |
+
"session_id": self._session_id,
|
| 491 |
+
"model_id": self.model_id,
|
| 492 |
+
}
|
| 493 |
+
self._tape_file.write(json.dumps(tape_record, default=str) + "\n")
|
| 494 |
+
self._tape_file.flush()
|
| 495 |
+
except Exception as e:
|
| 496 |
+
pass # Don't let tape errors break the main flow
|
| 497 |
+
|
| 498 |
+
def close_tape(self):
|
| 499 |
+
"""Close the tape file."""
|
| 500 |
+
with self._tape_lock:
|
| 501 |
+
if self._tape_file:
|
| 502 |
+
self._tape_file.close()
|
| 503 |
+
self._tape_file = None
|
| 504 |
+
print(f"[CASCADE] 📼 Provenance tape closed: {self._record_count} records → {self._tape_path}")
|
| 505 |
+
|
| 506 |
+
def get_tape_path(self):
|
| 507 |
+
"""Get the current tape file path."""
|
| 508 |
+
return self._tape_path
|
| 509 |
+
|
| 510 |
+
def _remove_hooks(self):
|
| 511 |
+
"""Remove all registered hooks."""
|
| 512 |
+
for hook in self.hooks:
|
| 513 |
+
hook.remove()
|
| 514 |
+
self.hooks = []
|
| 515 |
+
|
| 516 |
+
def finalize_session(self, output_data: Any = None) -> ProvenanceChain:
|
| 517 |
+
"""Finalize session, compute Merkle root, return chain."""
|
| 518 |
+
self._remove_hooks()
|
| 519 |
+
|
| 520 |
+
if self.current_chain is None:
|
| 521 |
+
raise ValueError("No active session")
|
| 522 |
+
|
| 523 |
+
if output_data is not None:
|
| 524 |
+
self.current_chain.output_hash = hash_input(output_data)
|
| 525 |
+
|
| 526 |
+
self.current_chain.finalize()
|
| 527 |
+
|
| 528 |
+
# Close tape (session complete)
|
| 529 |
+
self.close_tape()
|
| 530 |
+
|
| 531 |
+
chain = self.current_chain
|
| 532 |
+
self.current_chain = None
|
| 533 |
+
|
| 534 |
+
return chain
|
| 535 |
+
|
| 536 |
+
|
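

# A minimal sketch (illustrative, not part of the commit) of replaying a
# tracker's tape file; each JSONL row carries "seq", "record", "session_id",
# and "model_id" exactly as _write_to_tape emits them above.
def _example_replay_tape(tape_path) -> List[ProvenanceRecord]:
    records = []
    with open(tape_path, "r", encoding="utf-8") as f:
        for line in f:
            row = json.loads(line)
            records.append(ProvenanceRecord.from_dict(row["record"]))
    return sorted(records, key=lambda r: r.execution_order)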


# =============================================================================
# VERIFICATION & COMPARISON
# =============================================================================

def verify_chain(chain: ProvenanceChain) -> Tuple[bool, Optional[str]]:
    """Verify a provenance chain's integrity."""
    return chain.verify()


def compare_chains(chain_a: ProvenanceChain, chain_b: ProvenanceChain) -> Dict[str, Any]:
    """
    Compare two provenance chains.

    Useful for:
    - Same model, different inputs (where did outputs diverge?)
    - Different models, same input (structural comparison)
    - Same everything (reproducibility check)
    """
    result = {
        "model_match": chain_a.model_hash == chain_b.model_hash,
        "input_match": chain_a.input_hash == chain_b.input_hash,
        "output_match": chain_a.output_hash == chain_b.output_hash,
        "merkle_match": chain_a.merkle_root == chain_b.merkle_root,
        "divergence_points": [],
        "a_only_layers": [],
        "b_only_layers": [],
        "matching_layers": [],
    }

    a_layers = set(chain_a.records.keys())
    b_layers = set(chain_b.records.keys())

    result["a_only_layers"] = list(a_layers - b_layers)
    result["b_only_layers"] = list(b_layers - a_layers)

    # Compare matching layers
    for layer in a_layers & b_layers:
        rec_a = chain_a.records[layer]
        rec_b = chain_b.records[layer]

        if rec_a.state_hash == rec_b.state_hash:
            result["matching_layers"].append(layer)
        else:
            result["divergence_points"].append({
                "layer": layer,
                "hash_a": rec_a.state_hash,
                "hash_b": rec_b.state_hash,
                "stats_a": rec_a.stats,
                "stats_b": rec_b.stats,
            })

    return result


def export_chain_for_audit(chain: ProvenanceChain, filepath: str) -> None:
    """Export chain to file for external audit."""
    with open(filepath, 'w') as f:
        f.write(chain.to_json(indent=2))


def import_chain_for_audit(filepath: str) -> ProvenanceChain:
    """Import chain from audit file."""
    with open(filepath, 'r') as f:
        data = json.load(f)
    return ProvenanceChain.from_dict(data)
cascade/core/web3_bridge.py
ADDED
@@ -0,0 +1,846 @@
"""
CASCADE // WEB3 BRIDGE
Blockchain integration for AI provenance.

The bridge between neural networks and decentralized infrastructure.

┌─────────────────────────────────────────────────────────────────┐
│                      THE IMMUTABLE RECORD                       │
│                                                                 │
│  AI Inference ──► Provenance Chain ──► Merkle Root ──► Chain    │
│                                                      │          │
│                                                      ▼          │
│               ┌─────────────────────────────────┐               │
│               │     ETHEREUM / SOLANA / etc     │               │
│               │  ┌───────────────────────────┐  │               │
│               │  │   Attestation Contract    │  │               │
│               │  │   - Model hash            │  │               │
│               │  │   - Input hash            │  │               │
│               │  │   - Merkle root           │  │               │
│               │  │   - Timestamp             │  │               │
│               │  └───────────────────────────┘  │               │
│               └─────────────────────────────────┘               │
│                               │                                 │
│                               ▼                                 │
│                   IPFS / Arweave / Filecoin                     │
│               (Full provenance chain storage)                   │
└─────────────────────────────────────────────────────────────────┘

Web3 provides:
- Timestamping (block finality)
- Immutability (blockchain consensus)
- Decentralized storage (IPFS)
- Public verifiability (anyone can audit)
- Economic incentives (staking, reputation)

This module provides:
- EIP-712 typed data signatures (Ethereum standard)
- IPFS CID computation (content addressing)
- Smart contract ABI for attestation
- Multi-chain attestation format
- NFT metadata for provenance tokens
"""

import hashlib
import json
import time
import struct
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field, asdict
import base64

try:
    from .provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root
except ImportError:
    from provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root

# =============================================================================
# CONSTANTS
# =============================================================================

# EIP-712 Domain for CASCADE attestations
CASCADE_DOMAIN = {
    "name": "CASCADE Provenance",
    "version": "1",
    "chainId": 1,  # Ethereum mainnet, override for other chains
    "verifyingContract": "0x0000000000000000000000000000000000000000",  # Set on deployment
}

# Attestation type definition for EIP-712
ATTESTATION_TYPES = {
    "Attestation": [
        {"name": "model_hash", "type": "bytes32"},
        {"name": "input_hash", "type": "bytes32"},
        {"name": "merkle_root", "type": "bytes32"},
        {"name": "timestamp", "type": "uint256"},
        {"name": "session_id", "type": "string"},
        {"name": "layer_count", "type": "uint256"},
    ]
}

# =============================================================================
# ATTESTATION RECORD
# =============================================================================

@dataclass
class Web3Attestation:
    """
    Blockchain-ready attestation of AI inference provenance.

    This is the "receipt" that can be posted on-chain.
    Minimal data for on-chain storage, full data on IPFS.
    """

    # Core identity
    model_hash: str    # 32-byte hash of model weights
    input_hash: str    # 32-byte hash of input data
    output_hash: str   # 32-byte hash of output
    merkle_root: str   # Merkle root of provenance chain

    # Metadata
    session_id: str    # Unique session identifier
    timestamp: int     # Unix timestamp
    layer_count: int   # Number of layers in chain

    # Content addressing
    ipfs_cid: Optional[str] = None     # IPFS CID for full chain
    arweave_id: Optional[str] = None   # Arweave transaction ID

    # Signatures (set by wallet)
    signature: Optional[str] = None    # EIP-712 signature
    signer: Optional[str] = None       # Ethereum address

    # Chain info
    chain_id: int = 1                  # 1=Ethereum, 137=Polygon, etc.
    contract_address: Optional[str] = None
    tx_hash: Optional[str] = None      # Transaction hash after posting

    def to_eip712_message(self, domain: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Format as EIP-712 typed data for signing.

        This is the standard Ethereum signing format that wallets understand.
        """
        domain = domain or CASCADE_DOMAIN

        return {
            "types": {
                "EIP712Domain": [
                    {"name": "name", "type": "string"},
                    {"name": "version", "type": "string"},
                    {"name": "chainId", "type": "uint256"},
                    {"name": "verifyingContract", "type": "address"},
                ],
                **ATTESTATION_TYPES
            },
            "primaryType": "Attestation",
            "domain": domain,
            "message": {
                "model_hash": self._to_bytes32(self.model_hash),
                "input_hash": self._to_bytes32(self.input_hash),
                "merkle_root": self._to_bytes32(self.merkle_root),
                "timestamp": self.timestamp,
                "session_id": self.session_id,
                "layer_count": self.layer_count,
            }
        }

    def _to_bytes32(self, hex_str: str) -> str:
        """Pad hash to bytes32 format."""
        # Remove 0x prefix if present
        clean = hex_str.replace("0x", "")
        # Pad to 64 chars (32 bytes)
        padded = clean.zfill(64)
        return "0x" + padded

    def to_contract_args(self) -> Tuple:
        """
        Format for smart contract function call.

        Returns tuple matching:
            function attest(bytes32 modelHash, bytes32 inputHash, bytes32 merkleRoot,
                            string memory sessionId, uint256 layerCount)
        """
        return (
            bytes.fromhex(self.model_hash.replace("0x", "").zfill(64)),
            bytes.fromhex(self.input_hash.replace("0x", "").zfill(64)),
            bytes.fromhex(self.merkle_root.replace("0x", "").zfill(64)),
            self.session_id,
            self.layer_count,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for storage/transmission."""
        return asdict(self)

    def to_json(self) -> str:
        """JSON export."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_chain(cls, chain: ProvenanceChain) -> 'Web3Attestation':
        """Create attestation from provenance chain."""
        if not chain.finalized:
            chain.finalize()

        return cls(
            model_hash=chain.model_hash,
            input_hash=chain.input_hash,
            output_hash=chain.output_hash or "0" * 16,
            merkle_root=chain.merkle_root,
            session_id=chain.session_id,
            timestamp=int(chain.created_at),
            layer_count=len(chain.records),
        )
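
For orientation, a minimal sketch of driving the class above — the hex digests are placeholders, and nothing here touches a chain; the EIP-712 payload produced is what a wallet (or a typed-data library such as eth-account, if you add one) would actually sign:

import json
import time

att = Web3Attestation(
    model_hash="ab" * 32,      # placeholder digest (64 hex chars = 32 bytes)
    input_hash="cd" * 32,      # placeholder digest
    output_hash="ef" * 32,     # placeholder digest
    merkle_root="12" * 32,     # placeholder root
    session_id="demo-session-001",
    timestamp=int(time.time()),
    layer_count=12,
)

typed_data = att.to_eip712_message()
assert typed_data["primaryType"] == "Attestation"
assert typed_data["message"]["model_hash"].startswith("0x")
print(json.dumps(typed_data["domain"], indent=2))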

# =============================================================================
# IPFS CONTENT ADDRESSING
# =============================================================================

def compute_ipfs_cid_v0(data: bytes) -> str:
    """
    Compute IPFS CID v0 (Qm...) for data.

    This is a simplified computation - actual IPFS uses more complex
    chunking for large files. Suitable for JSON chain data.

    CIDv0 format: Base58(0x12 || 0x20 || SHA256(data))
    """
    # SHA-256 hash
    sha_hash = hashlib.sha256(data).digest()

    # Multihash prefix: 0x12 (sha2-256), 0x20 (32 bytes)
    multihash = bytes([0x12, 0x20]) + sha_hash

    # Base58 encode (Bitcoin alphabet)
    return base58_encode(multihash)


def compute_ipfs_cid_v1(data: bytes) -> str:
    """
    Compute IPFS CID v1 (bafy...) for data.

    CIDv1 format: multibase || version || codec || multihash
    """
    # SHA-256 hash
    sha_hash = hashlib.sha256(data).digest()

    # Build CIDv1:
    #   0x01 = CID version 1
    #   0x55 = raw binary codec (could also use 0x71 for dag-cbor)
    #   0x12 = sha2-256
    #   0x20 = 32 bytes
    cid_bytes = bytes([0x01, 0x55, 0x12, 0x20]) + sha_hash

    # Base32 lower with 'b' prefix (multibase)
    b32 = base64.b32encode(cid_bytes).decode('ascii').lower().rstrip('=')
    return 'b' + b32


def base58_encode(data: bytes) -> str:
    """Base58 encoding (Bitcoin alphabet)."""
    ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"

    # Count leading zeros
    leading_zeros = 0
    for byte in data:
        if byte == 0:
            leading_zeros += 1
        else:
            break

    # Convert to integer
    num = int.from_bytes(data, 'big')

    # Convert to base58
    result = ""
    while num > 0:
        num, remainder = divmod(num, 58)
        result = ALPHABET[remainder] + result

    # Add leading '1's for each leading zero byte
    return '1' * leading_zeros + result


def chain_to_ipfs_ready(chain: ProvenanceChain) -> Tuple[bytes, str]:
    """
    Prepare provenance chain for IPFS upload.

    Returns:
        (data_bytes, cid) - The data to upload and its expected CID
    """
    json_data = chain.to_json().encode('utf-8')
    cid = compute_ipfs_cid_v0(json_data)
    return json_data, cid
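
A quick sanity check of the CID helpers: both hash the same payload, and the fixed multihash prefixes pin down the leading characters of each encoding — sha2-256 CIDv0 strings always begin "Qm", and raw-codec sha2-256 CIDv1 strings begin "bafk":

payload = b'{"hello": "cascade"}'

cid_v0 = compute_ipfs_cid_v0(payload)
cid_v1 = compute_ipfs_cid_v1(payload)

# The 0x12 0x20 (v0) and 0x01 0x55 0x12 0x20 (v1) prefixes fix these:
assert cid_v0.startswith("Qm")
assert cid_v1.startswith("bafk")
print(cid_v0, cid_v1)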

# =============================================================================
# SMART CONTRACT ABI
# =============================================================================

CASCADE_ATTESTATION_ABI = [
    {
        "name": "Attested",
        "type": "event",
        "inputs": [
            {"name": "attestationId", "type": "uint256", "indexed": True},
            {"name": "attester", "type": "address", "indexed": True},
            {"name": "modelHash", "type": "bytes32", "indexed": True},
            {"name": "merkleRoot", "type": "bytes32", "indexed": False},
            {"name": "sessionId", "type": "string", "indexed": False},
        ]
    },
    {
        "name": "attest",
        "type": "function",
        "stateMutability": "nonpayable",
        "inputs": [
            {"name": "modelHash", "type": "bytes32"},
            {"name": "inputHash", "type": "bytes32"},
            {"name": "merkleRoot", "type": "bytes32"},
            {"name": "sessionId", "type": "string"},
            {"name": "layerCount", "type": "uint256"},
        ],
        "outputs": [{"name": "attestationId", "type": "uint256"}]
    },
    {
        "name": "verify",
        "type": "function",
        "stateMutability": "view",
        "inputs": [
            {"name": "attestationId", "type": "uint256"},
        ],
        "outputs": [
            {"name": "valid", "type": "bool"},
            {"name": "attester", "type": "address"},
            {"name": "modelHash", "type": "bytes32"},
            {"name": "merkleRoot", "type": "bytes32"},
        ]
    },
    {
        "name": "attestations",
        "type": "function",
        "stateMutability": "view",
        "inputs": [
            {"name": "attestationId", "type": "uint256"},
        ],
        "outputs": [
            {"name": "attester", "type": "address"},
            {"name": "modelHash", "type": "bytes32"},
            {"name": "inputHash", "type": "bytes32"},
            {"name": "merkleRoot", "type": "bytes32"},
            {"name": "sessionId", "type": "string"},
            {"name": "layerCount", "type": "uint256"},
            {"name": "timestamp", "type": "uint256"},
            {"name": "ipfsCid", "type": "string"},
        ]
    },
    {
        "name": "getModelAttestations",
        "type": "function",
        "stateMutability": "view",
        "inputs": [
            {"name": "modelHash", "type": "bytes32"},
        ],
        "outputs": [
            {"name": "attestationIds", "type": "uint256[]"},
        ]
    },
]

# Solidity source for the attestation contract
CASCADE_ATTESTATION_SOLIDITY = '''
// SPDX-License-Identifier: MIT
pragma solidity ^0.8.19;

/**
 * @title CascadeAttestation
 * @notice On-chain attestation of AI inference provenance
 * @dev Stores Merkle roots for off-chain provenance chains
 */
contract CascadeAttestation {

    struct Attestation {
        address attester;
        bytes32 modelHash;
        bytes32 inputHash;
        bytes32 merkleRoot;
        string sessionId;
        uint256 layerCount;
        uint256 timestamp;
        string ipfsCid;  // Optional: full chain on IPFS
    }

    // Attestation storage
    mapping(uint256 => Attestation) public attestations;
    uint256 public attestationCount;

    // Index by model
    mapping(bytes32 => uint256[]) public attestationsByModel;

    // Index by attester
    mapping(address => uint256[]) public attestationsByAttester;

    // Events
    event Attested(
        uint256 indexed attestationId,
        address indexed attester,
        bytes32 indexed modelHash,
        bytes32 merkleRoot,
        string sessionId
    );

    /**
     * @notice Create a new attestation
     * @param modelHash Hash of the model weights
     * @param inputHash Hash of the input data
     * @param merkleRoot Merkle root of the provenance chain
     * @param sessionId Unique session identifier
     * @param layerCount Number of layers in the chain
     * @return attestationId The ID of the new attestation
     */
    function attest(
        bytes32 modelHash,
        bytes32 inputHash,
        bytes32 merkleRoot,
        string memory sessionId,
        uint256 layerCount
    ) public returns (uint256 attestationId) {
        attestationId = attestationCount++;

        attestations[attestationId] = Attestation({
            attester: msg.sender,
            modelHash: modelHash,
            inputHash: inputHash,
            merkleRoot: merkleRoot,
            sessionId: sessionId,
            layerCount: layerCount,
            timestamp: block.timestamp,
            ipfsCid: ""
        });

        attestationsByModel[modelHash].push(attestationId);
        attestationsByAttester[msg.sender].push(attestationId);

        emit Attested(attestationId, msg.sender, modelHash, merkleRoot, sessionId);

        return attestationId;
    }

    /**
     * @notice Attest with IPFS CID for full chain data
     * @dev Calls attest() internally so msg.sender is preserved as the
     *      attester (an external self-call via `this.attest(...)` would
     *      record the contract itself as the attester).
     */
    function attestWithIPFS(
        bytes32 modelHash,
        bytes32 inputHash,
        bytes32 merkleRoot,
        string memory sessionId,
        uint256 layerCount,
        string memory ipfsCid
    ) external returns (uint256 attestationId) {
        attestationId = attest(modelHash, inputHash, merkleRoot, sessionId, layerCount);
        attestations[attestationId].ipfsCid = ipfsCid;
        return attestationId;
    }

    /**
     * @notice Verify an attestation exists and return core data
     */
    function verify(uint256 attestationId) external view returns (
        bool valid,
        address attester,
        bytes32 modelHash,
        bytes32 merkleRoot
    ) {
        if (attestationId >= attestationCount) {
            return (false, address(0), bytes32(0), bytes32(0));
        }

        Attestation storage a = attestations[attestationId];
        return (true, a.attester, a.modelHash, a.merkleRoot);
    }

    /**
     * @notice Get all attestations for a model
     */
    function getModelAttestations(bytes32 modelHash) external view returns (uint256[] memory) {
        return attestationsByModel[modelHash];
    }

    /**
     * @notice Get all attestations by an address
     */
    function getAttesterAttestations(address attester) external view returns (uint256[] memory) {
        return attestationsByAttester[attester];
    }
}
'''
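
To actually post an attestation, a web3.py sketch along these lines should work — hedged: it assumes web3.py v6 naming (`build_transaction`, `rawTransaction`), a deployed contract, a funded key, and a reachable RPC endpoint; every literal below is a placeholder:

from web3 import Web3

w3 = Web3(Web3.HTTPProvider("https://rpc.example.org"))     # placeholder RPC URL
contract = w3.eth.contract(
    address="0x0000000000000000000000000000000000000000",   # placeholder: your deployed address
    abi=CASCADE_ATTESTATION_ABI,
)
acct = w3.eth.account.from_key("0x" + "11" * 32)            # placeholder key - never hardcode real keys

# `attestation` is a Web3Attestation, e.g. from Web3Attestation.from_chain(chain)
args = attestation.to_contract_args()  # (modelHash, inputHash, merkleRoot, sessionId, layerCount)
tx = contract.functions.attest(*args).build_transaction({
    "from": acct.address,
    "nonce": w3.eth.get_transaction_count(acct.address),
})
signed = acct.sign_transaction(tx)
tx_hash = w3.eth.send_raw_transaction(signed.rawTransaction)
attestation.tx_hash = tx_hash.hex()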

# =============================================================================
# NFT METADATA (for provenance tokens)
# =============================================================================

def generate_nft_metadata(chain: ProvenanceChain,
                          image_url: Optional[str] = None,
                          animation_url: Optional[str] = None) -> Dict[str, Any]:
    """
    Generate ERC-721 compatible metadata for a provenance NFT.

    Each unique model×input×output combination could be an NFT,
    proving that this specific inference happened.
    """
    if not chain.finalized:
        chain.finalize()

    # Generate attributes from chain
    attributes = [
        {"trait_type": "Model Hash", "value": chain.model_hash[:16]},
        {"trait_type": "Input Hash", "value": chain.input_hash},
        {"trait_type": "Merkle Root", "value": chain.merkle_root},
        {"trait_type": "Layer Count", "value": len(chain.records)},
        {"trait_type": "Timestamp", "value": int(chain.created_at)},
    ]

    # Add layer statistics as traits
    if chain.records:
        total_params = 0
        layer_types = set()
        for record in chain.records.values():
            if record.params_hash != "no_params":
                total_params += 1
            # Extract layer type from name
            parts = record.layer_name.split('.')
            if len(parts) >= 2:
                layer_types.add(parts[-1])

        attributes.append({"trait_type": "Parameterized Layers", "value": total_params})
        for lt in list(layer_types)[:5]:  # Max 5 layer types
            attributes.append({"trait_type": f"Has {lt}", "value": "Yes"})

    return {
        "name": f"CASCADE Provenance #{chain.session_id}",
        "description": f"Cryptographic proof of AI inference. Model: {chain.model_id}. "
                       f"This NFT attests that a specific input was processed through "
                       f"the model, producing a verifiable Merkle root of all layer activations.",
        "image": image_url or "ipfs://QmDefaultCascadeImage",  # Placeholder
        "animation_url": animation_url,  # Could link to 3D visualization
        "external_url": f"https://cascade.ai/verify/{chain.session_id}",
        "attributes": attributes,
        "properties": {
            "model_id": chain.model_id,
            "model_hash": chain.model_hash,
            "input_hash": chain.input_hash,
            "output_hash": chain.output_hash,
            "merkle_root": chain.merkle_root,
            "session_id": chain.session_id,
            "layer_count": len(chain.records),
            "created_at": chain.created_at,
        }
    }
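
The metadata is plain ERC-721 JSON, so the natural flow is to pin it to IPFS and point the token URI at the resulting CID. A sketch, assuming a finalized ProvenanceChain named `chain` (compute_ipfs_cid_v0 only predicts the CID — actually pinning the bytes is up to your IPFS client):

import json

metadata = generate_nft_metadata(chain, image_url="ipfs://QmYourImageCid")  # placeholder image CID
metadata_bytes = json.dumps(metadata, sort_keys=True).encode("utf-8")

metadata_cid = compute_ipfs_cid_v0(metadata_bytes)
token_uri = f"ipfs://{metadata_cid}"   # usable once metadata_bytes is actually pinned
print(token_uri)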

# =============================================================================
# MULTI-CHAIN SUPPORT
# =============================================================================

CHAIN_CONFIGS = {
    "ethereum": {
        "chain_id": 1,
        "name": "Ethereum Mainnet",
        "explorer": "https://etherscan.io",
        "native_token": "ETH",
    },
    "polygon": {
        "chain_id": 137,
        "name": "Polygon",
        "explorer": "https://polygonscan.com",
        "native_token": "MATIC",
    },
    "arbitrum": {
        "chain_id": 42161,
        "name": "Arbitrum One",
        "explorer": "https://arbiscan.io",
        "native_token": "ETH",
    },
    "optimism": {
        "chain_id": 10,
        "name": "Optimism",
        "explorer": "https://optimistic.etherscan.io",
        "native_token": "ETH",
    },
    "base": {
        "chain_id": 8453,
        "name": "Base",
        "explorer": "https://basescan.org",
        "native_token": "ETH",
    },
    "solana": {
        "chain_id": -1,  # Not EVM
        "name": "Solana",
        "explorer": "https://solscan.io",
        "native_token": "SOL",
    },
}


def get_chain_config(chain_name: str) -> Dict[str, Any]:
    """Get configuration for a specific blockchain."""
    return CHAIN_CONFIGS.get(chain_name.lower(), CHAIN_CONFIGS["ethereum"])

# =============================================================================
# WEB3 EXPORT UTILITIES
# =============================================================================

def export_for_web3(chain: ProvenanceChain,
                    chain_name: str = "ethereum",
                    include_full_chain: bool = True) -> Dict[str, Any]:
    """
    Export provenance chain in Web3-ready format.

    Returns everything needed to post attestation on-chain.
    """
    attestation = Web3Attestation.from_chain(chain)
    chain_config = get_chain_config(chain_name)

    result = {
        "attestation": attestation.to_dict(),
        "eip712": attestation.to_eip712_message({
            **CASCADE_DOMAIN,
            "chainId": chain_config["chain_id"]
        }),
        "contract_abi": CASCADE_ATTESTATION_ABI,
        "chain_config": chain_config,
    }

    if include_full_chain:
        data, cid = chain_to_ipfs_ready(chain)
        result["ipfs"] = {
            "data": base64.b64encode(data).decode('ascii'),
            "cid": cid,
            "size_bytes": len(data),
        }

    return result


def generate_verification_page(attestation: Web3Attestation,
                               chain: Optional[ProvenanceChain] = None) -> str:
    """
    Generate an HTML verification page for an attestation.

    This can be hosted anywhere and allows public verification.
    """
    records_html = ""
    if chain:
        for record in chain.records.values():
            records_html += f"""
            <tr>
                <td>{record.layer_name}</td>
                <td><code>{record.state_hash}</code></td>
                <td>{record.shape}</td>
                <td>{record.stats.get('mean', 0):.4f}</td>
            </tr>
            """

    return f"""<!DOCTYPE html>
<html>
<head>
    <title>CASCADE Provenance Verification</title>
    <meta charset="utf-8">
    <style>
        body {{ font-family: 'Courier New', monospace; background: #0a0a0a; color: #00ff88; padding: 40px; }}
        .container {{ max-width: 900px; margin: 0 auto; }}
        h1 {{ color: #00ffcc; border-bottom: 2px solid #00ff88; padding-bottom: 10px; }}
        .hash {{ font-family: monospace; background: #1a1a2e; padding: 10px; border-radius: 4px; word-break: break-all; }}
        .verified {{ color: #00ff88; }}
        .label {{ color: #888; font-size: 0.9em; }}
        table {{ width: 100%; border-collapse: collapse; margin-top: 20px; }}
        th, td {{ padding: 8px; border: 1px solid #333; text-align: left; }}
        th {{ background: #1a1a2e; }}
        code {{ background: #1a1a2e; padding: 2px 6px; border-radius: 3px; }}
        .merkle {{ font-size: 1.5em; color: #ffcc00; text-align: center; padding: 20px; background: #1a1a2e; border-radius: 8px; margin: 20px 0; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔗 CASCADE Provenance Verification</h1>

        <div class="merkle">
            Merkle Root: <code>{attestation.merkle_root}</code>
        </div>

        <h2>Attestation Details</h2>
        <p class="label">Session ID</p>
        <div class="hash">{attestation.session_id}</div>

        <p class="label">Model Hash</p>
        <div class="hash">{attestation.model_hash}</div>

        <p class="label">Input Hash</p>
        <div class="hash">{attestation.input_hash}</div>

        <p class="label">Output Hash</p>
        <div class="hash">{attestation.output_hash}</div>

        <p class="label">Timestamp</p>
        <div class="hash">{attestation.timestamp} ({time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(attestation.timestamp))})</div>

        <p class="label">Layer Count</p>
        <div class="hash">{attestation.layer_count} layers</div>

        {"<h2>Provenance Chain</h2><table><tr><th>Layer</th><th>State Hash</th><th>Shape</th><th>Mean</th></tr>" + records_html + "</table>" if chain else ""}

        <h2>On-Chain Verification</h2>
        <p>{"<span class='verified'>✓ Verified on " + get_chain_config('ethereum')['name'] + "</span>" if attestation.tx_hash else "⏳ Pending on-chain attestation"}</p>
        {f"<p class='label'>Transaction</p><div class='hash'><a href='{get_chain_config('ethereum')['explorer']}/tx/{attestation.tx_hash}' style='color: #00ff88;'>{attestation.tx_hash}</a></div>" if attestation.tx_hash else ""}

        <h2>IPFS Storage</h2>
        <p>{f"<a href='https://ipfs.io/ipfs/{attestation.ipfs_cid}' style='color: #00ff88;'>{attestation.ipfs_cid}</a>" if attestation.ipfs_cid else "Full chain not yet pinned to IPFS"}</p>

        <hr style="border-color: #333; margin: 40px 0;">
        <p style="color: #666; text-align: center;">CASCADE Provenance Engine • Due process infrastructure for AI</p>
    </div>
</body>
</html>
"""
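
Putting the export path together — a minimal sketch, again assuming a finalized ProvenanceChain named `chain`:

bundle = export_for_web3(chain, chain_name="polygon")
print(bundle["chain_config"]["name"], bundle["ipfs"]["cid"])

attestation = Web3Attestation.from_chain(chain)
attestation.ipfs_cid = bundle["ipfs"]["cid"]
with open("verify.html", "w", encoding="utf-8") as f:
    f.write(generate_verification_page(attestation, chain))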

# =============================================================================
# SIGNATURE UTILITIES (for wallet integration)
# =============================================================================

def prepare_for_signing(attestation: Web3Attestation,
                        chain_name: str = "ethereum") -> Dict[str, Any]:
    """
    Prepare attestation for wallet signing (MetaMask, etc).

    Returns the EIP-712 message that wallets can sign.
    """
    chain_config = get_chain_config(chain_name)

    eip712 = attestation.to_eip712_message({
        **CASCADE_DOMAIN,
        "chainId": chain_config["chain_id"]
    })

    return {
        "method": "eth_signTypedData_v4",
        "params": [
            None,  # Address filled by wallet
            json.dumps(eip712)
        ],
        "display": {
            "title": "Sign CASCADE Attestation",
            "description": f"Attest that model {attestation.model_hash[:16]}... "
                           f"processed input {attestation.input_hash[:16]}...",
            "merkle_root": attestation.merkle_root,
        }
    }


def verify_signature(attestation: Web3Attestation,
                     signature: str,
                     expected_signer: str) -> Tuple[bool, str]:
    """
    Verify an EIP-712 signature.

    Note: Full verification requires eth_utils/web3.py.
    This is a structural check only.
    """
    if not signature or len(signature) < 130:
        return False, "Invalid signature length"

    if not signature.startswith("0x"):
        return False, "Signature must start with 0x"

    # Extract r, s, v components
    try:
        sig_bytes = bytes.fromhex(signature[2:])
        if len(sig_bytes) != 65:
            return False, f"Signature must be 65 bytes, got {len(sig_bytes)}"

        r = sig_bytes[:32]
        s = sig_bytes[32:64]
        v = sig_bytes[64]

        # v should be 27 or 28 (or 0/1 for some implementations)
        if v not in [0, 1, 27, 28]:
            return False, f"Invalid v value: {v}"

        # Structural validation passed
        # Full cryptographic verification requires ecrecover
        return True, "Signature structure valid (full verification requires web3.py)"

    except Exception as e:
        return False, f"Signature parsing error: {str(e)}"
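
The structural check can be exercised without any chain access, reusing the `att` attestation from the earlier sketch — a well-formed 65-byte signature passes, a truncated one does not (the value below is an all-zeros placeholder with v = 27, not a real signature):

fake_sig = "0x" + "00" * 64 + "1b"   # r = 0, s = 0, v = 27 (0x1b) - placeholder only
ok, msg = verify_signature(att, fake_sig, expected_signer="0x" + "00" * 20)
assert ok, msg

bad, msg = verify_signature(att, "0x1234", expected_signer="0x" + "00" * 20)
assert not bad and "length" in msg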

# =============================================================================
# CONVENIENCE FUNCTIONS
# =============================================================================

def attest_inference(chain: ProvenanceChain,
                     chain_name: str = "ethereum") -> Web3Attestation:
    """
    One-liner to create attestation from provenance chain.

    Usage:
        attestation = attest_inference(chain)
        print(attestation.merkle_root)
    """
    if not chain.finalized:
        chain.finalize()

    attestation = Web3Attestation.from_chain(chain)

    # Compute IPFS CID
    data, cid = chain_to_ipfs_ready(chain)
    attestation.ipfs_cid = cid

    # Set chain
    attestation.chain_id = get_chain_config(chain_name)["chain_id"]

    return attestation


def quick_verify(merkle_root: str, layer_hashes: List[str]) -> bool:
    """
    Quick verification that layer hashes produce expected Merkle root.
    """
    computed = compute_merkle_root(layer_hashes)
    return computed == merkle_root
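
And the quick_verify round trip — recompute the root from the leaf hashes and compare; only the list-of-hex-strings-in, hex-string-out shape of compute_merkle_root (from cascade.core.provenance) is assumed here:

layer_hashes = [hashlib.sha256(f"layer-{i}".encode()).hexdigest() for i in range(4)]
root = compute_merkle_root(layer_hashes)

assert quick_verify(root, layer_hashes)   # matching leaves reproduce the root
print("root:", root)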

# =============================================================================
# COMMAND LINE INTERFACE
# =============================================================================

if __name__ == "__main__":
    print("CASCADE // WEB3 BRIDGE")
    print("=" * 50)
    print()
    print("Smart Contract (Solidity):")
    print("-" * 50)
    print(CASCADE_ATTESTATION_SOLIDITY[:500] + "...")
    print()
    print("Contract ABI:")
    print("-" * 50)
    print(json.dumps(CASCADE_ATTESTATION_ABI, indent=2)[:500] + "...")
    print()
    print("Supported Chains:")
    print("-" * 50)
    for name, config in CHAIN_CONFIGS.items():
        print(f"  {name}: Chain ID {config['chain_id']}")
    print()
    print("Usage:")
    print("  from cascade.core.web3_bridge import attest_inference, export_for_web3")
    print("  attestation = attest_inference(provenance_chain)")
    print("  web3_data = export_for_web3(provenance_chain, 'polygon')")
cascade/data/__init__.py
ADDED
@@ -0,0 +1,112 @@
"""
CASCADE Data Observatory

Dataset observation with the same rigor as model observation.
Tracks provenance, schema, lineage using W3C PROV-O standard.
"""

from .entities import (
    DatasetEntity,
    Activity,
    Agent,
    Relationship,
    RelationType,
    ActivityType,
    AgentType,
    create_system_agent,
    create_model_agent,
    create_user_agent,
)
from .observer import DatasetObserver, ObservationContext
from .provenance import ProvenanceGraph
from .schema import SchemaObserver, DatasetSchema, FieldSchema, hash_content
from .croissant import CroissantExporter, export_to_croissant
from .hub import HubIntegration, AccountabilityBundle, push_to_hub, pull_from_hub
from .license import (
    SPDXLicense,
    LicenseCategory,
    LicenseRestriction,
    LicenseCompatibility,
    LicenseAnalyzer,
    SPDX_LICENSES,
    get_license,
    check_license_compatibility,
    get_derived_license,
)
from .pii import (
    PIIType,
    PIISeverity,
    PIIMatch,
    PIIScanResult,
    PIIScanner,
    scan_for_pii,
    quick_pii_check,
)
from .live import (
    LiveDocumentTracer,
    TraceEvent,
    TraceEventType,
    DocumentSpan,
    DocumentAssociation,
    ConsoleTraceRenderer,
    create_live_tracer,
)

__all__ = [
    # Entities (PROV-O)
    "DatasetEntity",
    "Activity",
    "Agent",
    "Relationship",
    "RelationType",
    "ActivityType",
    "AgentType",
    "create_system_agent",
    "create_model_agent",
    "create_user_agent",
    # Observer
    "DatasetObserver",
    "ObservationContext",
    # Provenance
    "ProvenanceGraph",
    # Schema
    "SchemaObserver",
    "DatasetSchema",
    "FieldSchema",
    "hash_content",
    # Export
    "CroissantExporter",
    "export_to_croissant",
    # Accountability
    "AccountabilityBundle",
    # Hub
    "HubIntegration",
    "push_to_hub",
    "pull_from_hub",
    # License
    "SPDXLicense",
    "LicenseCategory",
    "LicenseRestriction",
    "LicenseCompatibility",
    "LicenseAnalyzer",
    "SPDX_LICENSES",
    "get_license",
    "check_license_compatibility",
    "get_derived_license",
    # PII Detection
    "PIIType",
    "PIISeverity",
    "PIIMatch",
    "PIIScanResult",
    "PIIScanner",
    "scan_for_pii",
    "quick_pii_check",
    # Live Document Tracing
    "LiveDocumentTracer",
    "TraceEvent",
    "TraceEventType",
    "DocumentSpan",
    "DocumentAssociation",
    "ConsoleTraceRenderer",
    "create_live_tracer",
]
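
Everything above is re-exported flat at the package root, so downstream imports stay short; for example:

from cascade.data import (
    DatasetObserver,
    ProvenanceGraph,
    export_to_croissant,
    scan_for_pii,
)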
cascade/data/croissant.py
ADDED
@@ -0,0 +1,289 @@
"""
Croissant Exporter

Exports provenance graph to MLCommons Croissant format.
Croissant is the emerging standard for ML dataset metadata.

Reference: https://github.com/mlcommons/croissant
"""

import json
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

from .entities import DatasetEntity, Activity, Agent
from .provenance import ProvenanceGraph


class CroissantExporter:
    """
    Export provenance to Croissant JSON-LD format.

    Croissant layers:
      1. Metadata - description, license, citation
      2. Resources - file descriptions
      3. Structure - record sets and fields
      4. ML Semantics - task types, splits

    We add provenance as an extension.
    """

    CROISSANT_VERSION = "1.0"
    CROISSANT_CONTEXT = "http://mlcommons.org/croissant/1.0"

    def __init__(self, graph: ProvenanceGraph):
        self.graph = graph

    def export(
        self,
        name: str = None,
        description: str = None,
        license_url: str = None,
        citation: str = None,
        url: str = None,
        include_provenance: bool = True,
    ) -> Dict[str, Any]:
        """
        Export to Croissant JSON-LD.

        Args:
            name: Dataset name (defaults to graph name)
            description: Dataset description
            license_url: License URL
            citation: Citation text
            url: Dataset URL
            include_provenance: Whether to include CASCADE provenance extension

        Returns:
            Croissant JSON-LD document
        """
        name = name or self.graph.name

        doc = {
            "@context": {
                "@vocab": "http://schema.org/",
                "sc": "http://schema.org/",
                "cr": "http://mlcommons.org/croissant/",
                "rai": "http://mlcommons.org/croissant/RAI/",
                "spdx": "http://spdx.org/rdf/terms#",
            },
            "@type": "sc:Dataset",
            "name": name,
            "conformsTo": self.CROISSANT_CONTEXT,
            "dateCreated": datetime.fromtimestamp(self.graph.created_at).isoformat(),
            "dateModified": datetime.now().isoformat(),
        }

        if description:
            doc["description"] = description
        if license_url:
            doc["license"] = license_url
        if citation:
            doc["citation"] = citation
        if url:
            doc["url"] = url

        # Add distributions (file objects)
        doc["distribution"] = self._build_distributions()

        # Add record sets
        doc["recordSet"] = self._build_record_sets()

        # Add provenance extension
        if include_provenance:
            doc["cr:provenance"] = self._build_provenance_extension()

        return doc

    def _build_distributions(self) -> List[Dict[str, Any]]:
        """Build distribution (FileObject) entries."""
        distributions = []

        for entity in self.graph.list_entities():
            dist = {
                "@type": "cr:FileObject",
                "@id": entity.id,
                "name": entity.name,
            }

            if entity.source_uri:
                dist["contentUrl"] = entity.source_uri

            if entity.content_hash:
                dist["sha256"] = entity.content_hash

            # License information (SPDX)
            if entity.license_id:
                dist["spdx:license"] = entity.license_id
                if entity.license_url:
                    dist["sc:license"] = entity.license_url
                else:
                    # Auto-generate SPDX license URL
                    dist["sc:license"] = f"https://spdx.org/licenses/{entity.license_id}.html"

            # Infer encoding format from source type
            format_map = {
                "hf_dataset": "application/x-arrow",
                "hf_hub": "application/x-arrow",
                "parquet": "application/x-parquet",
                "csv": "text/csv",
                "json": "application/json",
                "jsonl": "application/x-jsonlines",
            }
            if entity.source_type in format_map:
                dist["encodingFormat"] = format_map[entity.source_type]

            if entity.size_bytes:
                dist["contentSize"] = f"{entity.size_bytes} bytes"

            distributions.append(dist)

        return distributions

    def _build_record_sets(self) -> List[Dict[str, Any]]:
        """Build RecordSet entries from entity schemas."""
        record_sets = []

        for entity in self.graph.list_entities():
            schema = entity.attributes.get("schema")
            if not schema:
                continue

            fields = []
            for field_name, field_info in schema.get("fields", {}).items():
                field_entry = {
                    "@type": "cr:Field",
                    "name": field_name,
                    "dataType": self._map_dtype_to_croissant(field_info.get("dtype", "string")),
                }

                if field_info.get("description"):
                    field_entry["description"] = field_info["description"]

                # Source reference
                field_entry["source"] = {
                    "fileObject": {"@id": entity.id},
                    "extract": {"column": field_name},
                }

                fields.append(field_entry)

            if fields:
                record_set = {
                    "@type": "cr:RecordSet",
                    "@id": f"recordset_{entity.id}",
                    "name": f"{entity.name}_records",
                    "field": fields,
                }

                if entity.record_count:
                    record_set["cr:recordCount"] = entity.record_count

                record_sets.append(record_set)

        return record_sets

    def _map_dtype_to_croissant(self, dtype: str) -> str:
        """Map internal dtype to Croissant/schema.org type."""
        type_map = {
            "string": "sc:Text",
            "int8": "sc:Integer",
            "int16": "sc:Integer",
            "int32": "sc:Integer",
            "int64": "sc:Integer",
            "uint8": "sc:Integer",
            "uint16": "sc:Integer",
            "uint32": "sc:Integer",
            "uint64": "sc:Integer",
            "float16": "sc:Float",
            "float32": "sc:Float",
            "float64": "sc:Float",
            "bool": "sc:Boolean",
            "binary": "sc:Text",        # Base64 encoded
            "image": "sc:ImageObject",
            "audio": "sc:AudioObject",
            "categorical": "sc:Text",   # With enumeration
            "list": "sc:ItemList",
            "struct": "sc:StructuredValue",
        }
        return type_map.get(dtype, "sc:Text")

    def _build_provenance_extension(self) -> Dict[str, Any]:
        """Build CASCADE provenance extension."""
        return {
            "@type": "cascade:ProvenanceGraph",
            "cascade:rootHash": self.graph.root_hash,
            "cascade:createdAt": datetime.fromtimestamp(self.graph.created_at).isoformat(),

            # Entities with lineage
            "cascade:entities": [
                {
                    "@id": e.id,
                    "cascade:name": e.name,
                    "cascade:contentHash": e.content_hash,
                    "cascade:schemaHash": e.schema_hash,
                    "cascade:version": e.version,
                    "cascade:recordCount": e.record_count,
                    "cascade:derivedFrom": self.graph.get_lineage(e.id, "upstream"),
                }
                for e in self.graph.list_entities()
            ],

            # Activities
            "cascade:activities": [
                {
                    "@id": a.id,
                    "cascade:type": a.activity_type.value,
                    "cascade:name": a.name,
                    "cascade:startedAt": datetime.fromtimestamp(a.started_at).isoformat() if a.started_at else None,
                    "cascade:endedAt": datetime.fromtimestamp(a.ended_at).isoformat() if a.ended_at else None,
                    "cascade:inputs": a.inputs,
                    "cascade:outputs": a.outputs,
                    "cascade:parameters": a.parameters,
                }
                for a in self.graph.list_activities()
            ],

            # Agents
            "cascade:agents": [
                {
                    "@id": a.id,
                    "cascade:type": a.agent_type.value,
                    "cascade:name": a.name,
                    "cascade:version": a.version,
                }
                for a in self.graph.list_agents()
            ],
        }

    def to_json(self, **kwargs) -> str:
        """Export to JSON string."""
        return json.dumps(self.export(**kwargs), indent=2, default=str)

    def save(self, path: str, **kwargs):
        """Save to file."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(self.to_json(**kwargs))


def export_to_croissant(
    graph: ProvenanceGraph,
    name: str = None,
    description: str = None,
    **kwargs,
) -> Dict[str, Any]:
    """
    Convenience function to export provenance to Croissant.

    Args:
        graph: The provenance graph to export
        name: Dataset name
        description: Dataset description
        **kwargs: Additional export options

    Returns:
        Croissant JSON-LD document
    """
    exporter = CroissantExporter(graph)
    return exporter.export(name=name, description=description, **kwargs)
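
A minimal export sketch, assuming a ProvenanceGraph named `graph` already populated by a DatasetObserver run (graph construction lives in cascade/data/provenance.py, outside this excerpt):

import json

doc = export_to_croissant(
    graph,
    name="my-dataset",
    description="Training corpus with tracked lineage",
    license_url="https://spdx.org/licenses/CC-BY-4.0.html",
)

# The result is plain JSON-LD; persist it next to the dataset card.
with open("croissant.json", "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2, default=str)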
cascade/data/entities.py
ADDED
@@ -0,0 +1,349 @@
| 1 |
+
"""
|
| 2 |
+
PROV-O Entities for Dataset Observation
|
| 3 |
+
|
| 4 |
+
W3C PROV Data Model:
|
| 5 |
+
- Entity: A physical, digital, or conceptual thing (the dataset)
|
| 6 |
+
- Activity: Something that occurs over time and acts upon entities
|
| 7 |
+
- Agent: Something that bears responsibility for an activity
|
| 8 |
+
|
| 9 |
+
Relationships:
|
| 10 |
+
- wasGeneratedBy: Entity → Activity
|
| 11 |
+
- wasDerivedFrom: Entity → Entity
|
| 12 |
+
- wasAttributedTo: Entity → Agent
|
| 13 |
+
- used: Activity → Entity
|
| 14 |
+
- wasAssociatedWith: Activity → Agent
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import hashlib
|
| 18 |
+
import json
|
| 19 |
+
import time
|
| 20 |
+
from dataclasses import dataclass, field
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from enum import Enum
|
| 23 |
+
from typing import Any, Dict, List, Optional, Union
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class RelationType(Enum):
|
| 27 |
+
"""W3C PROV-O relationship types."""
|
| 28 |
+
# Entity relationships
|
| 29 |
+
WAS_GENERATED_BY = "wasGeneratedBy" # Entity → Activity
|
| 30 |
+
WAS_DERIVED_FROM = "wasDerivedFrom" # Entity → Entity
|
| 31 |
+
WAS_ATTRIBUTED_TO = "wasAttributedTo" # Entity → Agent
|
| 32 |
+
WAS_REVISION_OF = "wasRevisionOf" # Entity → Entity (versioning)
|
| 33 |
+
HAD_PRIMARY_SOURCE = "hadPrimarySource" # Entity → Entity
|
| 34 |
+
|
| 35 |
+
# Activity relationships
|
| 36 |
+
USED = "used" # Activity → Entity
|
| 37 |
+
WAS_ASSOCIATED_WITH = "wasAssociatedWith" # Activity → Agent
|
| 38 |
+
WAS_INFORMED_BY = "wasInformedBy" # Activity → Activity
|
| 39 |
+
WAS_STARTED_BY = "wasStartedBy" # Activity → Entity
|
| 40 |
+
WAS_ENDED_BY = "wasEndedBy" # Activity → Entity
|
| 41 |
+
|
| 42 |
+
# Agent relationships
|
| 43 |
+
ACTED_ON_BEHALF_OF = "actedOnBehalfOf" # Agent → Agent
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
|
| 47 |
+
class Relationship:
|
| 48 |
+
"""A provenance relationship between two nodes."""
|
| 49 |
+
relation_type: RelationType
|
| 50 |
+
source_id: str
|
| 51 |
+
target_id: str
|
| 52 |
+
timestamp: float = field(default_factory=time.time)
|
| 53 |
+
attributes: Dict[str, Any] = field(default_factory=dict)
|
| 54 |
+
|
| 55 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 56 |
+
return {
|
| 57 |
+
"type": self.relation_type.value,
|
| 58 |
+
"source": self.source_id,
|
| 59 |
+
"target": self.target_id,
|
| 60 |
+
"timestamp": self.timestamp,
|
| 61 |
+
"attributes": self.attributes,
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
def to_prov_n(self) -> str:
|
| 65 |
+
"""Export as PROV-N notation."""
|
| 66 |
+
return f"{self.relation_type.value}({self.source_id}, {self.target_id})"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
class DatasetEntity:
    """
    A dataset entity in the provenance graph.

    Corresponds to prov:Entity - any physical, digital, or conceptual thing.
    In our case: a dataset, a version of a dataset, or a split.
    """
    id: str
    name: str

    # Content identification
    content_hash: Optional[str] = None   # SHA-256 of data content
    schema_hash: Optional[str] = None    # SHA-256 of schema/features

    # Versioning
    version: Optional[str] = None
    previous_version: Optional[str] = None

    # Source
    source_type: str = "unknown"         # hf_hub, local, s3, gcs, etc.
    source_uri: Optional[str] = None

    # License (SPDX identifier)
    license_id: Optional[str] = None     # e.g., "MIT", "CC-BY-4.0", "Apache-2.0"
    license_url: Optional[str] = None    # URL to license text

    # Statistics
    record_count: Optional[int] = None
    size_bytes: Optional[int] = None
    splits: Dict[str, int] = field(default_factory=dict)  # split_name → count

    # Metadata
    attributes: Dict[str, Any] = field(default_factory=dict)

    # Timestamps
    created_at: float = field(default_factory=time.time)

    def __post_init__(self):
        """Generate ID if not provided."""
        if not self.id:
            self.id = f"entity:{self.name}:{int(self.created_at * 1000)}"

    def compute_hash(self) -> str:
        """Compute entity hash from content."""
        content = json.dumps({
            "id": self.id,
            "name": self.name,
            "content_hash": self.content_hash,
            "schema_hash": self.schema_hash,
            "version": self.version,
            "record_count": self.record_count,
        }, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()

    def to_dict(self) -> Dict[str, Any]:
        return {
            "@type": "prov:Entity",
            "@id": self.id,
            "name": self.name,
            "content_hash": self.content_hash,
            "schema_hash": self.schema_hash,
            "version": self.version,
            "previous_version": self.previous_version,
            "source_type": self.source_type,
            "source_uri": self.source_uri,
            "license_id": self.license_id,
            "license_url": self.license_url,
            "record_count": self.record_count,
            "size_bytes": self.size_bytes,
            "splits": self.splits,
            "attributes": self.attributes,
            "created_at": self.created_at,
        }

    def to_prov_n(self) -> str:
        """Export as PROV-N notation."""
        attrs = ", ".join([
            f'prov:label="{self.name}"',
            f'cascade:contentHash="{self.content_hash or "unknown"}"',
            f'cascade:recordCount="{self.record_count or 0}"',
            f'cascade:license="{self.license_id or "unknown"}"',
        ])
        return f"entity({self.id}, [{attrs}])"

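
# Illustrative usage (editorial sketch, not part of the committed module; the
# dataset name and hash values below are made up): compute_hash() folds the
# id, name, content/schema hashes, version, and record count into one
# SHA-256 fingerprint for integrity checks.
_example_entity = DatasetEntity(
    id="",                    # empty id → __post_init__ generates "entity:<name>:<ms>"
    name="imdb-train",
    content_hash="abc123",
    version="1.0.0",
    record_count=25000,
    license_id="CC-BY-4.0",
)
assert len(_example_entity.compute_hash()) == 64  # SHA-256 hex digest
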
class ActivityType(Enum):
    """Types of dataset activities."""
    INGEST = "ingest"              # Load from source
    TRANSFORM = "transform"        # Filter, map, join, etc.
    SPLIT = "split"                # Train/test/val split
    AUGMENT = "augment"            # Data augmentation
    CLEAN = "clean"                # Cleaning/preprocessing
    MERGE = "merge"                # Combining datasets
    SAMPLE = "sample"              # Sampling/subsetting
    EXPORT = "export"              # Export to format
    TRAIN = "train"                # Model training (consumption)
    EVALUATE = "evaluate"          # Model evaluation
    INFERENCE = "inference"        # Model inference
    ENTITY_RESOLUTION = "entity_resolution"  # Data Unity matching

@dataclass
class Activity:
    """
    An activity in the provenance graph.

    Corresponds to prov:Activity - something that occurs over time
    and acts upon or with entities.
    """
    id: str
    activity_type: ActivityType
    name: str

    # Timing
    started_at: Optional[float] = None
    ended_at: Optional[float] = None

    # Input/Output tracking
    inputs: List[str] = field(default_factory=list)   # Entity IDs
    outputs: List[str] = field(default_factory=list)  # Entity IDs

    # Agent who performed this
    agent_id: Optional[str] = None

    # Parameters/configuration used
    parameters: Dict[str, Any] = field(default_factory=dict)

    # Metadata
    attributes: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        if not self.id:
            self.id = f"activity:{self.activity_type.value}:{int(time.time() * 1000)}"
        if self.started_at is None:
            self.started_at = time.time()

    def start(self):
        """Mark activity as started."""
        self.started_at = time.time()

    def end(self):
        """Mark activity as ended."""
        self.ended_at = time.time()

    @property
    def duration(self) -> Optional[float]:
        """Duration in seconds."""
        if self.started_at and self.ended_at:
            return self.ended_at - self.started_at
        return None

    def add_input(self, entity_id: str):
        """Record an input entity."""
        if entity_id not in self.inputs:
            self.inputs.append(entity_id)

    def add_output(self, entity_id: str):
        """Record an output entity."""
        if entity_id not in self.outputs:
            self.outputs.append(entity_id)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "@type": "prov:Activity",
            "@id": self.id,
            "activity_type": self.activity_type.value,
            "name": self.name,
            "started_at": self.started_at,
            "ended_at": self.ended_at,
            "duration": self.duration,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "agent_id": self.agent_id,
            "parameters": self.parameters,
            "attributes": self.attributes,
        }

    def to_prov_n(self) -> str:
        """Export as PROV-N notation."""
        start = datetime.fromtimestamp(self.started_at).isoformat() if self.started_at else "-"
        end = datetime.fromtimestamp(self.ended_at).isoformat() if self.ended_at else "-"
        attrs = f'prov:label="{self.name}", cascade:type="{self.activity_type.value}"'
        return f"activity({self.id}, {start}, {end}, [{attrs}])"

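
# Illustrative usage (editorial sketch, not part of the committed module; the
# entity IDs are made up): an Activity stamps its own start time, records
# input/output entity IDs, and exposes a duration once end() is called.
_example_clean = Activity(id="", activity_type=ActivityType.CLEAN, name="drop-nulls")
_example_clean.add_input("entity:raw:1")
_example_clean.add_output("entity:clean:1")
_example_clean.end()
assert _example_clean.duration is not None and _example_clean.duration >= 0
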
class AgentType(Enum):
    """Types of agents."""
    PERSON = "person"
    ORGANIZATION = "organization"
    SOFTWARE = "software"
    MODEL = "model"
    PIPELINE = "pipeline"
    SYSTEM = "system"

@dataclass
class Agent:
    """
    An agent in the provenance graph.

    Corresponds to prov:Agent - something that bears responsibility
    for an activity taking place.
    """
    id: str
    agent_type: AgentType
    name: str

    # For software/model agents
    version: Optional[str] = None

    # For organizational hierarchy
    parent_agent_id: Optional[str] = None

    # Contact/identification
    identifier: Optional[str] = None  # HF username, email, etc.

    # Metadata
    attributes: Dict[str, Any] = field(default_factory=dict)

    # Timestamp
    created_at: float = field(default_factory=time.time)

    def __post_init__(self):
        if not self.id:
            self.id = f"agent:{self.agent_type.value}:{self.name}".replace(" ", "_").lower()

    def to_dict(self) -> Dict[str, Any]:
        return {
            "@type": "prov:Agent",
            "@id": self.id,
            "agent_type": self.agent_type.value,
            "name": self.name,
            "version": self.version,
            "parent_agent_id": self.parent_agent_id,
            "identifier": self.identifier,
            "attributes": self.attributes,
            "created_at": self.created_at,
        }

    def to_prov_n(self) -> str:
        """Export as PROV-N notation."""
        attrs = f'prov:label="{self.name}", cascade:type="{self.agent_type.value}"'
        if self.version:
            attrs += f', cascade:version="{self.version}"'
        return f"agent({self.id}, [{attrs}])"

# Convenience factory functions
def create_system_agent(name: str = "cascade", version: str = "1.0.0") -> Agent:
    """Create a system agent for automated operations."""
    return Agent(
        id=f"agent:system:{name}",
        agent_type=AgentType.SYSTEM,
        name=name,
        version=version,
    )


def create_model_agent(model_id: str, version: str = None) -> Agent:
    """Create an agent representing an ML model."""
    return Agent(
        id=f"agent:model:{model_id.replace('/', '_')}",
        agent_type=AgentType.MODEL,
        name=model_id,
        version=version,
        identifier=model_id,
    )


def create_user_agent(username: str, org: str = None) -> Agent:
    """Create an agent representing a user."""
    agent = Agent(
        id=f"agent:person:{username}",
        agent_type=AgentType.PERSON,
        name=username,
        identifier=username,
    )
    if org:
        agent.parent_agent_id = f"agent:organization:{org}"
    return agent
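
As a quick orientation before the Hub integration below, a minimal sketch of how the factory helpers above compose (the username and org names are illustrative placeholders, not from this commit):

pipeline = create_system_agent(name="cascade", version="0.5.4")
author = create_user_agent("alice", org="example-org")
model = create_model_agent("example-org/mini-bert", version="1")
assert author.parent_agent_id == "agent:organization:example-org"
assert model.id == "agent:model:example-org_mini-bert"
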
cascade/data/hub.py
ADDED
@@ -0,0 +1,533 @@
"""
HuggingFace Hub Integration

Push and pull dataset provenance to/from HuggingFace Hub.

Exports complete W3C PROV-O accountability bundle:
- cascade_provenance.json (CASCADE native format)
- prov_o.jsonld (W3C PROV-O JSON-LD - interoperable)
- prov_n.txt (W3C PROV-N notation - human readable)
- activities.jsonl (Activity log for audit)
- agents.json (Agent attributions)
- croissant.json (MLCommons Croissant)
"""

import json
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from .provenance import ProvenanceGraph
from .croissant import CroissantExporter

class AccountabilityBundle:
    """
    Complete W3C PROV-O accountability package.

    When a dataset is extracted, this bundle provides a full audit trail:
    - Who created/modified it (agents)
    - What transformations occurred (activities)
    - Where it came from (entity lineage)
    - When everything happened (timestamps)
    - How to verify integrity (hashes)
    """

    def __init__(self, graph: ProvenanceGraph):
        self.graph = graph
        self.created_at = datetime.now(timezone.utc).isoformat()

    def to_prov_o_jsonld(self) -> Dict[str, Any]:
        """Export W3C PROV-O JSON-LD (interoperable standard)."""
        return self.graph.to_prov_jsonld()

    def to_prov_n(self) -> str:
        """Export W3C PROV-N notation (human readable)."""
        return self.graph.to_prov_n()

    def to_activity_log(self) -> List[Dict[str, Any]]:
        """Export activity log for audit (JSONL format)."""
        activities = []
        for activity in self.graph.list_activities():
            activities.append({
                "id": activity.id,
                "name": activity.name,
                "type": activity.activity_type.value,
                "started_at": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
                "ended_at": datetime.fromtimestamp(activity.ended_at).isoformat() if activity.ended_at else None,
                "duration_seconds": activity.duration,
                "inputs": activity.inputs,
                "outputs": activity.outputs,
                "parameters": activity.parameters,
                "attributes": activity.attributes,
            })
        return activities

    def to_agent_attributions(self) -> Dict[str, Any]:
        """Export agent attributions for accountability."""
        agents = {}
        for agent in self.graph.list_agents():
            agents[agent.id] = {
                "name": agent.name,
                "type": agent.agent_type.value,
                "version": agent.version,
                "identifier": agent.identifier,
                "attributes": agent.attributes,
            }

        # Build attribution matrix: which agent did what
        attributions = []
        for rel in self.graph.list_relationships():
            if rel.relation_type.value == "wasAssociatedWith":
                activity = self.graph.get_activity(rel.source_id)
                agent = self.graph.get_agent(rel.target_id)
                if activity and agent:
                    attributions.append({
                        "activity_id": activity.id,
                        "activity_name": activity.name,
                        "agent_id": agent.id,
                        "agent_name": agent.name,
                        "timestamp": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
                    })

        return {
            "agents": agents,
            "attributions": attributions,
            "total_agents": len(agents),
            "total_attributions": len(attributions),
        }

    def to_integrity_manifest(self) -> Dict[str, Any]:
        """Export integrity manifest for verification."""
        is_valid, invalid_ids = self.graph.verify_integrity()

        return {
            "root_hash": self.graph.root_hash,
            "created_at": self.created_at,
            "is_valid": is_valid,
            "invalid_entity_ids": invalid_ids,
            "entity_hashes": {
                entity.id: {
                    "content_hash": entity.content_hash,
                    "schema_hash": entity.schema_hash,
                }
                for entity in self.graph.list_entities()
            },
            "verification_note": (
                "To verify: recompute content hashes and compare against this manifest. "
                "Any mismatch indicates data tampering."
            ),
        }

    def export(self, output_dir: str):
        """Export all accountability artifacts to a directory."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        # 1. CASCADE provenance JSON
        with open(os.path.join(output_dir, "cascade_provenance.json"), "w") as f:
            json.dump(self.graph.to_dict(), f, indent=2, default=str)

        # 2. W3C PROV-O JSON-LD
        with open(os.path.join(output_dir, "prov_o.jsonld"), "w") as f:
            json.dump(self.to_prov_o_jsonld(), f, indent=2, default=str)

        # 3. W3C PROV-N notation
        with open(os.path.join(output_dir, "prov_n.txt"), "w") as f:
            f.write(self.to_prov_n())

        # 4. Activity log
        with open(os.path.join(output_dir, "activities.jsonl"), "w") as f:
            for activity in self.to_activity_log():
                f.write(json.dumps(activity, default=str) + "\n")

        # 5. Agent attributions
        with open(os.path.join(output_dir, "agents.json"), "w") as f:
            json.dump(self.to_agent_attributions(), f, indent=2, default=str)

        # 6. Integrity manifest
        with open(os.path.join(output_dir, "integrity_manifest.json"), "w") as f:
            json.dump(self.to_integrity_manifest(), f, indent=2, default=str)

        # 7. Croissant metadata
        exporter = CroissantExporter(self.graph)
        croissant_content = exporter.to_json(name="dataset", url="local://")
        with open(os.path.join(output_dir, "croissant.json"), "w") as f:
            f.write(croissant_content)

    def summary(self) -> Dict[str, Any]:
        """Summary of the accountability bundle."""
        stats = self.graph.stats
        return {
            "bundle_created_at": self.created_at,
            "graph_name": self.graph.name,
            "root_hash": self.graph.root_hash,
            "entities": stats["entities"],
            "activities": stats["activities"],
            "agents": stats["agents"],
            "relationships": stats["relationships"],
            "files_included": [
                "cascade_provenance.json",
                "prov_o.jsonld",
                "prov_n.txt",
                "activities.jsonl",
                "agents.json",
                "integrity_manifest.json",
                "croissant.json",
            ],
        }

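# Illustrative usage (editorial sketch; assumes a populated ProvenanceGraph
# named `graph` built elsewhere in cascade.data):
#
#   bundle = AccountabilityBundle(graph)
#   bundle.export("./audit_bundle")            # writes all seven artifacts
#   print(bundle.summary()["files_included"])
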
class HubIntegration:
    """
    Integration with HuggingFace Hub for dataset provenance.

    Stores complete accountability bundle:
    1. cascade_provenance.json - CASCADE native format
    2. prov_o.jsonld - W3C PROV-O JSON-LD (interoperable)
    3. prov_n.txt - W3C PROV-N notation (human readable)
    4. activities.jsonl - Activity log for audit
    5. agents.json - Agent attributions
    6. integrity_manifest.json - Hash verification
    7. croissant.json - MLCommons Croissant
    8. README.md - Human-readable provenance section
    """

    PROVENANCE_FILENAME = "cascade_provenance.json"
    PROV_O_FILENAME = "prov_o.jsonld"
    PROV_N_FILENAME = "prov_n.txt"
    ACTIVITIES_FILENAME = "activities.jsonl"
    AGENTS_FILENAME = "agents.json"
    INTEGRITY_FILENAME = "integrity_manifest.json"
    CROISSANT_FILENAME = "croissant.json"

    def __init__(self, token: str = None):
        """
        Initialize Hub integration.

        Args:
            token: HuggingFace API token (optional, uses cached token if not provided)
        """
        self.token = token

    def push_provenance(
        self,
        graph: ProvenanceGraph,
        repo_id: str,
        commit_message: str = "Update provenance",
        private: bool = False,
        include_croissant: bool = True,
        full_accountability: bool = True,
    ) -> str:
        """
        Push complete accountability bundle to HuggingFace Hub.

        Args:
            graph: The provenance graph to push
            repo_id: HuggingFace repo ID (e.g., "username/dataset-name")
            commit_message: Commit message
            private: Whether the repo should be private
            include_croissant: Whether to include Croissant JSON-LD
            full_accountability: Whether to include full W3C PROV-O bundle

        Returns:
            URL of the pushed provenance
        """
        from huggingface_hub import HfApi, CommitOperationAdd

        api = HfApi(token=self.token)

        # Ensure repo exists
        api.create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=private,
            exist_ok=True,
        )

        operations = []
        bundle = AccountabilityBundle(graph)

        # 1. CASCADE provenance JSON (native format)
        provenance_content = json.dumps(graph.to_dict(), indent=2, default=str)
        operations.append(CommitOperationAdd(
            path_in_repo=self.PROVENANCE_FILENAME,
            path_or_fileobj=provenance_content.encode("utf-8"),
        ))

        if full_accountability:
            # 2. W3C PROV-O JSON-LD (interoperable standard)
            prov_o_content = json.dumps(bundle.to_prov_o_jsonld(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.PROV_O_FILENAME,
                path_or_fileobj=prov_o_content.encode("utf-8"),
            ))

            # 3. W3C PROV-N notation (human readable)
            prov_n_content = bundle.to_prov_n()
            operations.append(CommitOperationAdd(
                path_in_repo=self.PROV_N_FILENAME,
                path_or_fileobj=prov_n_content.encode("utf-8"),
            ))

            # 4. Activity log (JSONL for easy grep/audit)
            activities = bundle.to_activity_log()
            activities_content = "\n".join(json.dumps(a, default=str) for a in activities)
            operations.append(CommitOperationAdd(
                path_in_repo=self.ACTIVITIES_FILENAME,
                path_or_fileobj=activities_content.encode("utf-8"),
            ))

            # 5. Agent attributions
            agents_content = json.dumps(bundle.to_agent_attributions(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.AGENTS_FILENAME,
                path_or_fileobj=agents_content.encode("utf-8"),
            ))

            # 6. Integrity manifest (for verification)
            integrity_content = json.dumps(bundle.to_integrity_manifest(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.INTEGRITY_FILENAME,
                path_or_fileobj=integrity_content.encode("utf-8"),
            ))

        # 7. Croissant JSON-LD (MLCommons standard)
        if include_croissant:
            exporter = CroissantExporter(graph)
            croissant_content = exporter.to_json(
                name=repo_id.split("/")[-1],
                url=f"https://huggingface.co/datasets/{repo_id}",
            )
            operations.append(CommitOperationAdd(
                path_in_repo=self.CROISSANT_FILENAME,
                path_or_fileobj=croissant_content.encode("utf-8"),
            ))

        # Commit all accountability artifacts
        api.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=operations,
            commit_message=commit_message,
        )

        return f"https://huggingface.co/datasets/{repo_id}"

    def pull_provenance(self, repo_id: str) -> Optional[ProvenanceGraph]:
        """
        Pull provenance from HuggingFace Hub.

        Args:
            repo_id: HuggingFace repo ID

        Returns:
            ProvenanceGraph if found, None otherwise
        """
        from huggingface_hub import hf_hub_download

        try:
            # Download provenance file
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=self.PROVENANCE_FILENAME,
                repo_type="dataset",
                token=self.token,
            )

            with open(local_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            return ProvenanceGraph.from_dict(data)

        except Exception as e:
            print(f"Could not pull provenance from {repo_id}: {e}")
            return None

    def get_dataset_provenance_url(self, repo_id: str) -> str:
        """Get URL to provenance file in Hub."""
        return f"https://huggingface.co/datasets/{repo_id}/blob/main/{self.PROVENANCE_FILENAME}"

    def update_dataset_card(
        self,
        repo_id: str,
        graph: ProvenanceGraph,
    ) -> str:
        """
        Update dataset card with provenance summary.

        Adds/updates YAML front-matter with:
        - Lineage information
        - Root hash
        - Entity/activity counts

        Args:
            repo_id: HuggingFace repo ID
            graph: Provenance graph

        Returns:
            URL of the updated dataset
        """
        from huggingface_hub import HfApi, hf_hub_download

        api = HfApi(token=self.token)

        # Build provenance section for README
        provenance_section = self._build_readme_section(graph)

        # Get current README
        try:
            readme_path = hf_hub_download(
                repo_id=repo_id,
                filename="README.md",
                repo_type="dataset",
                token=self.token,
            )
            with open(readme_path, "r", encoding="utf-8") as f:
                current_readme = f.read()
        except Exception:
            current_readme = f"# {repo_id.split('/')[-1]}\n\n"

        # Update or append provenance section
        marker_start = "<!-- CASCADE_PROVENANCE_START -->"
        marker_end = "<!-- CASCADE_PROVENANCE_END -->"

        if marker_start in current_readme:
            # Replace existing section
            import re
            pattern = re.escape(marker_start) + r".*?" + re.escape(marker_end)
            new_readme = re.sub(
                pattern,
                f"{marker_start}\n{provenance_section}\n{marker_end}",
                current_readme,
                flags=re.DOTALL,
            )
        else:
            # Append section
            new_readme = current_readme.rstrip() + f"\n\n{marker_start}\n{provenance_section}\n{marker_end}\n"

        # Push updated README
        api.upload_file(
            path_or_fileobj=new_readme.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Update provenance in README",
        )

        return f"https://huggingface.co/datasets/{repo_id}"

    def _build_readme_section(self, graph: ProvenanceGraph) -> str:
        """Build provenance section for README."""
        stats = graph.stats
        bundle = AccountabilityBundle(graph)

        lines = [
            "## 🔗 Provenance & Accountability",
            "",
            "This dataset has CASCADE provenance tracking enabled with full W3C PROV-O compliance.",
            "",
            "### Integrity",
            "",
            f"| Metric | Value |",
            f"|--------|-------|",
            f"| Root Hash | `{graph.root_hash[:16]}...` |",
            f"| Entities | {stats['entities']} |",
            f"| Activities | {stats['activities']} |",
            f"| Agents | {stats['agents']} |",
            f"| Relationships | {stats['relationships']} |",
            "",
        ]

        # Add lineage summary
        entities = graph.list_entities()
        if entities:
            lines.append("### Lineage")
            lines.append("")
            for entity in entities[:5]:  # Show first 5
                upstream = graph.get_lineage(entity.id, "upstream")
                if upstream:
                    lines.append(f"- **{entity.name}** derived from: {', '.join(upstream[:3])}")
                else:
                    lines.append(f"- **{entity.name}** (source)")
            if len(entities) > 5:
                lines.append(f"- ... and {len(entities) - 5} more entities")
            lines.append("")

        # Add activities summary
        activities = graph.list_activities()
        if activities:
            lines.append("### Activities")
            lines.append("")
            for activity in activities[:5]:
                duration = f" ({activity.duration:.2f}s)" if activity.duration else ""
                lines.append(f"- **{activity.name}** [{activity.activity_type.value}]{duration}")
            if len(activities) > 5:
                lines.append(f"- ... and {len(activities) - 5} more activities")
            lines.append("")

        # Add agents summary
        agents = graph.list_agents()
        if agents:
            lines.append("### Agents (Accountability)")
            lines.append("")
            for agent in agents[:5]:
                lines.append(f"- **{agent.name}** [{agent.agent_type.value}]")
            if len(agents) > 5:
                lines.append(f"- ... and {len(agents) - 5} more agents")
            lines.append("")

        # Accountability bundle files
        lines.extend([
            "### Accountability Bundle",
            "",
            "| File | Standard | Description |",
            "|------|----------|-------------|",
            f"| [{self.PROVENANCE_FILENAME}]({self.PROVENANCE_FILENAME}) | CASCADE | Native provenance format |",
            f"| [{self.PROV_O_FILENAME}]({self.PROV_O_FILENAME}) | W3C PROV-O | Interoperable JSON-LD |",
            f"| [{self.PROV_N_FILENAME}]({self.PROV_N_FILENAME}) | W3C PROV-N | Human-readable notation |",
            f"| [{self.ACTIVITIES_FILENAME}]({self.ACTIVITIES_FILENAME}) | JSONL | Activity audit log |",
            f"| [{self.AGENTS_FILENAME}]({self.AGENTS_FILENAME}) | JSON | Agent attributions |",
            f"| [{self.INTEGRITY_FILENAME}]({self.INTEGRITY_FILENAME}) | JSON | Hash verification manifest |",
            f"| [{self.CROISSANT_FILENAME}]({self.CROISSANT_FILENAME}) | MLCommons | Croissant metadata |",
            "",
        ])

        return "\n".join(lines)

def push_to_hub(
    graph: ProvenanceGraph,
    repo_id: str,
    token: str = None,
    private: bool = False,
) -> str:
    """
    Convenience function to push provenance to Hub.

    Args:
        graph: Provenance graph to push
        repo_id: HuggingFace repo ID
        token: HF token (optional)
        private: Whether repo should be private

    Returns:
        URL of the pushed provenance
    """
    hub = HubIntegration(token=token)
    return hub.push_provenance(graph, repo_id, private=private)


def pull_from_hub(repo_id: str, token: str = None) -> Optional[ProvenanceGraph]:
    """
    Convenience function to pull provenance from Hub.

    Args:
        repo_id: HuggingFace repo ID
        token: HF token (optional)

    Returns:
        ProvenanceGraph if found
    """
    hub = HubIntegration(token=token)
    return hub.pull_provenance(repo_id)
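
A hedged round-trip sketch using the two convenience functions above (the repo ID and token are placeholders; this assumes huggingface_hub is installed, the token has write access, and `graph` is an existing ProvenanceGraph):

url = push_to_hub(graph, "username/my-dataset", token="hf_...")
restored = pull_from_hub("username/my-dataset")
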
cascade/data/license.py
ADDED
@@ -0,0 +1,635 @@
"""
SPDX License Tracking for CASCADE

Industry-standard license tracking based on:
- SPDX (Software Package Data Exchange) - Linux Foundation
- HuggingFace Dataset Cards license field
- Croissant metadata license property

License Compatibility Rules:
- Permissive (MIT, Apache-2.0) → Can derive into restrictive
- Copyleft (GPL-3.0) → Derivatives must also be copyleft
- NonCommercial (CC-BY-NC-*) → Propagates non-commercial restriction
- ShareAlike (CC-BY-SA-*) → Derivatives must use same license
- NoDerivatives (CC-BY-ND-*) → Cannot create derivatives

References:
- https://spdx.org/licenses/
- https://creativecommons.org/licenses/
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Set, Tuple, Any


class LicenseCategory(Enum):
    """License categories for compatibility analysis."""
    PERMISSIVE = "permissive"              # MIT, Apache, BSD
    WEAK_COPYLEFT = "weak-copyleft"        # LGPL, MPL
    STRONG_COPYLEFT = "strong-copyleft"    # GPL, AGPL
    CREATIVE_COMMONS = "creative-commons"
    PUBLIC_DOMAIN = "public-domain"        # CC0, Unlicense
    PROPRIETARY = "proprietary"
    UNKNOWN = "unknown"


class LicenseRestriction(Enum):
    """License restrictions that propagate to derivatives."""
    NONE = "none"
    ATTRIBUTION = "attribution"          # Must credit original
    SHARE_ALIKE = "share-alike"          # Derivatives same license
    NON_COMMERCIAL = "non-commercial"    # No commercial use
    NO_DERIVATIVES = "no-derivatives"    # Cannot modify
    COPYLEFT = "copyleft"                # Must open-source derivatives

@dataclass
class SPDXLicense:
    """
    SPDX License Information.

    Based on the SPDX License List: https://spdx.org/licenses/
    """
    id: str                                # SPDX identifier (e.g., "MIT", "Apache-2.0")
    name: str                              # Full name
    category: LicenseCategory = LicenseCategory.UNKNOWN
    restrictions: Set[LicenseRestriction] = field(default_factory=set)
    osi_approved: bool = False             # Open Source Initiative approved
    fsf_libre: bool = False                # FSF Free/Libre
    url: Optional[str] = None              # License text URL

    def allows_commercial(self) -> bool:
        """Check if license allows commercial use."""
        return LicenseRestriction.NON_COMMERCIAL not in self.restrictions

    def allows_derivatives(self) -> bool:
        """Check if license allows creating derivatives."""
        return LicenseRestriction.NO_DERIVATIVES not in self.restrictions

    def requires_attribution(self) -> bool:
        """Check if license requires attribution."""
        return LicenseRestriction.ATTRIBUTION in self.restrictions

    def requires_share_alike(self) -> bool:
        """Check if license requires same license for derivatives."""
        return (
            LicenseRestriction.SHARE_ALIKE in self.restrictions or
            LicenseRestriction.COPYLEFT in self.restrictions
        )

    def to_dict(self) -> Dict[str, Any]:
        return {
            "spdx_id": self.id,
            "name": self.name,
            "category": self.category.value,
            "restrictions": [r.value for r in self.restrictions],
            "osi_approved": self.osi_approved,
            "fsf_libre": self.fsf_libre,
            "url": self.url,
        }

# SPDX License Registry - Common ML/Data licenses
SPDX_LICENSES: Dict[str, SPDXLicense] = {
    # Public Domain
    "CC0-1.0": SPDXLicense(
        id="CC0-1.0",
        name="Creative Commons Zero v1.0 Universal",
        category=LicenseCategory.PUBLIC_DOMAIN,
        restrictions=set(),
        osi_approved=False,
        fsf_libre=True,
        url="https://creativecommons.org/publicdomain/zero/1.0/",
    ),
    "Unlicense": SPDXLicense(
        id="Unlicense",
        name="The Unlicense",
        category=LicenseCategory.PUBLIC_DOMAIN,
        restrictions=set(),
        osi_approved=True,
        fsf_libre=True,
        url="https://unlicense.org/",
    ),

    # Permissive
    "MIT": SPDXLicense(
        id="MIT",
        name="MIT License",
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=True,
        fsf_libre=True,
        url="https://opensource.org/licenses/MIT",
    ),
    "Apache-2.0": SPDXLicense(
        id="Apache-2.0",
        name="Apache License 2.0",
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=True,
        fsf_libre=True,
        url="https://www.apache.org/licenses/LICENSE-2.0",
    ),
    "BSD-2-Clause": SPDXLicense(
        id="BSD-2-Clause",
        name='BSD 2-Clause "Simplified" License',
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=True,
        fsf_libre=True,
        url="https://opensource.org/licenses/BSD-2-Clause",
    ),
    "BSD-3-Clause": SPDXLicense(
        id="BSD-3-Clause",
        name='BSD 3-Clause "New" or "Revised" License',
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=True,
        fsf_libre=True,
        url="https://opensource.org/licenses/BSD-3-Clause",
    ),

    # Creative Commons
    "CC-BY-4.0": SPDXLicense(
        id="CC-BY-4.0",
        name="Creative Commons Attribution 4.0",
        category=LicenseCategory.CREATIVE_COMMONS,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=False,
        fsf_libre=True,
        url="https://creativecommons.org/licenses/by/4.0/",
    ),
    "CC-BY-SA-4.0": SPDXLicense(
        id="CC-BY-SA-4.0",
        name="Creative Commons Attribution ShareAlike 4.0",
        category=LicenseCategory.CREATIVE_COMMONS,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.SHARE_ALIKE},
        osi_approved=False,
        fsf_libre=True,
        url="https://creativecommons.org/licenses/by-sa/4.0/",
    ),
    "CC-BY-NC-4.0": SPDXLicense(
        id="CC-BY-NC-4.0",
        name="Creative Commons Attribution NonCommercial 4.0",
        category=LicenseCategory.CREATIVE_COMMONS,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NON_COMMERCIAL},
        osi_approved=False,
        fsf_libre=False,
        url="https://creativecommons.org/licenses/by-nc/4.0/",
    ),
    "CC-BY-NC-SA-4.0": SPDXLicense(
        id="CC-BY-NC-SA-4.0",
        name="Creative Commons Attribution NonCommercial ShareAlike 4.0",
        category=LicenseCategory.CREATIVE_COMMONS,
        restrictions={
            LicenseRestriction.ATTRIBUTION,
            LicenseRestriction.NON_COMMERCIAL,
            LicenseRestriction.SHARE_ALIKE,
        },
        osi_approved=False,
        fsf_libre=False,
        url="https://creativecommons.org/licenses/by-nc-sa/4.0/",
    ),
    "CC-BY-ND-4.0": SPDXLicense(
        id="CC-BY-ND-4.0",
        name="Creative Commons Attribution NoDerivatives 4.0",
        category=LicenseCategory.CREATIVE_COMMONS,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NO_DERIVATIVES},
        osi_approved=False,
        fsf_libre=False,
        url="https://creativecommons.org/licenses/by-nd/4.0/",
    ),

    # Weak Copyleft
    "LGPL-3.0": SPDXLicense(
        id="LGPL-3.0",
        name="GNU Lesser General Public License v3.0",
        category=LicenseCategory.WEAK_COPYLEFT,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
        osi_approved=True,
        fsf_libre=True,
        url="https://www.gnu.org/licenses/lgpl-3.0.html",
    ),
    "MPL-2.0": SPDXLicense(
        id="MPL-2.0",
        name="Mozilla Public License 2.0",
        category=LicenseCategory.WEAK_COPYLEFT,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
        osi_approved=True,
        fsf_libre=True,
        url="https://www.mozilla.org/en-US/MPL/2.0/",
    ),

    # Strong Copyleft
    "GPL-3.0": SPDXLicense(
        id="GPL-3.0",
        name="GNU General Public License v3.0",
        category=LicenseCategory.STRONG_COPYLEFT,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
        osi_approved=True,
        fsf_libre=True,
        url="https://www.gnu.org/licenses/gpl-3.0.html",
    ),
    "AGPL-3.0": SPDXLicense(
        id="AGPL-3.0",
        name="GNU Affero General Public License v3.0",
        category=LicenseCategory.STRONG_COPYLEFT,
        restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
        osi_approved=True,
        fsf_libre=True,
        url="https://www.gnu.org/licenses/agpl-3.0.html",
    ),

    # ML-Specific
    "OpenRAIL": SPDXLicense(
        id="OpenRAIL",
        name="Open RAIL License",
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=False,
        fsf_libre=False,
        url="https://huggingface.co/blog/open_rail",
    ),
    "OpenRAIL-M": SPDXLicense(
        id="OpenRAIL-M",
        name="Open RAIL-M License",
        category=LicenseCategory.PERMISSIVE,
        restrictions={LicenseRestriction.ATTRIBUTION},
        osi_approved=False,
        fsf_libre=False,
        url="https://www.licenses.ai/blog/2022/8/26/bigscience-open-rail-m-license",
    ),

    # Special
    "other": SPDXLicense(
        id="other",
        name="Other/Custom License",
        category=LicenseCategory.UNKNOWN,
        restrictions=set(),
        osi_approved=False,
        fsf_libre=False,
        url=None,
    ),
    "unknown": SPDXLicense(
        id="unknown",
        name="Unknown License",
        category=LicenseCategory.UNKNOWN,
        restrictions=set(),
        osi_approved=False,
        fsf_libre=False,
        url=None,
    ),
}

def get_license(spdx_id: str) -> SPDXLicense:
    """
    Get license by SPDX identifier.

    Args:
        spdx_id: SPDX license identifier (case-insensitive)

    Returns:
        SPDXLicense object (unknown if not found)
    """
    # Normalize common variants
    normalized = spdx_id.strip()

    # Direct lookup
    if normalized in SPDX_LICENSES:
        return SPDX_LICENSES[normalized]

    # Case-insensitive lookup
    for key, lic in SPDX_LICENSES.items():
        if key.lower() == normalized.lower():
            return lic

    # Common aliases
    aliases = {
        "mit": "MIT",
        "apache": "Apache-2.0",
        "apache2": "Apache-2.0",
        "gpl": "GPL-3.0",
        "gpl3": "GPL-3.0",
        "lgpl": "LGPL-3.0",
        "bsd": "BSD-3-Clause",
        "cc0": "CC0-1.0",
        "cc-by": "CC-BY-4.0",
        "cc-by-sa": "CC-BY-SA-4.0",
        "cc-by-nc": "CC-BY-NC-4.0",
        "cc-by-nc-sa": "CC-BY-NC-SA-4.0",
        "cc-by-nd": "CC-BY-ND-4.0",
        "unlicense": "Unlicense",
        "public-domain": "CC0-1.0",  # hyphenated to match the normalization below
        "openrail": "OpenRAIL",
    }

    lower_id = normalized.lower().replace("_", "-").replace(" ", "-")
    if lower_id in aliases:
        return SPDX_LICENSES[aliases[lower_id]]

    # Return unknown
    return SPDX_LICENSES["unknown"]

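# Illustrative usage (editorial sketch, not part of the committed module):
# lookup tolerates case variants and common aliases, and falls back to the
# "unknown" registry entry otherwise.
assert get_license("mit").id == "MIT"
assert get_license("cc-by-sa").id == "CC-BY-SA-4.0"
assert get_license("made-up-license").id == "unknown"
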
@dataclass
class LicenseCompatibility:
    """Result of license compatibility check."""
    compatible: bool
    derived_license: Optional[SPDXLicense] = None
    issues: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    attribution_required: List[str] = field(default_factory=list)  # Source IDs requiring attribution

class LicenseAnalyzer:
    """
    Analyze license compatibility for dataset derivation.

    Rules:
    1. No-Derivatives: Cannot create derivatives
    2. Share-Alike: Must use same license
    3. Copyleft: Must use compatible copyleft license
    4. Non-Commercial: Restriction propagates
    5. Attribution: Must credit all sources
    """

    # License compatibility matrix (can this → derive into that?)
    # Rows: source license category, Columns: derived license category
    COMPATIBILITY_MATRIX = {
        LicenseCategory.PUBLIC_DOMAIN: {
            LicenseCategory.PUBLIC_DOMAIN: True,
            LicenseCategory.PERMISSIVE: True,
            LicenseCategory.CREATIVE_COMMONS: True,
            LicenseCategory.WEAK_COPYLEFT: True,
            LicenseCategory.STRONG_COPYLEFT: True,
            LicenseCategory.PROPRIETARY: True,
        },
        LicenseCategory.PERMISSIVE: {
            LicenseCategory.PUBLIC_DOMAIN: False,
            LicenseCategory.PERMISSIVE: True,
            LicenseCategory.CREATIVE_COMMONS: True,
            LicenseCategory.WEAK_COPYLEFT: True,
            LicenseCategory.STRONG_COPYLEFT: True,
            LicenseCategory.PROPRIETARY: True,
        },
        LicenseCategory.CREATIVE_COMMONS: {
            LicenseCategory.PUBLIC_DOMAIN: False,
            LicenseCategory.PERMISSIVE: False,        # Depends on specific CC
            LicenseCategory.CREATIVE_COMMONS: True,   # Depends on specific CC
            LicenseCategory.WEAK_COPYLEFT: False,
            LicenseCategory.STRONG_COPYLEFT: False,
            LicenseCategory.PROPRIETARY: False,
        },
        LicenseCategory.WEAK_COPYLEFT: {
            LicenseCategory.PUBLIC_DOMAIN: False,
            LicenseCategory.PERMISSIVE: False,
            LicenseCategory.CREATIVE_COMMONS: False,
            LicenseCategory.WEAK_COPYLEFT: True,
            LicenseCategory.STRONG_COPYLEFT: True,
            LicenseCategory.PROPRIETARY: False,
        },
        LicenseCategory.STRONG_COPYLEFT: {
            LicenseCategory.PUBLIC_DOMAIN: False,
            LicenseCategory.PERMISSIVE: False,
            LicenseCategory.CREATIVE_COMMONS: False,
            LicenseCategory.WEAK_COPYLEFT: False,
            LicenseCategory.STRONG_COPYLEFT: True,
            LicenseCategory.PROPRIETARY: False,
        },
    }

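    # Editorial note (added commentary): read the matrix row → column, source
    # category → derived category. PUBLIC_DOMAIN sources can flow into any
    # category; STRONG_COPYLEFT sources may only derive into STRONG_COPYLEFT.
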
    def check_compatibility(
        self,
        source_licenses: List[Tuple[str, str]],  # List of (entity_id, spdx_id)
        target_license: Optional[str] = None,
    ) -> LicenseCompatibility:
        """
        Check if source licenses allow derivation.

        Args:
            source_licenses: List of (entity_id, license_id) tuples
            target_license: Intended license for derived work (optional)

        Returns:
            LicenseCompatibility result
        """
        if not source_licenses:
            return LicenseCompatibility(
                compatible=True,
                derived_license=SPDX_LICENSES["unknown"],
            )

        issues = []
        warnings = []
        attribution_required = []

        # Collect all restrictions
        all_restrictions: Set[LicenseRestriction] = set()
        licenses = []

        for entity_id, spdx_id in source_licenses:
            lic = get_license(spdx_id)
            licenses.append((entity_id, lic))
            all_restrictions.update(lic.restrictions)

            # Track attribution requirements
            if lic.requires_attribution():
                attribution_required.append(entity_id)

        # Check No-Derivatives
        for entity_id, lic in licenses:
            if LicenseRestriction.NO_DERIVATIVES in lic.restrictions:
                issues.append(
                    f"Cannot derive from '{entity_id}': license '{lic.id}' prohibits derivatives"
                )

        if issues:
            return LicenseCompatibility(
                compatible=False,
                issues=issues,
                warnings=warnings,
                attribution_required=attribution_required,
            )

        # Determine derived license
        derived = self._compute_derived_license(licenses, all_restrictions)

        # Check target license compatibility
        if target_license:
            target = get_license(target_license)
            if not self._can_relicense(derived, target):
                issues.append(
                    f"Cannot license derived work as '{target.id}': "
                    f"must use '{derived.id}' or compatible license"
                )

        # Add warnings
        if LicenseRestriction.NON_COMMERCIAL in all_restrictions:
            warnings.append("Derived work restricted to non-commercial use only")

        if LicenseRestriction.SHARE_ALIKE in all_restrictions:
            warnings.append(f"Derived work must use ShareAlike-compatible license: {derived.id}")

        if LicenseRestriction.COPYLEFT in all_restrictions:
            warnings.append(f"Derived work must use copyleft license: {derived.id}")

        return LicenseCompatibility(
            compatible=len(issues) == 0,
            derived_license=derived,
            issues=issues,
            warnings=warnings,
            attribution_required=attribution_required,
        )

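    # Illustrative usage (editorial sketch; entity IDs are placeholders):
    #
    #   result = LicenseAnalyzer().check_compatibility(
    #       [("entity:a", "CC-BY-4.0"), ("entity:b", "CC-BY-NC-4.0")]
    #   )
    #   # → compatible=True, derived_license CC-BY-NC-4.0, plus a warning
    #   #   that the derived work is restricted to non-commercial use.
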
    def _compute_derived_license(
        self,
        licenses: List[Tuple[str, SPDXLicense]],
        all_restrictions: Set[LicenseRestriction],
    ) -> SPDXLicense:
        """
        Compute the most restrictive license for derived work.

        The derived license is the "lowest common denominator" that
        satisfies all source license requirements.
        """
        # Priority: Strong Copyleft > Weak Copyleft > CC-SA > CC-NC > Permissive > Public Domain

        has_strong_copyleft = any(
            lic.category == LicenseCategory.STRONG_COPYLEFT
            for _, lic in licenses
        )
        has_weak_copyleft = any(
            lic.category == LicenseCategory.WEAK_COPYLEFT
            for _, lic in licenses
        )
        has_share_alike = LicenseRestriction.SHARE_ALIKE in all_restrictions
        has_non_commercial = LicenseRestriction.NON_COMMERCIAL in all_restrictions

        # Strong copyleft dominates
        if has_strong_copyleft:
            for _, lic in licenses:
                if lic.category == LicenseCategory.STRONG_COPYLEFT:
                    return lic

        # Weak copyleft next
        if has_weak_copyleft:
            for _, lic in licenses:
                if lic.category == LicenseCategory.WEAK_COPYLEFT:
                    return lic

        # CC with restrictions
        if has_share_alike and has_non_commercial:
            return SPDX_LICENSES["CC-BY-NC-SA-4.0"]
        elif has_share_alike:
            return SPDX_LICENSES["CC-BY-SA-4.0"]
        elif has_non_commercial:
            return SPDX_LICENSES["CC-BY-NC-4.0"]

        # Most permissive with attribution
        if LicenseRestriction.ATTRIBUTION in all_restrictions:
            # Check if any source requires specific license
            for _, lic in licenses:
                if lic.category == LicenseCategory.CREATIVE_COMMONS:
                    return lic
            return SPDX_LICENSES["CC-BY-4.0"]

        # Public domain
        return SPDX_LICENSES["CC0-1.0"]

    def _can_relicense(self, source: SPDXLicense, target: SPDXLicense) -> bool:
        """Check if source license allows relicensing to target."""
        # Same license is always OK
        if source.id == target.id:
            return True

        # No relicensing from copyleft to non-copyleft
        if LicenseRestriction.COPYLEFT in source.restrictions:
            if LicenseRestriction.COPYLEFT not in target.restrictions:
                return False

        # No relicensing from share-alike to non-share-alike
        if LicenseRestriction.SHARE_ALIKE in source.restrictions:
            if LicenseRestriction.SHARE_ALIKE not in target.restrictions:
                return False

        # Non-commercial must propagate
        if LicenseRestriction.NON_COMMERCIAL in source.restrictions:
            if LicenseRestriction.NON_COMMERCIAL not in target.restrictions:
                return False

        return True

    def generate_attribution(
        self,
        sources: List[Tuple[str, str, str]],  # (entity_id, license_id, name)
    ) -> str:
        """
        Generate attribution text for derived work.

        Args:
            sources: List of (entity_id, license_id, name) tuples

        Returns:
            Attribution text
        """
        lines = [
            "## Attribution",
            "",
            "This dataset is derived from the following sources:",
            "",
        ]

        for entity_id, license_id, name in sources:
            lic = get_license(license_id)
            if lic.requires_attribution():
                line = f"- **{name}** (`{entity_id}`)"
                if lic.url:
                    line += f" - Licensed under [{lic.id}]({lic.url})"
                else:
                    line += f" - Licensed under {lic.id}"
                lines.append(line)

        if len(lines) == 4:  # No attributions needed
+
return ""
|
| 596 |
+
|
| 597 |
+
lines.append("")
|
| 598 |
+
return "\n".join(lines)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
# Singleton analyzer
|
| 602 |
+
_analyzer = LicenseAnalyzer()
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
def check_license_compatibility(
|
| 606 |
+
sources: List[Tuple[str, str]],
|
| 607 |
+
target: Optional[str] = None,
|
| 608 |
+
) -> LicenseCompatibility:
|
| 609 |
+
"""
|
| 610 |
+
Convenience function to check license compatibility.
|
| 611 |
+
|
| 612 |
+
Args:
|
| 613 |
+
sources: List of (entity_id, license_id) tuples
|
| 614 |
+
target: Intended license for derived work
|
| 615 |
+
|
| 616 |
+
Returns:
|
| 617 |
+
LicenseCompatibility result
|
| 618 |
+
"""
|
| 619 |
+
return _analyzer.check_compatibility(sources, target)
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def get_derived_license(sources: List[str]) -> SPDXLicense:
|
| 623 |
+
"""
|
| 624 |
+
Get the appropriate license for a work derived from given sources.
|
| 625 |
+
|
| 626 |
+
Args:
|
| 627 |
+
sources: List of SPDX license identifiers
|
| 628 |
+
|
| 629 |
+
Returns:
|
| 630 |
+
SPDXLicense for the derived work
|
| 631 |
+
"""
|
| 632 |
+
result = _analyzer.check_compatibility([
|
| 633 |
+
(f"source_{i}", lic) for i, lic in enumerate(sources)
|
| 634 |
+
])
|
| 635 |
+
return result.derived_license or SPDX_LICENSES["unknown"]
|
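A minimal usage sketch of the compatibility API above. The entity IDs are hypothetical, and the expected outputs assume the SPDX_LICENSES table defined earlier in this file gives CC-BY-SA-4.0 a ShareAlike restriction and CC-BY-NC-4.0 a NonCommercial restriction (their usual meanings):

# Sketch only - dataset IDs are made up; expected outputs assume the usual
# restriction flags on the SPDX_LICENSES entries defined above.
from cascade.data.license import check_license_compatibility, get_derived_license

result = check_license_compatibility(
    sources=[("wiki_corpus", "CC-BY-SA-4.0"), ("reviews_corpus", "CC-BY-NC-4.0")],
    target="MIT",
)
print(result.compatible)          # False: SA/NC restrictions cannot be relicensed to MIT
print(result.derived_license.id)  # "CC-BY-NC-SA-4.0": both restrictions propagate
for warning in result.warnings:
    print("warning:", warning)

# Derived license straight from SPDX identifiers
print(get_derived_license(["CC-BY-4.0", "CC0-1.0"]).id)  # "CC-BY-4.0"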
cascade/data/live.py
ADDED
@@ -0,0 +1,844 @@
"""
|
| 2 |
+
Live Document Tracer
|
| 3 |
+
|
| 4 |
+
Real-time streaming of document-centric provenance events.
|
| 5 |
+
This is the LIVE version of what the export system freezes.
|
| 6 |
+
|
| 7 |
+
Instead of: Model runs → Process → Export frozen provenance
|
| 8 |
+
We do: Model runs → STREAM events → View live document highlights
|
| 9 |
+
|
| 10 |
+
Same data model as the observer/exporter, just streamed in real-time
|
| 11 |
+
with document snippet context attached.
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
# Create observer with live streaming
|
| 15 |
+
observer = DatasetObserver("my_pipeline")
|
| 16 |
+
tracer = LiveDocumentTracer(observer)
|
| 17 |
+
|
| 18 |
+
# Subscribe to events
|
| 19 |
+
tracer.on_event(my_handler)
|
| 20 |
+
|
| 21 |
+
# Or stream to async consumer
|
| 22 |
+
async for event in tracer.stream():
|
| 23 |
+
render_highlight(event)
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
import asyncio
|
| 27 |
+
import json
|
| 28 |
+
import time
|
| 29 |
+
from dataclasses import dataclass, field
|
| 30 |
+
from enum import Enum
|
| 31 |
+
from typing import Any, Callable, Dict, Generator, List, Optional, Set, Tuple
|
| 32 |
+
from queue import Queue
|
| 33 |
+
from threading import Lock
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class TraceEventType(Enum):
|
| 38 |
+
"""Types of document trace events."""
|
| 39 |
+
# Data flow events
|
| 40 |
+
DOCUMENT_TOUCHED = "document_touched" # Model accessed this document/record
|
| 41 |
+
SPAN_HIGHLIGHTED = "span_highlighted" # Specific text span being processed
|
| 42 |
+
ASSOCIATION_CREATED = "association_created" # Link between two spans/documents
|
| 43 |
+
|
| 44 |
+
# Activity events
|
| 45 |
+
ACTIVITY_STARTED = "activity_started"
|
| 46 |
+
ACTIVITY_PROGRESS = "activity_progress"
|
| 47 |
+
ACTIVITY_COMPLETED = "activity_completed"
|
| 48 |
+
|
| 49 |
+
# Entity events
|
| 50 |
+
ENTITY_CREATED = "entity_created"
|
| 51 |
+
ENTITY_DERIVED = "entity_derived"
|
| 52 |
+
|
| 53 |
+
# Relationship events
|
| 54 |
+
LINK_CREATED = "link_created"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass
|
| 58 |
+
class DocumentSpan:
|
| 59 |
+
"""
|
| 60 |
+
A span within a document being traced.
|
| 61 |
+
|
| 62 |
+
This is the atomic unit of live visualization -
|
| 63 |
+
the specific text/content the model is touching.
|
| 64 |
+
"""
|
| 65 |
+
document_id: str # Entity or record ID
|
| 66 |
+
document_name: str # Human-readable name
|
| 67 |
+
field_name: str = "" # Column/field if applicable
|
| 68 |
+
row_index: int = -1 # Row if applicable
|
| 69 |
+
|
| 70 |
+
# The actual content span
|
| 71 |
+
text: str = "" # The snippet text
|
| 72 |
+
start_char: int = -1 # Start position in full text
|
| 73 |
+
end_char: int = -1 # End position in full text
|
| 74 |
+
|
| 75 |
+
# Visual hints
|
| 76 |
+
highlight_type: str = "default" # "source", "target", "match", "attention"
|
| 77 |
+
confidence: float = 1.0 # For attention/relevance visualization
|
| 78 |
+
|
| 79 |
+
# Metadata
|
| 80 |
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
| 81 |
+
|
| 82 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 83 |
+
return {
|
| 84 |
+
"document_id": self.document_id,
|
| 85 |
+
"document_name": self.document_name,
|
| 86 |
+
"field_name": self.field_name,
|
| 87 |
+
"row_index": self.row_index,
|
| 88 |
+
"text": self.text,
|
| 89 |
+
"start_char": self.start_char,
|
| 90 |
+
"end_char": self.end_char,
|
| 91 |
+
"highlight_type": self.highlight_type,
|
| 92 |
+
"confidence": self.confidence,
|
| 93 |
+
"metadata": self.metadata,
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@dataclass
|
| 98 |
+
class DocumentAssociation:
|
| 99 |
+
"""
|
| 100 |
+
An association between two document spans.
|
| 101 |
+
|
| 102 |
+
Represents the model saying "this connects to that".
|
| 103 |
+
"""
|
| 104 |
+
source: DocumentSpan
|
| 105 |
+
target: DocumentSpan
|
| 106 |
+
association_type: str = "related" # "match", "derived", "similar", "references"
|
| 107 |
+
confidence: float = 1.0
|
| 108 |
+
|
| 109 |
+
# Why this association was made
|
| 110 |
+
reason: str = ""
|
| 111 |
+
|
| 112 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 113 |
+
return {
|
| 114 |
+
"source": self.source.to_dict(),
|
| 115 |
+
"target": self.target.to_dict(),
|
| 116 |
+
"association_type": self.association_type,
|
| 117 |
+
"confidence": self.confidence,
|
| 118 |
+
"reason": self.reason,
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@dataclass
|
| 123 |
+
class TraceEvent:
|
| 124 |
+
"""
|
| 125 |
+
A single trace event for live document visualization.
|
| 126 |
+
|
| 127 |
+
This is what gets streamed to the UI in real-time.
|
| 128 |
+
"""
|
| 129 |
+
event_type: TraceEventType
|
| 130 |
+
timestamp: float = field(default_factory=time.time)
|
| 131 |
+
|
| 132 |
+
# Activity context
|
| 133 |
+
activity_id: Optional[str] = None
|
| 134 |
+
activity_name: Optional[str] = None
|
| 135 |
+
activity_type: Optional[str] = None
|
| 136 |
+
|
| 137 |
+
# Document spans involved
|
| 138 |
+
spans: List[DocumentSpan] = field(default_factory=list)
|
| 139 |
+
|
| 140 |
+
# Association if this event creates one
|
| 141 |
+
association: Optional[DocumentAssociation] = None
|
| 142 |
+
|
| 143 |
+
# Progress for long operations
|
| 144 |
+
progress: Optional[float] = None # 0.0 to 1.0
|
| 145 |
+
progress_message: Optional[str] = None
|
| 146 |
+
|
| 147 |
+
# Raw provenance data (for export compatibility)
|
| 148 |
+
entity_id: Optional[str] = None
|
| 149 |
+
relationship_type: Optional[str] = None
|
| 150 |
+
|
| 151 |
+
# Metadata
|
| 152 |
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
| 153 |
+
|
| 154 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 155 |
+
return {
|
| 156 |
+
"event_type": self.event_type.value,
|
| 157 |
+
"timestamp": self.timestamp,
|
| 158 |
+
"activity_id": self.activity_id,
|
| 159 |
+
"activity_name": self.activity_name,
|
| 160 |
+
"activity_type": self.activity_type,
|
| 161 |
+
"spans": [s.to_dict() for s in self.spans],
|
| 162 |
+
"association": self.association.to_dict() if self.association else None,
|
| 163 |
+
"progress": self.progress,
|
| 164 |
+
"progress_message": self.progress_message,
|
| 165 |
+
"entity_id": self.entity_id,
|
| 166 |
+
"metadata": self.metadata,
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
def to_json(self) -> str:
|
| 170 |
+
return json.dumps(self.to_dict(), default=str)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class LiveDocumentTracer:
|
| 174 |
+
"""
|
| 175 |
+
Real-time document tracing for live visualization.
|
| 176 |
+
|
| 177 |
+
Hooks into DatasetObserver to stream events as they happen,
|
| 178 |
+
enriched with document snippet context for visualization.
|
| 179 |
+
|
| 180 |
+
This is the LIVE version of what CroissantExporter freezes.
|
| 181 |
+
|
| 182 |
+
NEW: Now writes all events to a tape file (JSONL) for buffered playback!
|
| 183 |
+
"""
|
| 184 |
+
|
| 185 |
+
def __init__(self, observer=None, buffer_size: int = 1000, log_dir: str = "./logs"):
|
| 186 |
+
"""
|
| 187 |
+
Initialize tracer.
|
| 188 |
+
|
| 189 |
+
Args:
|
| 190 |
+
observer: DatasetObserver to hook into (optional)
|
| 191 |
+
buffer_size: Max events to buffer for replay
|
| 192 |
+
log_dir: Directory for tape files (JSONL logs)
|
| 193 |
+
"""
|
| 194 |
+
self.observer = observer
|
| 195 |
+
self.buffer_size = buffer_size
|
| 196 |
+
|
| 197 |
+
# Event subscribers
|
| 198 |
+
self._handlers: List[Callable[[TraceEvent], None]] = []
|
| 199 |
+
self._async_handlers: List[Callable[[TraceEvent], Any]] = []
|
| 200 |
+
|
| 201 |
+
# Event buffer for replay/late subscribers
|
| 202 |
+
self._buffer: List[TraceEvent] = []
|
| 203 |
+
self._buffer_lock = Lock()
|
| 204 |
+
|
| 205 |
+
# Async queue for streaming
|
| 206 |
+
self._async_queue: Optional[asyncio.Queue] = None
|
| 207 |
+
|
| 208 |
+
# Current activity context
|
| 209 |
+
self._current_activity_id: Optional[str] = None
|
| 210 |
+
self._current_activity_name: Optional[str] = None
|
| 211 |
+
self._current_activity_type: Optional[str] = None
|
| 212 |
+
|
| 213 |
+
# Document context cache
|
| 214 |
+
self._document_cache: Dict[str, Dict[str, Any]] = {}
|
| 215 |
+
|
| 216 |
+
# === TAPE FILE FOR PLAYBACK ===
|
| 217 |
+
self._log_dir = Path(log_dir)
|
| 218 |
+
self._log_dir.mkdir(parents=True, exist_ok=True)
|
| 219 |
+
self._session_id = int(time.time())
|
| 220 |
+
self._tape_path = self._log_dir / f"unity_tape_{self._session_id}.jsonl"
|
| 221 |
+
self._tape_file = None
|
| 222 |
+
self._tape_lock = Lock()
|
| 223 |
+
self._event_count = 0
|
| 224 |
+
|
| 225 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 226 |
+
# SUBSCRIPTION
|
| 227 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 228 |
+
|
| 229 |
+
def on_event(self, handler: Callable[[TraceEvent], None]):
|
| 230 |
+
"""Subscribe to trace events (sync handler)."""
|
| 231 |
+
self._handlers.append(handler)
|
| 232 |
+
return self # Allow chaining
|
| 233 |
+
|
| 234 |
+
def on_event_async(self, handler: Callable[[TraceEvent], Any]):
|
| 235 |
+
"""Subscribe to trace events (async handler)."""
|
| 236 |
+
self._async_handlers.append(handler)
|
| 237 |
+
return self
|
| 238 |
+
|
| 239 |
+
def remove_handler(self, handler):
|
| 240 |
+
"""Unsubscribe a handler."""
|
| 241 |
+
if handler in self._handlers:
|
| 242 |
+
self._handlers.remove(handler)
|
| 243 |
+
if handler in self._async_handlers:
|
| 244 |
+
self._async_handlers.remove(handler)
|
| 245 |
+
|
| 246 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 247 |
+
# EVENT EMISSION
|
| 248 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 249 |
+
|
| 250 |
+
def emit(self, event: TraceEvent):
|
| 251 |
+
"""
|
| 252 |
+
Emit a trace event to all subscribers.
|
| 253 |
+
|
| 254 |
+
Called internally when provenance events occur.
|
| 255 |
+
Also writes to tape file for buffered playback!
|
| 256 |
+
"""
|
| 257 |
+
self._event_count += 1
|
| 258 |
+
|
| 259 |
+
# Add to buffer
|
| 260 |
+
with self._buffer_lock:
|
| 261 |
+
self._buffer.append(event)
|
| 262 |
+
if len(self._buffer) > self.buffer_size:
|
| 263 |
+
self._buffer.pop(0)
|
| 264 |
+
|
| 265 |
+
# === WRITE TO TAPE (JSONL) ===
|
| 266 |
+
self._write_to_tape(event)
|
| 267 |
+
|
| 268 |
+
# Call sync handlers
|
| 269 |
+
for handler in self._handlers:
|
| 270 |
+
try:
|
| 271 |
+
handler(event)
|
| 272 |
+
except Exception as e:
|
| 273 |
+
print(f"Handler error: {e}")
|
| 274 |
+
|
| 275 |
+
# Queue for async handlers
|
| 276 |
+
if self._async_queue:
|
| 277 |
+
try:
|
| 278 |
+
self._async_queue.put_nowait(event)
|
| 279 |
+
except asyncio.QueueFull:
|
| 280 |
+
pass # Drop if queue full
|
| 281 |
+
|
| 282 |
+
def _write_to_tape(self, event: TraceEvent):
|
| 283 |
+
"""Write event to tape file for later playback."""
|
| 284 |
+
try:
|
| 285 |
+
with self._tape_lock:
|
| 286 |
+
# Lazy open the file
|
| 287 |
+
if self._tape_file is None:
|
| 288 |
+
self._tape_file = open(self._tape_path, "a", encoding="utf-8")
|
| 289 |
+
print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
|
| 290 |
+
|
| 291 |
+
# Build tape record with full context
|
| 292 |
+
record = {
|
| 293 |
+
"seq": self._event_count,
|
| 294 |
+
"event": event.to_dict(),
|
| 295 |
+
"session_id": self._session_id,
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
json_line = json.dumps(record, default=str) + "\n"
|
| 299 |
+
self._tape_file.write(json_line)
|
| 300 |
+
self._tape_file.flush()
|
| 301 |
+
|
| 302 |
+
# Debug: Log first few events
|
| 303 |
+
if self._event_count <= 3:
|
| 304 |
+
print(f"[CASCADE] 📝 Wrote event {self._event_count} to tape: {event.event_type}")
|
| 305 |
+
except Exception as e:
|
| 306 |
+
# Don't let tape errors break the main flow
|
| 307 |
+
print(f"[CASCADE] ⚠️ Tape write error: {e}")
|
| 308 |
+
pass
|
| 309 |
+
|
| 310 |
+
def _write_raw_to_tape(self, record: Dict[str, Any]):
|
| 311 |
+
"""Write a raw record to tape file (for docspace events)."""
|
| 312 |
+
try:
|
| 313 |
+
with self._tape_lock:
|
| 314 |
+
# Lazy open the file
|
| 315 |
+
if self._tape_file is None:
|
| 316 |
+
self._tape_file = open(self._tape_path, "a", encoding="utf-8")
|
| 317 |
+
print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
|
| 318 |
+
|
| 319 |
+
self._tape_file.write(json.dumps(record, default=str) + "\n")
|
| 320 |
+
self._tape_file.flush()
|
| 321 |
+
except Exception:
|
| 322 |
+
pass
|
| 323 |
+
|
| 324 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 325 |
+
# DOCUMENT SPACE EVENTS (for polling iframe)
|
| 326 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 327 |
+
|
| 328 |
+
def emit_entity(self, entity_id: str, source: str, text: str, index: int, side: str = "a"):
|
| 329 |
+
"""
|
| 330 |
+
Emit an entity for Document Space visualization.
|
| 331 |
+
|
| 332 |
+
Args:
|
| 333 |
+
entity_id: Unique ID for the entity
|
| 334 |
+
source: Source dataset name
|
| 335 |
+
text: Preview text (truncated)
|
| 336 |
+
index: Row index in dataset
|
| 337 |
+
side: "a" or "b" to indicate which dataset
|
| 338 |
+
"""
|
| 339 |
+
self._event_count += 1
|
| 340 |
+
record = {
|
| 341 |
+
"seq": self._event_count,
|
| 342 |
+
"type": "docspace_entity",
|
| 343 |
+
"side": side,
|
| 344 |
+
"data": {
|
| 345 |
+
"id": entity_id,
|
| 346 |
+
"source": source,
|
| 347 |
+
"text": text[:200],
|
| 348 |
+
"index": index,
|
| 349 |
+
},
|
| 350 |
+
"session_id": self._session_id,
|
| 351 |
+
}
|
| 352 |
+
self._write_raw_to_tape(record)
|
| 353 |
+
|
| 354 |
+
def emit_match(self, doc_a_id: str, doc_b_id: str, score: float):
|
| 355 |
+
"""
|
| 356 |
+
Emit a match for Document Space visualization.
|
| 357 |
+
|
| 358 |
+
Args:
|
| 359 |
+
doc_a_id: ID of entity from dataset A
|
| 360 |
+
doc_b_id: ID of entity from dataset B
|
| 361 |
+
score: Similarity score (0-1)
|
| 362 |
+
"""
|
| 363 |
+
self._event_count += 1
|
| 364 |
+
record = {
|
| 365 |
+
"seq": self._event_count,
|
| 366 |
+
"type": "docspace_match",
|
| 367 |
+
"data": {
|
| 368 |
+
"docA": doc_a_id,
|
| 369 |
+
"docB": doc_b_id,
|
| 370 |
+
"score": float(score),
|
| 371 |
+
},
|
| 372 |
+
"session_id": self._session_id,
|
| 373 |
+
}
|
| 374 |
+
self._write_raw_to_tape(record)
|
| 375 |
+
|
| 376 |
+
def emit_phase(self, phase: str, progress: float, message: str = ""):
|
| 377 |
+
"""
|
| 378 |
+
Emit a phase update for Document Space.
|
| 379 |
+
|
| 380 |
+
Args:
|
| 381 |
+
phase: Current phase (embedding_a, embedding_b, comparing, complete)
|
| 382 |
+
progress: Progress 0-1
|
| 383 |
+
message: Status message
|
| 384 |
+
"""
|
| 385 |
+
self._event_count += 1
|
| 386 |
+
record = {
|
| 387 |
+
"seq": self._event_count,
|
| 388 |
+
"type": "docspace_phase",
|
| 389 |
+
"data": {
|
| 390 |
+
"phase": phase,
|
| 391 |
+
"progress": float(progress),
|
| 392 |
+
"message": message,
|
| 393 |
+
},
|
| 394 |
+
"session_id": self._session_id,
|
| 395 |
+
}
|
| 396 |
+
self._write_raw_to_tape(record)
|
| 397 |
+
|
| 398 |
+
def close_tape(self):
|
| 399 |
+
"""Close the tape file (call when session ends)."""
|
| 400 |
+
with self._tape_lock:
|
| 401 |
+
if self._tape_file:
|
| 402 |
+
self._tape_file.close()
|
| 403 |
+
self._tape_file = None
|
| 404 |
+
print(f"[CASCADE] 📼 Unity tape closed: {self._event_count} events → {self._tape_path}")
|
| 405 |
+
|
| 406 |
+
def get_tape_path(self) -> Optional[Path]:
|
| 407 |
+
"""Get the path to the current tape file (whether open or not)."""
|
| 408 |
+
return self._tape_path
|
| 409 |
+
|
| 410 |
+
@staticmethod
|
| 411 |
+
def load_tape(tape_path: str) -> List[Dict[str, Any]]:
|
| 412 |
+
"""
|
| 413 |
+
Load events from a tape file for playback.
|
| 414 |
+
|
| 415 |
+
Args:
|
| 416 |
+
tape_path: Path to the .jsonl tape file
|
| 417 |
+
|
| 418 |
+
Returns:
|
| 419 |
+
List of event records in chronological order
|
| 420 |
+
"""
|
| 421 |
+
events = []
|
| 422 |
+
with open(tape_path, "r", encoding="utf-8") as f:
|
| 423 |
+
for line in f:
|
| 424 |
+
line = line.strip()
|
| 425 |
+
if line:
|
| 426 |
+
try:
|
| 427 |
+
events.append(json.loads(line))
|
| 428 |
+
except json.JSONDecodeError:
|
| 429 |
+
pass # Skip malformed lines
|
| 430 |
+
return events
|
| 431 |
+
|
| 432 |
+
async def stream(self) -> Generator[TraceEvent, None, None]:
|
| 433 |
+
"""
|
| 434 |
+
Async generator for streaming events.
|
| 435 |
+
|
| 436 |
+
Usage:
|
| 437 |
+
async for event in tracer.stream():
|
| 438 |
+
await render(event)
|
| 439 |
+
"""
|
| 440 |
+
self._async_queue = asyncio.Queue(maxsize=self.buffer_size)
|
| 441 |
+
|
| 442 |
+
# Replay buffer first
|
| 443 |
+
with self._buffer_lock:
|
| 444 |
+
for event in self._buffer:
|
| 445 |
+
yield event
|
| 446 |
+
|
| 447 |
+
# Then stream new events
|
| 448 |
+
while True:
|
| 449 |
+
event = await self._async_queue.get()
|
| 450 |
+
yield event
|
| 451 |
+
|
| 452 |
+
def get_buffer(self) -> List[TraceEvent]:
|
| 453 |
+
"""Get buffered events for replay."""
|
| 454 |
+
with self._buffer_lock:
|
| 455 |
+
return list(self._buffer)
|
| 456 |
+
|
| 457 |
+
def clear_buffer(self):
|
| 458 |
+
"""Clear the event buffer."""
|
| 459 |
+
with self._buffer_lock:
|
| 460 |
+
self._buffer.clear()
|
| 461 |
+
|
| 462 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 463 |
+
# TRACING API - Call these to emit events
|
| 464 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 465 |
+
|
| 466 |
+
def start_activity(
|
| 467 |
+
self,
|
| 468 |
+
activity_id: str,
|
| 469 |
+
activity_name: str,
|
| 470 |
+
activity_type: str = "transform",
|
| 471 |
+
):
|
| 472 |
+
"""Signal start of an activity (for context)."""
|
| 473 |
+
self._current_activity_id = activity_id
|
| 474 |
+
self._current_activity_name = activity_name
|
| 475 |
+
self._current_activity_type = activity_type
|
| 476 |
+
|
| 477 |
+
self.emit(TraceEvent(
|
| 478 |
+
event_type=TraceEventType.ACTIVITY_STARTED,
|
| 479 |
+
activity_id=activity_id,
|
| 480 |
+
activity_name=activity_name,
|
| 481 |
+
activity_type=activity_type,
|
| 482 |
+
))
|
| 483 |
+
|
| 484 |
+
def end_activity(self, activity_id: str = None):
|
| 485 |
+
"""Signal end of an activity."""
|
| 486 |
+
self.emit(TraceEvent(
|
| 487 |
+
event_type=TraceEventType.ACTIVITY_COMPLETED,
|
| 488 |
+
activity_id=activity_id or self._current_activity_id,
|
| 489 |
+
activity_name=self._current_activity_name,
|
| 490 |
+
activity_type=self._current_activity_type,
|
| 491 |
+
))
|
| 492 |
+
self._current_activity_id = None
|
| 493 |
+
self._current_activity_name = None
|
| 494 |
+
self._current_activity_type = None
|
| 495 |
+
|
| 496 |
+
def report_progress(
|
| 497 |
+
self,
|
| 498 |
+
progress: float,
|
| 499 |
+
message: str = "",
|
| 500 |
+
activity_id: str = None,
|
| 501 |
+
):
|
| 502 |
+
"""Report progress on current activity."""
|
| 503 |
+
self.emit(TraceEvent(
|
| 504 |
+
event_type=TraceEventType.ACTIVITY_PROGRESS,
|
| 505 |
+
activity_id=activity_id or self._current_activity_id,
|
| 506 |
+
activity_name=self._current_activity_name,
|
| 507 |
+
progress=progress,
|
| 508 |
+
progress_message=message,
|
| 509 |
+
))
|
| 510 |
+
|
| 511 |
+
def touch_document(
|
| 512 |
+
self,
|
| 513 |
+
document_id: str,
|
| 514 |
+
document_name: str,
|
| 515 |
+
snippet: str = "",
|
| 516 |
+
field_name: str = "",
|
| 517 |
+
row_index: int = -1,
|
| 518 |
+
highlight_type: str = "default",
|
| 519 |
+
confidence: float = 1.0,
|
| 520 |
+
**metadata,
|
| 521 |
+
):
|
| 522 |
+
"""
|
| 523 |
+
Signal that the model touched a document/record.
|
| 524 |
+
|
| 525 |
+
This creates a highlight in the live view.
|
| 526 |
+
"""
|
| 527 |
+
span = DocumentSpan(
|
| 528 |
+
document_id=document_id,
|
| 529 |
+
document_name=document_name,
|
| 530 |
+
field_name=field_name,
|
| 531 |
+
row_index=row_index,
|
| 532 |
+
text=snippet,
|
| 533 |
+
highlight_type=highlight_type,
|
| 534 |
+
confidence=confidence,
|
| 535 |
+
metadata=metadata,
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
self.emit(TraceEvent(
|
| 539 |
+
event_type=TraceEventType.DOCUMENT_TOUCHED,
|
| 540 |
+
activity_id=self._current_activity_id,
|
| 541 |
+
activity_name=self._current_activity_name,
|
| 542 |
+
activity_type=self._current_activity_type,
|
| 543 |
+
spans=[span],
|
| 544 |
+
entity_id=document_id,
|
| 545 |
+
metadata=metadata,
|
| 546 |
+
))
|
| 547 |
+
|
| 548 |
+
return span
|
| 549 |
+
|
| 550 |
+
def highlight_span(
|
| 551 |
+
self,
|
| 552 |
+
document_id: str,
|
| 553 |
+
document_name: str,
|
| 554 |
+
text: str,
|
| 555 |
+
start_char: int = -1,
|
| 556 |
+
end_char: int = -1,
|
| 557 |
+
field_name: str = "",
|
| 558 |
+
row_index: int = -1,
|
| 559 |
+
highlight_type: str = "attention",
|
| 560 |
+
confidence: float = 1.0,
|
| 561 |
+
**metadata,
|
| 562 |
+
):
|
| 563 |
+
"""
|
| 564 |
+
Highlight a specific span within a document.
|
| 565 |
+
|
| 566 |
+
For showing exactly where in the text the model is focusing.
|
| 567 |
+
"""
|
| 568 |
+
span = DocumentSpan(
|
| 569 |
+
document_id=document_id,
|
| 570 |
+
document_name=document_name,
|
| 571 |
+
field_name=field_name,
|
| 572 |
+
row_index=row_index,
|
| 573 |
+
text=text,
|
| 574 |
+
start_char=start_char,
|
| 575 |
+
end_char=end_char,
|
| 576 |
+
highlight_type=highlight_type,
|
| 577 |
+
confidence=confidence,
|
| 578 |
+
metadata=metadata,
|
| 579 |
+
)
|
| 580 |
+
|
| 581 |
+
self.emit(TraceEvent(
|
| 582 |
+
event_type=TraceEventType.SPAN_HIGHLIGHTED,
|
| 583 |
+
activity_id=self._current_activity_id,
|
| 584 |
+
activity_name=self._current_activity_name,
|
| 585 |
+
activity_type=self._current_activity_type,
|
| 586 |
+
spans=[span],
|
| 587 |
+
metadata=metadata,
|
| 588 |
+
))
|
| 589 |
+
|
| 590 |
+
return span
|
| 591 |
+
|
| 592 |
+
def create_association(
|
| 593 |
+
self,
|
| 594 |
+
source_doc_id: str,
|
| 595 |
+
source_doc_name: str,
|
| 596 |
+
source_text: str,
|
| 597 |
+
target_doc_id: str,
|
| 598 |
+
target_doc_name: str,
|
| 599 |
+
target_text: str,
|
| 600 |
+
association_type: str = "related",
|
| 601 |
+
confidence: float = 1.0,
|
| 602 |
+
reason: str = "",
|
| 603 |
+
**metadata,
|
| 604 |
+
):
|
| 605 |
+
"""
|
| 606 |
+
Create an association between two document spans.
|
| 607 |
+
|
| 608 |
+
This is the "A connects to B" visualization.
|
| 609 |
+
"""
|
| 610 |
+
source = DocumentSpan(
|
| 611 |
+
document_id=source_doc_id,
|
| 612 |
+
document_name=source_doc_name,
|
| 613 |
+
text=source_text,
|
| 614 |
+
highlight_type="source",
|
| 615 |
+
confidence=confidence,
|
| 616 |
+
)
|
| 617 |
+
|
| 618 |
+
target = DocumentSpan(
|
| 619 |
+
document_id=target_doc_id,
|
| 620 |
+
document_name=target_doc_name,
|
| 621 |
+
text=target_text,
|
| 622 |
+
highlight_type="target",
|
| 623 |
+
confidence=confidence,
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
+
association = DocumentAssociation(
|
| 627 |
+
source=source,
|
| 628 |
+
target=target,
|
| 629 |
+
association_type=association_type,
|
| 630 |
+
confidence=confidence,
|
| 631 |
+
reason=reason,
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
self.emit(TraceEvent(
|
| 635 |
+
event_type=TraceEventType.ASSOCIATION_CREATED,
|
| 636 |
+
activity_id=self._current_activity_id,
|
| 637 |
+
activity_name=self._current_activity_name,
|
| 638 |
+
activity_type=self._current_activity_type,
|
| 639 |
+
spans=[source, target],
|
| 640 |
+
association=association,
|
| 641 |
+
metadata=metadata,
|
| 642 |
+
))
|
| 643 |
+
|
| 644 |
+
return association
|
| 645 |
+
|
| 646 |
+
def entity_created(
|
| 647 |
+
self,
|
| 648 |
+
entity_id: str,
|
| 649 |
+
entity_name: str,
|
| 650 |
+
record_count: int = None,
|
| 651 |
+
**metadata,
|
| 652 |
+
):
|
| 653 |
+
"""Signal that a new entity was created in provenance."""
|
| 654 |
+
self.emit(TraceEvent(
|
| 655 |
+
event_type=TraceEventType.ENTITY_CREATED,
|
| 656 |
+
activity_id=self._current_activity_id,
|
| 657 |
+
activity_name=self._current_activity_name,
|
| 658 |
+
entity_id=entity_id,
|
| 659 |
+
metadata={"name": entity_name, "record_count": record_count, **metadata},
|
| 660 |
+
))
|
| 661 |
+
|
| 662 |
+
def entity_derived(
|
| 663 |
+
self,
|
| 664 |
+
derived_id: str,
|
| 665 |
+
derived_name: str,
|
| 666 |
+
source_ids: List[str],
|
| 667 |
+
**metadata,
|
| 668 |
+
):
|
| 669 |
+
"""Signal that an entity was derived from others."""
|
| 670 |
+
self.emit(TraceEvent(
|
| 671 |
+
event_type=TraceEventType.ENTITY_DERIVED,
|
| 672 |
+
activity_id=self._current_activity_id,
|
| 673 |
+
activity_name=self._current_activity_name,
|
| 674 |
+
entity_id=derived_id,
|
| 675 |
+
metadata={"name": derived_name, "sources": source_ids, **metadata},
|
| 676 |
+
))
|
| 677 |
+
|
| 678 |
+
def link_created(
|
| 679 |
+
self,
|
| 680 |
+
source_id: str,
|
| 681 |
+
target_id: str,
|
| 682 |
+
relationship_type: str,
|
| 683 |
+
**metadata,
|
| 684 |
+
):
|
| 685 |
+
"""Signal that a provenance link was created."""
|
| 686 |
+
self.emit(TraceEvent(
|
| 687 |
+
event_type=TraceEventType.LINK_CREATED,
|
| 688 |
+
activity_id=self._current_activity_id,
|
| 689 |
+
activity_name=self._current_activity_name,
|
| 690 |
+
relationship_type=relationship_type,
|
| 691 |
+
metadata={"source": source_id, "target": target_id, **metadata},
|
| 692 |
+
))
|
| 693 |
+
|
| 694 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 695 |
+
# EXPORT (Freeze the live state)
|
| 696 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 697 |
+
|
| 698 |
+
def export_session(self) -> Dict[str, Any]:
|
| 699 |
+
"""
|
| 700 |
+
Export the trace session as frozen data.
|
| 701 |
+
|
| 702 |
+
This is the bridge between live and export -
|
| 703 |
+
same data, just frozen at a point in time.
|
| 704 |
+
"""
|
| 705 |
+
with self._buffer_lock:
|
| 706 |
+
return {
|
| 707 |
+
"events": [e.to_dict() for e in self._buffer],
|
| 708 |
+
"event_count": len(self._buffer),
|
| 709 |
+
"exported_at": time.time(),
|
| 710 |
+
}
|
| 711 |
+
|
| 712 |
+
def export_associations(self) -> List[Dict[str, Any]]:
|
| 713 |
+
"""Export just the associations for visualization."""
|
| 714 |
+
associations = []
|
| 715 |
+
with self._buffer_lock:
|
| 716 |
+
for event in self._buffer:
|
| 717 |
+
if event.association:
|
| 718 |
+
associations.append(event.association.to_dict())
|
| 719 |
+
return associations
|
| 720 |
+
|
| 721 |
+
def export_timeline(self) -> List[Dict[str, Any]]:
|
| 722 |
+
"""Export events as a timeline."""
|
| 723 |
+
timeline = []
|
| 724 |
+
with self._buffer_lock:
|
| 725 |
+
for event in self._buffer:
|
| 726 |
+
timeline.append({
|
| 727 |
+
"timestamp": event.timestamp,
|
| 728 |
+
"type": event.event_type.value,
|
| 729 |
+
"activity": event.activity_name,
|
| 730 |
+
"spans": len(event.spans),
|
| 731 |
+
"has_association": event.association is not None,
|
| 732 |
+
})
|
| 733 |
+
return timeline
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 737 |
+
# CONSOLE RENDERER - Simple text-based live view
|
| 738 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 739 |
+
|
| 740 |
+
class ConsoleTraceRenderer:
|
| 741 |
+
"""
|
| 742 |
+
Simple console renderer for live document traces.
|
| 743 |
+
|
| 744 |
+
Good for debugging and terminal-based workflows.
|
| 745 |
+
"""
|
| 746 |
+
|
| 747 |
+
def __init__(self, show_snippets: bool = True, max_snippet_len: int = 80):
|
| 748 |
+
self.show_snippets = show_snippets
|
| 749 |
+
self.max_snippet_len = max_snippet_len
|
| 750 |
+
|
| 751 |
+
def render(self, event: TraceEvent):
|
| 752 |
+
"""Render event to console."""
|
| 753 |
+
timestamp = time.strftime("%H:%M:%S", time.localtime(event.timestamp))
|
| 754 |
+
|
| 755 |
+
if event.event_type == TraceEventType.ACTIVITY_STARTED:
|
| 756 |
+
print(f"\n[{timestamp}] ▶ {event.activity_name} ({event.activity_type})")
|
| 757 |
+
print("─" * 60)
|
| 758 |
+
|
| 759 |
+
elif event.event_type == TraceEventType.ACTIVITY_COMPLETED:
|
| 760 |
+
print("─" * 60)
|
| 761 |
+
print(f"[{timestamp}] ✓ {event.activity_name} completed")
|
| 762 |
+
|
| 763 |
+
elif event.event_type == TraceEventType.ACTIVITY_PROGRESS:
|
| 764 |
+
pct = int((event.progress or 0) * 100)
|
| 765 |
+
bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
|
| 766 |
+
msg = event.progress_message or ""
|
| 767 |
+
print(f"\r[{timestamp}] [{bar}] {pct}% {msg}", end="", flush=True)
|
| 768 |
+
if pct >= 100:
|
| 769 |
+
print()
|
| 770 |
+
|
| 771 |
+
elif event.event_type == TraceEventType.DOCUMENT_TOUCHED:
|
| 772 |
+
for span in event.spans:
|
| 773 |
+
snippet = self._truncate(span.text)
|
| 774 |
+
print(f"[{timestamp}] 📄 {span.document_name}", end="")
|
| 775 |
+
if span.field_name:
|
| 776 |
+
print(f"[{span.field_name}]", end="")
|
| 777 |
+
if span.row_index >= 0:
|
| 778 |
+
print(f" row={span.row_index}", end="")
|
| 779 |
+
if self.show_snippets and snippet:
|
| 780 |
+
print(f"\n └─ \"{snippet}\"")
|
| 781 |
+
else:
|
| 782 |
+
print()
|
| 783 |
+
|
| 784 |
+
elif event.event_type == TraceEventType.SPAN_HIGHLIGHTED:
|
| 785 |
+
for span in event.spans:
|
| 786 |
+
snippet = self._truncate(span.text)
|
| 787 |
+
conf = f"{span.confidence:.0%}" if span.confidence < 1.0 else ""
|
| 788 |
+
print(f"[{timestamp}] 🔍 [{span.highlight_type}] {conf}")
|
| 789 |
+
if self.show_snippets and snippet:
|
| 790 |
+
print(f" └─ \"{snippet}\"")
|
| 791 |
+
|
| 792 |
+
elif event.event_type == TraceEventType.ASSOCIATION_CREATED:
|
| 793 |
+
assoc = event.association
|
| 794 |
+
if assoc:
|
| 795 |
+
src = self._truncate(assoc.source.text, 40)
|
| 796 |
+
tgt = self._truncate(assoc.target.text, 40)
|
| 797 |
+
print(f"[{timestamp}] 🔗 {assoc.association_type} ({assoc.confidence:.0%})")
|
| 798 |
+
print(f" ├─ \"{src}\"")
|
| 799 |
+
print(f" └─ \"{tgt}\"")
|
| 800 |
+
if assoc.reason:
|
| 801 |
+
print(f" ({assoc.reason})")
|
| 802 |
+
|
| 803 |
+
elif event.event_type == TraceEventType.ENTITY_CREATED:
|
| 804 |
+
name = event.metadata.get("name", event.entity_id)
|
| 805 |
+
count = event.metadata.get("record_count", "?")
|
| 806 |
+
print(f"[{timestamp}] ✦ Entity created: {name} ({count} records)")
|
| 807 |
+
|
| 808 |
+
elif event.event_type == TraceEventType.ENTITY_DERIVED:
|
| 809 |
+
name = event.metadata.get("name", event.entity_id)
|
| 810 |
+
sources = event.metadata.get("sources", [])
|
| 811 |
+
print(f"[{timestamp}] ⤵ Entity derived: {name} ← {len(sources)} sources")
|
| 812 |
+
|
| 813 |
+
def _truncate(self, text: str, max_len: int = None) -> str:
|
| 814 |
+
max_len = max_len or self.max_snippet_len
|
| 815 |
+
if not text:
|
| 816 |
+
return ""
|
| 817 |
+
text = text.replace("\n", " ").strip()
|
| 818 |
+
if len(text) > max_len:
|
| 819 |
+
return text[:max_len-3] + "..."
|
| 820 |
+
return text
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 824 |
+
# CONVENIENCE
|
| 825 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 826 |
+
|
| 827 |
+
def create_live_tracer(observer=None, console: bool = False) -> LiveDocumentTracer:
|
| 828 |
+
"""
|
| 829 |
+
Create a live document tracer.
|
| 830 |
+
|
| 831 |
+
Args:
|
| 832 |
+
observer: DatasetObserver to hook into
|
| 833 |
+
console: If True, attach console renderer
|
| 834 |
+
|
| 835 |
+
Returns:
|
| 836 |
+
Configured LiveDocumentTracer
|
| 837 |
+
"""
|
| 838 |
+
tracer = LiveDocumentTracer(observer)
|
| 839 |
+
|
| 840 |
+
if console:
|
| 841 |
+
renderer = ConsoleTraceRenderer()
|
| 842 |
+
tracer.on_event(renderer.render)
|
| 843 |
+
|
| 844 |
+
return tracer
|
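A short sketch of the tracer above in action: wire up the console renderer, emit a few events, then replay the session from its JSONL tape. Only APIs defined in this file are used; the activity/document IDs and snippet text are illustrative:

# Sketch only - IDs and text are made up.
from cascade.data.live import LiveDocumentTracer, create_live_tracer

tracer = create_live_tracer(console=True)  # attaches ConsoleTraceRenderer

tracer.start_activity("act:0001", "match_titles", activity_type="transform")
tracer.touch_document("doc:a1", "papers_a", snippet="Attention is all you need", row_index=0)
tracer.create_association(
    "doc:a1", "papers_a", "Attention is all you need",
    "doc:b7", "papers_b", "A survey of transformer architectures",
    association_type="match", confidence=0.92, reason="title similarity",
)
tracer.end_activity()
tracer.close_tape()

# Buffered playback from the tape written during the session
for record in LiveDocumentTracer.load_tape(str(tracer.get_tape_path())):
    print(record["seq"], record["event"]["event_type"])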
cascade/data/observer.py
ADDED
@@ -0,0 +1,666 @@
"""
|
| 2 |
+
Dataset Observer
|
| 3 |
+
|
| 4 |
+
The main interface for observing datasets.
|
| 5 |
+
Provides context managers for tracking ingest, transform, and consume operations.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import hashlib
|
| 9 |
+
import time
|
| 10 |
+
from contextlib import contextmanager
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from typing import Any, Callable, Dict, Generator, List, Optional, Union
|
| 13 |
+
|
| 14 |
+
from .entities import (
|
| 15 |
+
DatasetEntity, Activity, Agent, Relationship, RelationType,
|
| 16 |
+
ActivityType, AgentType, create_system_agent, create_model_agent, create_user_agent
|
| 17 |
+
)
|
| 18 |
+
from .provenance import ProvenanceGraph
|
| 19 |
+
from .schema import SchemaObserver, DatasetSchema, hash_content
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
|
| 23 |
+
class ObservationContext:
|
| 24 |
+
"""
|
| 25 |
+
Context for an ongoing observation.
|
| 26 |
+
|
| 27 |
+
Used within context managers to track inputs/outputs.
|
| 28 |
+
"""
|
| 29 |
+
activity: Activity
|
| 30 |
+
observer: "DatasetObserver"
|
| 31 |
+
|
| 32 |
+
_inputs: List[DatasetEntity] = field(default_factory=list)
|
| 33 |
+
_outputs: List[DatasetEntity] = field(default_factory=list)
|
| 34 |
+
|
| 35 |
+
def input(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
|
| 36 |
+
"""
|
| 37 |
+
Register an input dataset.
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
dataset: HuggingFace Dataset, DatasetDict, or entity ID
|
| 41 |
+
name: Optional name override
|
| 42 |
+
**kwargs: Additional entity attributes
|
| 43 |
+
|
| 44 |
+
Returns:
|
| 45 |
+
The created or retrieved DatasetEntity
|
| 46 |
+
"""
|
| 47 |
+
# If string, assume it's an existing entity ID
|
| 48 |
+
if isinstance(dataset, str):
|
| 49 |
+
entity = self.observer.graph.get_entity(dataset)
|
| 50 |
+
if entity:
|
| 51 |
+
self._inputs.append(entity)
|
| 52 |
+
self.activity.add_input(entity.id)
|
| 53 |
+
self.observer.graph.link_usage(self.activity.id, entity.id)
|
| 54 |
+
return entity
|
| 55 |
+
else:
|
| 56 |
+
raise ValueError(f"Entity not found: {dataset}")
|
| 57 |
+
|
| 58 |
+
# Otherwise, observe the dataset
|
| 59 |
+
entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
|
| 60 |
+
self._inputs.append(entity)
|
| 61 |
+
self.activity.add_input(entity.id)
|
| 62 |
+
self.observer.graph.link_usage(self.activity.id, entity.id)
|
| 63 |
+
|
| 64 |
+
return entity
|
| 65 |
+
|
| 66 |
+
def output(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
|
| 67 |
+
"""
|
| 68 |
+
Register an output dataset.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
dataset: HuggingFace Dataset, DatasetDict, or dict
|
| 72 |
+
name: Optional name override
|
| 73 |
+
**kwargs: Additional entity attributes
|
| 74 |
+
|
| 75 |
+
Returns:
|
| 76 |
+
The created DatasetEntity
|
| 77 |
+
"""
|
| 78 |
+
entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
|
| 79 |
+
self._outputs.append(entity)
|
| 80 |
+
self.activity.add_output(entity.id)
|
| 81 |
+
|
| 82 |
+
# Link generation
|
| 83 |
+
self.observer.graph.link_generation(entity.id, self.activity.id)
|
| 84 |
+
|
| 85 |
+
# Link derivation from all inputs
|
| 86 |
+
for input_entity in self._inputs:
|
| 87 |
+
self.observer.graph.link_derivation(entity.id, input_entity.id)
|
| 88 |
+
|
| 89 |
+
return entity
|
| 90 |
+
|
| 91 |
+
@property
|
| 92 |
+
def inputs(self) -> List[DatasetEntity]:
|
| 93 |
+
return self._inputs
|
| 94 |
+
|
| 95 |
+
@property
|
| 96 |
+
def outputs(self) -> List[DatasetEntity]:
|
| 97 |
+
return self._outputs
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class DatasetObserver:
|
| 101 |
+
"""
|
| 102 |
+
Observer for dataset operations.
|
| 103 |
+
|
| 104 |
+
Tracks:
|
| 105 |
+
- Dataset loading (ingest)
|
| 106 |
+
- Transformations (filter, map, join, etc.)
|
| 107 |
+
- Consumption (training, inference)
|
| 108 |
+
|
| 109 |
+
Example:
|
| 110 |
+
observer = DatasetObserver()
|
| 111 |
+
|
| 112 |
+
with observer.observe_ingest("squad") as ctx:
|
| 113 |
+
ds = load_dataset("squad")
|
| 114 |
+
ctx.output(ds)
|
| 115 |
+
|
| 116 |
+
with observer.observe_transform("filter_english") as ctx:
|
| 117 |
+
ctx.input(ds)
|
| 118 |
+
filtered = ds.filter(lambda x: x["lang"] == "en")
|
| 119 |
+
ctx.output(filtered)
|
| 120 |
+
|
| 121 |
+
chain = observer.export_provenance()
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
def __init__(
|
| 125 |
+
self,
|
| 126 |
+
name: str = "default",
|
| 127 |
+
agent: Agent = None,
|
| 128 |
+
):
|
| 129 |
+
"""
|
| 130 |
+
Initialize observer.
|
| 131 |
+
|
| 132 |
+
Args:
|
| 133 |
+
name: Name for the provenance graph
|
| 134 |
+
agent: Default agent for activities (defaults to graph's system agent)
|
| 135 |
+
"""
|
| 136 |
+
self.graph = ProvenanceGraph(name=name)
|
| 137 |
+
self.schema_observer = SchemaObserver()
|
| 138 |
+
|
| 139 |
+
# Use provided agent or the graph's default system agent
|
| 140 |
+
if agent:
|
| 141 |
+
self._default_agent = agent
|
| 142 |
+
self.graph.add_agent(agent)
|
| 143 |
+
else:
|
| 144 |
+
# Use the graph's already-created system agent
|
| 145 |
+
self._default_agent = self.graph._system_agent
|
| 146 |
+
|
| 147 |
+
# Entity counter for unique IDs
|
| 148 |
+
self._counter = 0
|
| 149 |
+
|
| 150 |
+
def _next_id(self, prefix: str) -> str:
|
| 151 |
+
"""Generate unique ID."""
|
| 152 |
+
self._counter += 1
|
| 153 |
+
return f"{prefix}:{int(time.time() * 1000)}:{self._counter:04d}"
|
| 154 |
+
|
| 155 |
+
# ═════════════════════════════════════════════════════��═════════════════════
|
| 156 |
+
# DATASET OBSERVATION
|
| 157 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 158 |
+
|
| 159 |
+
def observe_dataset(
|
| 160 |
+
self,
|
| 161 |
+
dataset,
|
| 162 |
+
name: str = None,
|
| 163 |
+
source_type: str = None,
|
| 164 |
+
source_uri: str = None,
|
| 165 |
+
version: str = None,
|
| 166 |
+
license_id: str = None,
|
| 167 |
+
license_url: str = None,
|
| 168 |
+
**kwargs,
|
| 169 |
+
) -> DatasetEntity:
|
| 170 |
+
"""
|
| 171 |
+
Observe a dataset and create an entity.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
dataset: HuggingFace Dataset, DatasetDict, DataFrame, or dict
|
| 175 |
+
name: Name for the entity
|
| 176 |
+
source_type: Type of source (hf_hub, local, etc.)
|
| 177 |
+
source_uri: URI of the source
|
| 178 |
+
version: Version string
|
| 179 |
+
license_id: SPDX license identifier (e.g., "MIT", "CC-BY-4.0")
|
| 180 |
+
license_url: URL to the license text
|
| 181 |
+
**kwargs: Additional attributes
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
DatasetEntity representing the dataset
|
| 185 |
+
"""
|
| 186 |
+
# Infer name if not provided
|
| 187 |
+
if name is None:
|
| 188 |
+
if hasattr(dataset, 'info') and hasattr(dataset.info, 'dataset_name'):
|
| 189 |
+
name = dataset.info.dataset_name
|
| 190 |
+
elif hasattr(dataset, 'config_name'):
|
| 191 |
+
name = dataset.config_name
|
| 192 |
+
else:
|
| 193 |
+
name = f"dataset_{self._counter + 1}"
|
| 194 |
+
|
| 195 |
+
# Try to extract license from HuggingFace dataset info
|
| 196 |
+
if license_id is None and hasattr(dataset, 'info'):
|
| 197 |
+
info = dataset.info
|
| 198 |
+
if hasattr(info, 'license') and info.license:
|
| 199 |
+
license_id = info.license
|
| 200 |
+
|
| 201 |
+
# Observe schema
|
| 202 |
+
schema = self._observe_schema(dataset)
|
| 203 |
+
|
| 204 |
+
# Compute content hash
|
| 205 |
+
content_hash = self._compute_content_hash(dataset)
|
| 206 |
+
|
| 207 |
+
# Get record count and splits
|
| 208 |
+
record_count, splits = self._get_counts(dataset)
|
| 209 |
+
|
| 210 |
+
# Infer source
|
| 211 |
+
if source_type is None:
|
| 212 |
+
source_type = self._infer_source_type(dataset)
|
| 213 |
+
|
| 214 |
+
# Create entity
|
| 215 |
+
entity = DatasetEntity(
|
| 216 |
+
id=self._next_id("entity"),
|
| 217 |
+
name=name,
|
| 218 |
+
content_hash=content_hash,
|
| 219 |
+
schema_hash=schema.hash() if schema else None,
|
| 220 |
+
version=version,
|
| 221 |
+
source_type=source_type,
|
| 222 |
+
source_uri=source_uri,
|
| 223 |
+
license_id=license_id,
|
| 224 |
+
license_url=license_url,
|
| 225 |
+
record_count=record_count,
|
| 226 |
+
splits=splits,
|
| 227 |
+
attributes={
|
| 228 |
+
"schema": schema.to_dict() if schema else None,
|
| 229 |
+
**kwargs,
|
| 230 |
+
},
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
# Add to graph
|
| 234 |
+
self.graph.add_entity(entity)
|
| 235 |
+
|
| 236 |
+
return entity
|
| 237 |
+
|
| 238 |
+
def register_agent(self, name: str, agent_type: str = "software", version: str = None) -> Agent:
|
| 239 |
+
"""
|
| 240 |
+
Register a new agent in the provenance graph.
|
| 241 |
+
|
| 242 |
+
Args:
|
| 243 |
+
name: Name of the agent
|
| 244 |
+
agent_type: Type of agent (software, model, person, etc.)
|
| 245 |
+
version: Optional version string
|
| 246 |
+
|
| 247 |
+
Returns:
|
| 248 |
+
The created Agent
|
| 249 |
+
"""
|
| 250 |
+
if agent_type == "model":
|
| 251 |
+
agent = create_model_agent(name, version=version)
|
| 252 |
+
elif agent_type == "system":
|
| 253 |
+
agent = create_system_agent(name, version=version)
|
| 254 |
+
elif agent_type == "person":
|
| 255 |
+
agent = create_user_agent(name)
|
| 256 |
+
else:
|
| 257 |
+
# Default to software agent or generic
|
| 258 |
+
try:
|
| 259 |
+
type_enum = AgentType(agent_type)
|
| 260 |
+
except ValueError:
|
| 261 |
+
type_enum = AgentType.SOFTWARE
|
| 262 |
+
|
| 263 |
+
agent = Agent(
|
| 264 |
+
id=f"agent:{type_enum.value}:{name.replace(' ', '_').lower()}",
|
| 265 |
+
agent_type=type_enum,
|
| 266 |
+
name=name,
|
| 267 |
+
version=version
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
self.graph.add_agent(agent)
|
| 271 |
+
return agent
|
| 272 |
+
|

    def _observe_schema(self, dataset) -> Optional[DatasetSchema]:
        """Extract schema from dataset."""
        try:
            # HuggingFace Dataset
            if hasattr(dataset, 'features'):
                return self.schema_observer.observe_hf_dataset(dataset)

            # Pandas DataFrame
            if hasattr(dataset, 'dtypes') and hasattr(dataset, 'columns'):
                return self.schema_observer.observe_pandas(dataset)

            # Dict
            if isinstance(dataset, dict):
                # Check if it's columnar (dict of lists)
                if all(isinstance(v, list) for v in dataset.values()):
                    return self.schema_observer.observe_dict(dataset)

            return None
        except Exception as e:
            # Don't fail observation if schema extraction fails
            print(f"Warning: Could not extract schema: {e}")
            return None

    def _compute_content_hash(self, dataset) -> str:
        """Compute content hash of dataset."""
        try:
            return hash_content(dataset)
        except Exception:
            # Fallback to timestamp-based hash
            return hashlib.sha256(str(time.time()).encode()).hexdigest()

    def _get_counts(self, dataset) -> tuple:
        """Get record count and split counts."""
        record_count = None
        splits = {}

        try:
            # HuggingFace DatasetDict
            if hasattr(dataset, 'keys') and hasattr(dataset, '__getitem__'):
                for split_name in dataset.keys():
                    split_ds = dataset[split_name]
                    if hasattr(split_ds, '__len__'):
                        splits[split_name] = len(split_ds)
                record_count = sum(splits.values()) if splits else None

            # Single dataset
            elif hasattr(dataset, '__len__'):
                record_count = len(dataset)

        except Exception:
            pass

        return record_count, splits

    def _infer_source_type(self, dataset) -> str:
        """Infer source type from dataset."""
        # HuggingFace Dataset
        if hasattr(dataset, '_info'):
            return "hf_dataset"

        # Pandas
        if hasattr(dataset, 'dtypes'):
            return "pandas"

        # Dict
        if isinstance(dataset, dict):
            return "dict"

        return "unknown"

    # ═══════════════════════════════════════════════════════════════════════════
    # CONTEXT MANAGERS
    # ═══════════════════════════════════════════════════════════════════════════

    @contextmanager
    def observe_ingest(
        self,
        name: str,
        source_uri: str = None,
        agent: Agent = None,
        **kwargs,
    ) -> Generator[ObservationContext, None, None]:
        """
        Observe a dataset ingest operation.

        Args:
            name: Name of the ingest operation
            source_uri: URI of the data source
            agent: Agent performing the ingest
            **kwargs: Additional activity parameters

        Yields:
            ObservationContext for registering inputs/outputs

        Example:
            with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
                ds = load_dataset("squad")
                ctx.output(ds, name="squad")
        """
        activity = Activity(
            id=self._next_id("activity"),
            activity_type=ActivityType.INGEST,
            name=name,
            agent_id=(agent or self._default_agent).id,
            parameters={"source_uri": source_uri, **kwargs},
        )
        activity.start()

        ctx = ObservationContext(activity=activity, observer=self)

        try:
            yield ctx
        finally:
            activity.end()
            self.graph.add_activity(activity)
            self.graph.link_association(activity.id, activity.agent_id)

    @contextmanager
    def observe_transform(
        self,
        name: str,
        transform_type: str = None,
        agent: Agent = None,
        **kwargs,
    ) -> Generator[ObservationContext, None, None]:
        """
        Observe a dataset transformation.

        Args:
            name: Name of the transform
            transform_type: Type of transform (filter, map, join, etc.)
            agent: Agent performing the transform
            **kwargs: Additional activity parameters

        Yields:
            ObservationContext for registering inputs/outputs

        Example:
            with observer.observe_transform("filter_english") as ctx:
                ctx.input(ds)
                filtered = ds.filter(lambda x: x["lang"] == "en")
                ctx.output(filtered)
        """
        activity = Activity(
            id=self._next_id("activity"),
            activity_type=ActivityType.TRANSFORM,
            name=name,
            agent_id=(agent or self._default_agent).id,
            parameters={"transform_type": transform_type, **kwargs},
        )
        activity.start()

        ctx = ObservationContext(activity=activity, observer=self)

        try:
            yield ctx
        finally:
            activity.end()
            self.graph.add_activity(activity)
            self.graph.link_association(activity.id, activity.agent_id)

    @contextmanager
    def observe_consume(
        self,
        name: str,
        model_id: str = None,
        consume_type: str = "train",
        agent: Agent = None,
        **kwargs,
    ) -> Generator[ObservationContext, None, None]:
        """
        Observe dataset consumption (training, inference).

        Args:
            name: Name of the consumption operation
            model_id: ID of the model consuming the data
            consume_type: Type of consumption (train, evaluate, inference)
            agent: Agent performing the consumption
            **kwargs: Additional activity parameters

        Yields:
            ObservationContext for registering inputs/outputs

        Example:
            with observer.observe_consume("train_qa_model", model_id="bert-base") as ctx:
                ctx.input(train_ds)
                model = train(train_ds)
                # Model provenance now links to data provenance!
        """
        # Create model agent if model_id provided
        if model_id and agent is None:
            agent = create_model_agent(model_id)
            self.graph.add_agent(agent)

        activity_type = {
            "train": ActivityType.TRAIN,
            "evaluate": ActivityType.EVALUATE,
            "inference": ActivityType.INFERENCE,
        }.get(consume_type, ActivityType.TRAIN)

        activity = Activity(
            id=self._next_id("activity"),
            activity_type=activity_type,
            name=name,
            agent_id=(agent or self._default_agent).id,
            parameters={"model_id": model_id, "consume_type": consume_type, **kwargs},
        )
        activity.start()

        ctx = ObservationContext(activity=activity, observer=self)

        try:
            yield ctx
        finally:
            activity.end()
            self.graph.add_activity(activity)
            self.graph.link_association(activity.id, activity.agent_id)

    @contextmanager
    def observe_entity_resolution(
        self,
        name: str,
        model_id: str = None,
        threshold: float = None,
        agent: Agent = None,
        **kwargs,
    ) -> Generator[ObservationContext, None, None]:
        """
        Observe entity resolution / data unity operation.

        Args:
            name: Name of the operation
            model_id: Embedding model used
            threshold: Similarity threshold
            agent: Agent performing the operation
            **kwargs: Additional parameters

        Example:
            with observer.observe_entity_resolution("match_patients_claims") as ctx:
                ctx.input(patients_ds)
                ctx.input(claims_ds)
                unified = run_unity(patients_ds, claims_ds)
                ctx.output(unified)
        """
        if model_id and agent is None:
            agent = create_model_agent(model_id)
            self.graph.add_agent(agent)

        activity = Activity(
            id=self._next_id("activity"),
            activity_type=ActivityType.ENTITY_RESOLUTION,
            name=name,
            agent_id=(agent or self._default_agent).id,
            parameters={
                "model_id": model_id,
                "threshold": threshold,
                **kwargs,
            },
        )
        activity.start()

        ctx = ObservationContext(activity=activity, observer=self)

        try:
            yield ctx
        finally:
            activity.end()
            self.graph.add_activity(activity)
            self.graph.link_association(activity.id, activity.agent_id)
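
    # Pipeline sketch (illustrative; not part of the original commit): chaining
    # the context managers above so lineage flows ingest -> transform -> train,
    # assuming HuggingFace `load_dataset` and an `observer` instance:
    #
    #     with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
    #         ds = load_dataset("squad")
    #         ctx.output(ds, name="squad")
    #
    #     with observer.observe_transform("filter_short", transform_type="filter") as ctx:
    #         ctx.input(ds)
    #         short = ds.filter(lambda x: len(x["question"]) < 100)
    #         ctx.output(short, name="squad-short")
    #
    #     with observer.observe_consume("train_qa", model_id="bert-base") as ctx:
    #         ctx.input(short)
    #         # ... training happens here; the TRAIN activity is linked to a
    #         # model agent created from model_id.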

    # ═══════════════════════════════════════════════════════════════════════════
    # EXPORT
    # ═══════════════════════════════════════════════════════════════════════════

    def export_provenance(self) -> ProvenanceGraph:
        """Export the provenance graph."""
        return self.graph

    def to_dict(self) -> Dict[str, Any]:
        """Export observation state to dictionary."""
        return {
            "graph": self.graph.to_dict(),
            "counter": self._counter,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DatasetObserver":
        """Load observer from dictionary."""
        observer = cls()
        observer.graph = ProvenanceGraph.from_dict(data["graph"])
        observer._counter = data.get("counter", 0)
        return observer

    # ═══════════════════════════════════════════════════════════════════════════
    # STATISTICS
    # ═══════════════════════════════════════════════════════════════════════════

    @property
    def stats(self) -> Dict[str, Any]:
        """Get observer statistics."""
        return {
            "graph": self.graph.stats,
            "root_hash": self.graph.root_hash,
        }

    # ═══════════════════════════════════════════════════════════════════════════
    # LICENSE TRACKING
    # ═══════════════════════════════════════════════════════════════════════════

    def check_license_compatibility(
        self,
        entity_ids: List[str],
        target_license: str = None,
    ):
        """
        Check license compatibility for deriving from entities.

        Args:
            entity_ids: List of source entity IDs
            target_license: Intended SPDX license for derived work

        Returns:
            LicenseCompatibility result

        Example:
            result = observer.check_license_compatibility(
                ["entity:123", "entity:456"],
                target_license="MIT"
            )
            if not result.compatible:
                print(f"Issues: {result.issues}")
        """
        from .license import check_license_compatibility

        sources = []
        for entity_id in entity_ids:
            entity = self.graph.get_entity(entity_id)
            if entity:
                license_id = entity.license_id or "unknown"
                sources.append((entity_id, license_id))

        return check_license_compatibility(sources, target_license)

    def get_derived_license(self, entity_ids: List[str]):
        """
        Get the appropriate license for a work derived from entities.

        Args:
            entity_ids: List of source entity IDs

        Returns:
            SPDXLicense for the derived work
        """
        from .license import get_derived_license

        licenses = []
        for entity_id in entity_ids:
            entity = self.graph.get_entity(entity_id)
            if entity and entity.license_id:
                licenses.append(entity.license_id)

        return get_derived_license(licenses) if licenses else None

    def generate_attribution(self, entity_ids: List[str] = None) -> str:
        """
        Generate attribution text for entities.

        Args:
            entity_ids: List of entity IDs (defaults to all entities)

        Returns:
            Markdown attribution text
        """
        from .license import LicenseAnalyzer

        analyzer = LicenseAnalyzer()

        if entity_ids is None:
            entities = self.graph.list_entities()
        else:
            entities = [
                self.graph.get_entity(eid) for eid in entity_ids
                if self.graph.get_entity(eid)
            ]

        sources = [
            (e.id, e.license_id or "unknown", e.name)
            for e in entities
        ]

        return analyzer.generate_attribution(sources)

    def __repr__(self) -> str:
        return f"DatasetObserver({self.graph})"
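
A quick usage sketch of the license-tracking API above (illustrative; it assumes two datasets registered through register_dataset with the parameters the method visibly accepts, and the result shapes described in the docstrings):

    observer = DatasetObserver()
    a = observer.register_dataset(ds_a, name="corpus-a", license_id="CC-BY-4.0")
    b = observer.register_dataset(ds_b, name="corpus-b", license_id="MIT")

    compat = observer.check_license_compatibility([a.id, b.id], target_license="MIT")
    if not compat.compatible:
        print(f"Issues: {compat.issues}")

    print(observer.get_derived_license([a.id, b.id]))   # SPDX license for the derived work
    print(observer.generate_attribution([a.id, b.id]))  # Markdown attribution text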
cascade/data/pii.py
ADDED
@@ -0,0 +1,748 @@
"""
PII Detection for CASCADE

Industry standard PII (Personally Identifiable Information) detection
based on Microsoft Presidio patterns and common PII taxonomies.

References:
- Microsoft Presidio: https://github.com/microsoft/presidio
- NIST PII Guide: https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-122.pdf
- GDPR Article 4 (personal data definition)

PII Categories:
1. Direct Identifiers: Name, SSN, passport, driver's license
2. Quasi-Identifiers: Age, ZIP code, gender, dates
3. Sensitive Data: Health, financial, biometric

Detection Methods:
- Regex patterns (fast, high precision for structured PII)
- Context-aware detection (surrounding words improve accuracy)
- Checksum validation (SSN, credit cards, etc.)
"""

import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Pattern, Set, Tuple


class PIIType(Enum):
    """Types of PII that can be detected."""
    # Direct Identifiers
    PERSON_NAME = "PERSON_NAME"
    EMAIL = "EMAIL"
    PHONE_NUMBER = "PHONE_NUMBER"
    SSN = "SSN"  # Social Security Number
    CREDIT_CARD = "CREDIT_CARD"
    IBAN = "IBAN"  # International Bank Account Number
    IP_ADDRESS = "IP_ADDRESS"
    MAC_ADDRESS = "MAC_ADDRESS"
    PASSPORT = "PASSPORT"
    DRIVERS_LICENSE = "DRIVERS_LICENSE"

    # Quasi-Identifiers
    DATE_OF_BIRTH = "DATE_OF_BIRTH"
    AGE = "AGE"
    ZIPCODE = "ZIPCODE"
    ADDRESS = "ADDRESS"

    # Sensitive Data
    MEDICAL_RECORD = "MEDICAL_RECORD"
    API_KEY = "API_KEY"
    AWS_KEY = "AWS_KEY"
    PASSWORD = "PASSWORD"
    CRYPTO_WALLET = "CRYPTO_WALLET"

    # Location
    GPS_COORDINATES = "GPS_COORDINATES"

    # URLs and IDs
    URL = "URL"
    USERNAME = "USERNAME"


class PIISeverity(Enum):
    """Severity levels for PII findings."""
    CRITICAL = "critical"  # Direct identifier, immediate re-identification risk
    HIGH = "high"          # Sensitive data, significant privacy risk
    MEDIUM = "medium"      # Quasi-identifier, re-identification when combined
    LOW = "low"            # Minimal risk, contextual sensitivity


@dataclass
class PIIMatch:
    """A detected PII instance."""
    pii_type: PIIType
    severity: PIISeverity
    value: str         # The matched text (may be redacted for display)
    start: int         # Start position in text
    end: int           # End position in text
    confidence: float  # 0.0 to 1.0
    context: str = ""      # Surrounding text for context
    field_name: str = ""   # Column/field where found
    row_index: int = -1    # Row index if applicable

    def to_dict(self) -> Dict[str, Any]:
        return {
            "type": self.pii_type.value,
            "severity": self.severity.value,
            "value_preview": self._redact(self.value),
            "start": self.start,
            "end": self.end,
            "confidence": self.confidence,
            "field_name": self.field_name,
            "row_index": self.row_index,
        }

    def _redact(self, value: str, show_chars: int = 4) -> str:
        """Partially redact the value for display."""
        if len(value) <= show_chars:
            return "*" * len(value)
        return value[:show_chars] + "*" * (len(value) - show_chars)


@dataclass
class PIIPattern:
    """A pattern for detecting PII."""
    pii_type: PIIType
    severity: PIISeverity
    pattern: Pattern
    confidence: float = 0.85
    validator: Optional[Callable[[str], bool]] = None  # Additional validation
    context_patterns: List[str] = field(default_factory=list)  # Boost confidence if context matches


@dataclass
class PIIScanResult:
    """Result of scanning content for PII."""
    total_matches: int = 0
    matches_by_type: Dict[str, int] = field(default_factory=dict)
    matches_by_severity: Dict[str, int] = field(default_factory=dict)
    matches_by_field: Dict[str, int] = field(default_factory=dict)
    sample_matches: List[PIIMatch] = field(default_factory=list)  # First N matches
    fields_with_pii: Set[str] = field(default_factory=set)
    high_risk_fields: Set[str] = field(default_factory=set)  # Fields with CRITICAL/HIGH PII

    def to_dict(self) -> Dict[str, Any]:
        return {
            "total_matches": self.total_matches,
            "matches_by_type": self.matches_by_type,
            "matches_by_severity": self.matches_by_severity,
            "matches_by_field": self.matches_by_field,
            "fields_with_pii": list(self.fields_with_pii),
            "high_risk_fields": list(self.high_risk_fields),
            "sample_matches": [m.to_dict() for m in self.sample_matches[:10]],
        }

    def has_critical_pii(self) -> bool:
        """Check if any critical PII was found."""
        return self.matches_by_severity.get("critical", 0) > 0

    def has_high_risk_pii(self) -> bool:
        """Check if any high-risk PII was found."""
        return (
            self.matches_by_severity.get("critical", 0) > 0 or
            self.matches_by_severity.get("high", 0) > 0
        )

    @property
    def summary(self) -> str:
        """Human-readable summary."""
        if self.total_matches == 0:
            return "No PII detected"

        lines = [f"Found {self.total_matches} PII instance(s):"]
        for sev in ["critical", "high", "medium", "low"]:
            count = self.matches_by_severity.get(sev, 0)
            if count > 0:
                lines.append(f"  • {sev.upper()}: {count}")

        if self.high_risk_fields:
            lines.append(f"  ⚠ High-risk fields: {', '.join(self.high_risk_fields)}")

        return "\n".join(lines)


# ═══════════════════════════════════════════════════════════════════════════════
# VALIDATION FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════════════

def validate_luhn(card_number: str) -> bool:
    """
    Validate credit card using Luhn algorithm.

    Used by Visa, MasterCard, American Express, etc.
    """
    digits = [int(d) for d in re.sub(r'\D', '', card_number)]
    if len(digits) < 13 or len(digits) > 19:
        return False

    # Luhn checksum
    checksum = 0
    for i, digit in enumerate(reversed(digits)):
        if i % 2 == 1:
            digit *= 2
            if digit > 9:
                digit -= 9
        checksum += digit

    return checksum % 10 == 0
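
# Worked example (illustrative; not part of the original file): for the common
# test number 4111 1111 1111 1111, doubling every second digit from the right
# (and subtracting 9 when a doubled digit exceeds 9) yields a checksum of 30;
# 30 % 10 == 0, so validate_luhn("4111 1111 1111 1111") is True, while changing
# the last digit breaks the checksum and yields False.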


def validate_ssn(ssn: str) -> bool:
    """
    Validate US Social Security Number format.

    SSN format: AAA-BB-CCCC
    - AAA: Area number (001-899, excluding 666)
    - BB: Group number (01-99)
    - CCCC: Serial number (0001-9999)
    """
    clean = re.sub(r'\D', '', ssn)
    if len(clean) != 9:
        return False

    area = int(clean[:3])
    group = int(clean[3:5])
    serial = int(clean[5:])

    # Invalid patterns
    if area == 0 or area == 666 or area >= 900:
        return False
    if group == 0:
        return False
    if serial == 0:
        return False

    # Known invalid SSNs (advertising, testing)
    invalid_ssns = {
        "078051120",  # Woolworth promotional
        "219099999",  # Advertising
    }
    if clean in invalid_ssns:
        return False

    return True


def validate_iban(iban: str) -> bool:
    """
    Validate IBAN using MOD-97 checksum.
    """
    clean = re.sub(r'\s', '', iban).upper()
    if len(clean) < 15 or len(clean) > 34:
        return False

    # Move country code and check digits to end
    rearranged = clean[4:] + clean[:4]

    # Convert letters to numbers (A=10, B=11, etc.)
    numeric = ""
    for char in rearranged:
        if char.isdigit():
            numeric += char
        else:
            numeric += str(ord(char) - ord('A') + 10)

    # MOD 97 check
    return int(numeric) % 97 == 1


# ═══════════════════════════════════════════════════════════════════════════════
# PII PATTERNS (Based on Microsoft Presidio)
# ═══════════════════════════════════════════════════════════════════════════════

PII_PATTERNS: List[PIIPattern] = [
    # Email - RFC 5322 simplified
    PIIPattern(
        pii_type=PIIType.EMAIL,
        severity=PIISeverity.HIGH,
        pattern=re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
            re.IGNORECASE
        ),
        confidence=0.95,
        context_patterns=["email", "e-mail", "contact", "mail"],
    ),

    # Phone Number - International formats
    PIIPattern(
        pii_type=PIIType.PHONE_NUMBER,
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(
            r'''
            (?:
                \+?1?[-.\s]?                            # Country code
                \(?[2-9]\d{2}\)?[-.\s]?                 # Area code
                [2-9]\d{2}[-.\s]?                       # Exchange
                \d{4}                                   # Subscriber
                |
                \+?\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?   # International
                \d{1,4}[-.\s]?\d{1,9}
            )
            ''',
            re.VERBOSE
        ),
        confidence=0.75,
        context_patterns=["phone", "tel", "mobile", "cell", "call", "fax"],
    ),

    # SSN - US Social Security Number
    PIIPattern(
        pii_type=PIIType.SSN,
        severity=PIISeverity.CRITICAL,
        pattern=re.compile(
            r'\b(?!000|666|9\d{2})\d{3}[-\s]?(?!00)\d{2}[-\s]?(?!0000)\d{4}\b'
        ),
        confidence=0.85,
        validator=validate_ssn,
        context_patterns=["ssn", "social security", "tax id", "taxpayer"],
    ),

    # Credit Card - Major card formats
    PIIPattern(
        pii_type=PIIType.CREDIT_CARD,
        severity=PIISeverity.CRITICAL,
        pattern=re.compile(
            r'''
            \b(?:
                4[0-9]{12}(?:[0-9]{3})?         # Visa
                |
                5[1-5][0-9]{14}                 # MasterCard
                |
                3[47][0-9]{13}                  # American Express
                |
                6(?:011|5[0-9]{2})[0-9]{12}     # Discover
                |
                (?:2131|1800|35\d{3})\d{11}     # JCB
            )\b
            |
            \b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b   # Spaced format
            ''',
            re.VERBOSE
        ),
        confidence=0.90,
        validator=validate_luhn,
        context_patterns=["card", "credit", "visa", "mastercard", "amex", "payment"],
    ),

    # IP Address - IPv4
    PIIPattern(
        pii_type=PIIType.IP_ADDRESS,
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(
            r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
        ),
        confidence=0.90,
        context_patterns=["ip", "address", "server", "host", "client"],
    ),

    # IP Address - IPv6
    PIIPattern(
        pii_type=PIIType.IP_ADDRESS,
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(
            r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
        ),
        confidence=0.90,
    ),

    # MAC Address
    PIIPattern(
        pii_type=PIIType.MAC_ADDRESS,
        severity=PIISeverity.LOW,
        pattern=re.compile(
            r'\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b'
        ),
        confidence=0.95,
    ),

    # IBAN - International Bank Account Number
    PIIPattern(
        pii_type=PIIType.IBAN,
        severity=PIISeverity.CRITICAL,
        pattern=re.compile(
            r'\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}(?:[A-Z0-9]?){0,16}\b',
            re.IGNORECASE
        ),
        confidence=0.85,
        validator=validate_iban,
        context_patterns=["iban", "bank", "account", "transfer"],
    ),

    # API Key patterns
    PIIPattern(
        pii_type=PIIType.API_KEY,
        severity=PIISeverity.CRITICAL,
        pattern=re.compile(
            r'''
            (?:
                sk[-_]live[-_][a-zA-Z0-9]{24,}               # Stripe
                |
                sk[-_]test[-_][a-zA-Z0-9]{24,}               # Stripe test
                |
                pk[-_]live[-_][a-zA-Z0-9]{24,}               # Stripe public
                |
                ghp_[a-zA-Z0-9]{36}                          # GitHub PAT
                |
                gho_[a-zA-Z0-9]{36}                          # GitHub OAuth
                |
                github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}   # GitHub fine-grained
                |
                xox[baprs]-[a-zA-Z0-9-]{10,}                 # Slack
                |
                ya29\.[a-zA-Z0-9_-]+                         # Google OAuth
            )
            ''',
            re.VERBOSE
        ),
        confidence=0.95,
        context_patterns=["api", "key", "token", "secret", "auth"],
    ),

    # AWS Access Key
    PIIPattern(
        pii_type=PIIType.AWS_KEY,
        severity=PIISeverity.CRITICAL,
        pattern=re.compile(
            r'\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b'
        ),
        confidence=0.95,
        context_patterns=["aws", "amazon", "key", "access"],
    ),

    # Crypto Wallet - Bitcoin
    PIIPattern(
        pii_type=PIIType.CRYPTO_WALLET,
        severity=PIISeverity.HIGH,
        pattern=re.compile(
            r'\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b'
        ),
        confidence=0.80,
        context_patterns=["bitcoin", "btc", "wallet", "crypto"],
    ),

    # Crypto Wallet - Ethereum
    PIIPattern(
        pii_type=PIIType.CRYPTO_WALLET,
        severity=PIISeverity.HIGH,
        pattern=re.compile(
            r'\b0x[a-fA-F0-9]{40}\b'
        ),
        confidence=0.80,
        context_patterns=["ethereum", "eth", "wallet", "crypto"],
    ),

    # GPS Coordinates
    PIIPattern(
        pii_type=PIIType.GPS_COORDINATES,
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(
            r'[-+]?(?:[1-8]?\d(?:\.\d+)?|90(?:\.0+)?)\s*,\s*[-+]?(?:180(?:\.0+)?|(?:(?:1[0-7]\d)|(?:[1-9]?\d))(?:\.\d+)?)'
        ),
        confidence=0.70,
        context_patterns=["location", "coordinates", "lat", "lng", "gps"],
    ),

    # Date of Birth patterns
    PIIPattern(
        pii_type=PIIType.DATE_OF_BIRTH,
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(
            r'\b(?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])[/\-.](?:19|20)\d{2}\b'
        ),
        confidence=0.60,  # Low base - needs context
        context_patterns=["birth", "dob", "born", "birthday", "date of birth"],
    ),

    # US ZIP Code
    PIIPattern(
        pii_type=PIIType.ZIPCODE,
        severity=PIISeverity.LOW,
        pattern=re.compile(
            r'\b\d{5}(?:-\d{4})?\b'
        ),
        confidence=0.50,  # Low - needs context
        context_patterns=["zip", "postal", "address", "code"],
    ),

    # URL (can contain sensitive info in path/query)
    PIIPattern(
        pii_type=PIIType.URL,
        severity=PIISeverity.LOW,
        pattern=re.compile(
            r'https?://[^\s<>"{}|\\^`\[\]]+',
            re.IGNORECASE
        ),
        confidence=0.70,
    ),
]
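
# Extension sketch (illustrative; not part of the original file): PII_PATTERNS
# is plain data, so a project-specific detector can be appended before building
# a scanner - here a hypothetical employee-ID format "EMP-123456":
#
#     PII_PATTERNS.append(PIIPattern(
#         pii_type=PIIType.USERNAME,   # nearest built-in type for this sketch
#         severity=PIISeverity.MEDIUM,
#         pattern=re.compile(r'\bEMP-\d{6}\b'),
#         confidence=0.90,
#         context_patterns=["employee", "staff"],
#     ))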


class PIIScanner:
    """
    Scanner for detecting PII in text and datasets.

    Uses regex patterns with optional validation and context boosting.
    """

    def __init__(
        self,
        patterns: List[PIIPattern] = None,
        min_confidence: float = 0.5,
        context_boost: float = 0.1,
    ):
        """
        Initialize scanner.

        Args:
            patterns: Custom patterns (defaults to PII_PATTERNS)
            min_confidence: Minimum confidence to report (0.0-1.0)
            context_boost: Confidence boost when context matches
        """
        self.patterns = patterns or PII_PATTERNS
        self.min_confidence = min_confidence
        self.context_boost = context_boost

    def scan_text(
        self,
        text: str,
        field_name: str = "",
        row_index: int = -1,
    ) -> List[PIIMatch]:
        """
        Scan text for PII.

        Args:
            text: Text to scan
            field_name: Optional field name for tracking
            row_index: Optional row index for tracking

        Returns:
            List of PIIMatch objects
        """
        if not text or not isinstance(text, str):
            return []

        matches = []
        text_lower = text.lower()

        for pattern in self.patterns:
            for match in pattern.pattern.finditer(text):
                value = match.group()
                confidence = pattern.confidence

                # Validate if validator provided
                if pattern.validator:
                    if not pattern.validator(value):
                        continue

                # Context boost
                if pattern.context_patterns:
                    for ctx in pattern.context_patterns:
                        if ctx in text_lower:
                            confidence = min(1.0, confidence + self.context_boost)
                            break

                # Apply minimum confidence filter
                if confidence >= self.min_confidence:
                    # Get surrounding context (50 chars each side)
                    start = max(0, match.start() - 50)
                    end = min(len(text), match.end() + 50)
                    context = text[start:end]

                    matches.append(PIIMatch(
                        pii_type=pattern.pii_type,
                        severity=pattern.severity,
                        value=value,
                        start=match.start(),
                        end=match.end(),
                        confidence=confidence,
                        context=context,
                        field_name=field_name,
                        row_index=row_index,
                    ))

        return matches

    def scan_dict(
        self,
        data: Dict[str, List[Any]],
        sample_size: int = 1000,
    ) -> PIIScanResult:
        """
        Scan a columnar dict for PII.

        Args:
            data: Dict of column_name -> values
            sample_size: Max rows to scan per column

        Returns:
            PIIScanResult with aggregated findings
        """
        result = PIIScanResult()

        for field_name, values in data.items():
            if not values:
                continue

            # Sample values
            sample = values[:sample_size]

            for row_idx, value in enumerate(sample):
                if not isinstance(value, str):
                    value = str(value) if value is not None else ""

                matches = self.scan_text(value, field_name, row_idx)

                for match in matches:
                    result.total_matches += 1

                    # Count by type
                    type_name = match.pii_type.value
                    result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1

                    # Count by severity
                    sev = match.severity.value
                    result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1

                    # Count by field
                    result.matches_by_field[field_name] = result.matches_by_field.get(field_name, 0) + 1

                    # Track fields
                    result.fields_with_pii.add(field_name)
                    if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
                        result.high_risk_fields.add(field_name)

                    # Keep samples
                    if len(result.sample_matches) < 100:
                        result.sample_matches.append(match)

        return result

    def scan_dataset(
        self,
        dataset,
        sample_size: int = 1000,
    ) -> PIIScanResult:
        """
        Scan a HuggingFace Dataset or DatasetDict for PII.

        Args:
            dataset: HuggingFace Dataset or DatasetDict
            sample_size: Max rows to scan

        Returns:
            PIIScanResult with aggregated findings
        """
        # Handle DatasetDict (multiple splits)
        if hasattr(dataset, 'keys') and callable(dataset.keys):
            combined = PIIScanResult()
            for split_name in dataset.keys():
                split_result = self.scan_dataset(dataset[split_name], sample_size)
                # Merge results
                combined.total_matches += split_result.total_matches
                for k, v in split_result.matches_by_type.items():
                    combined.matches_by_type[k] = combined.matches_by_type.get(k, 0) + v
                for k, v in split_result.matches_by_severity.items():
                    combined.matches_by_severity[k] = combined.matches_by_severity.get(k, 0) + v
                for k, v in split_result.matches_by_field.items():
                    combined.matches_by_field[k] = combined.matches_by_field.get(k, 0) + v
                combined.fields_with_pii.update(split_result.fields_with_pii)
                combined.high_risk_fields.update(split_result.high_risk_fields)
                combined.sample_matches.extend(split_result.sample_matches[:20])
            return combined

        # Single Dataset
        result = PIIScanResult()

        # Get column names
        if hasattr(dataset, 'features'):
            columns = list(dataset.features.keys())
        elif hasattr(dataset, 'column_names'):
            columns = dataset.column_names
        else:
            return result

        # Sample rows
        num_rows = len(dataset) if hasattr(dataset, '__len__') else sample_size
        sample_indices = range(min(sample_size, num_rows))

        for idx in sample_indices:
            row = dataset[idx]
            for col in columns:
                value = row.get(col) if isinstance(row, dict) else getattr(row, col, None)
                if not isinstance(value, str):
                    value = str(value) if value is not None else ""

                matches = self.scan_text(value, col, idx)

                for match in matches:
                    result.total_matches += 1

                    type_name = match.pii_type.value
                    result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1

                    sev = match.severity.value
                    result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1

                    result.matches_by_field[col] = result.matches_by_field.get(col, 0) + 1

                    result.fields_with_pii.add(col)
                    if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
                        result.high_risk_fields.add(col)

                    if len(result.sample_matches) < 100:
                        result.sample_matches.append(match)

        return result


# Singleton scanner
_scanner = PIIScanner()


def scan_for_pii(
    data,
    sample_size: int = 1000,
    min_confidence: float = 0.5,
) -> PIIScanResult:
    """
    Convenience function to scan data for PII.

    Args:
        data: Text, dict, or HuggingFace Dataset
        sample_size: Max rows to scan
        min_confidence: Minimum confidence threshold

    Returns:
        PIIScanResult with findings
    """
    scanner = PIIScanner(min_confidence=min_confidence)

    if isinstance(data, str):
        matches = scanner.scan_text(data)
        result = PIIScanResult(
            total_matches=len(matches),
            sample_matches=matches,
        )
        for m in matches:
            result.matches_by_type[m.pii_type.value] = result.matches_by_type.get(m.pii_type.value, 0) + 1
            result.matches_by_severity[m.severity.value] = result.matches_by_severity.get(m.severity.value, 0) + 1
        return result

    if isinstance(data, dict):
        return scanner.scan_dict(data, sample_size)

    # Assume HuggingFace Dataset
    return scanner.scan_dataset(data, sample_size)


def quick_pii_check(data, sample_size: int = 100) -> bool:
    """
    Quick check if data contains any PII.

    Returns True if PII is found, False otherwise.
    """
    result = scan_for_pii(data, sample_size=sample_size, min_confidence=0.7)
    return result.total_matches > 0
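
A minimal usage sketch of the scanner API above (illustrative data; exact counts depend on the patterns and the confidence threshold):

    from cascade.data.pii import scan_for_pii, quick_pii_check

    records = {
        "name": ["Ada Lovelace", "Alan Turing"],
        "contact": ["ada@example.com", "+1 415-555-0100"],
    }

    result = scan_for_pii(records, min_confidence=0.5)
    print(result.summary)           # e.g. "Found 2 PII instance(s): ..."
    print(result.high_risk_fields)  # {"contact"} - the EMAIL match is HIGH severity
    print(quick_pii_check("ship to 94105"))  # False: ZIP confidence 0.50 < 0.7 cutoff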
cascade/data/provenance.py
ADDED
@@ -0,0 +1,503 @@
"""
Provenance Graph

Tracks entities, activities, agents, and their relationships.
Supports Merkle tree hashing for tamper-evident lineage.
"""

import hashlib
import json
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple, Iterator

from .entities import (
    DatasetEntity, Activity, Agent, Relationship, RelationType,
    ActivityType, AgentType, create_system_agent
)


@dataclass
class ProvenanceNode:
    """A node in the provenance graph with hash chain."""
    node_id: str
    node_type: str  # entity, activity, agent
    data: Dict[str, Any]

    # Hash chain
    node_hash: str = ""
    parent_hashes: List[str] = field(default_factory=list)

    def __post_init__(self):
        if not self.node_hash:
            self.node_hash = self._compute_hash()

    def _compute_hash(self) -> str:
        """Compute hash including parent hashes (Merkle-style)."""
        content = json.dumps({
            "id": self.node_id,
            "type": self.node_type,
            "data": self.data,
            "parents": sorted(self.parent_hashes),
        }, sort_keys=True, default=str)
        return hashlib.sha256(content.encode()).hexdigest()
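
# Hash-chain sketch (illustrative; not part of the original file): a node's
# hash covers its parents' hashes, so editing anything upstream changes every
# downstream hash - this is what makes the lineage tamper-evident:
#
#     src = ProvenanceNode(node_id="e1", node_type="entity", data={"name": "raw"})
#     act = ProvenanceNode(node_id="a1", node_type="activity",
#                          data={"name": "clean"}, parent_hashes=[src.node_hash])
#     # If src.data were altered and its hash recomputed, it would no longer
#     # match the parent hash already embedded in act.node_hash.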


class ProvenanceGraph:
    """
    A graph of provenance relationships.

    Tracks:
    - Entities (datasets, versions, splits)
    - Activities (transforms, training, inference)
    - Agents (users, models, pipelines)
    - Relationships between them

    Provides:
    - Lineage queries (what produced this? what did this produce?)
    - Hash chain for integrity verification
    - Export to PROV-O and Croissant formats
    """

    def __init__(self, name: str = "default"):
        self.name = name
        self.created_at = time.time()

        # Storage
        self._entities: Dict[str, DatasetEntity] = {}
        self._activities: Dict[str, Activity] = {}
        self._agents: Dict[str, Agent] = {}
        self._relationships: List[Relationship] = []

        # Hash chain
        self._nodes: Dict[str, ProvenanceNode] = {}
        self._root_hash: Optional[str] = None

        # Default system agent
        self._system_agent = create_system_agent("cascade-data-observatory")
        self.add_agent(self._system_agent)

    # ═══════════════════════════════════════════════════════════════════════════
    # ENTITY MANAGEMENT
    # ═══════════════════════════════════════════════════════════════════════════

    def add_entity(self, entity: DatasetEntity) -> str:
        """Add a dataset entity to the graph."""
        self._entities[entity.id] = entity

        # Create provenance node
        node = ProvenanceNode(
            node_id=entity.id,
            node_type="entity",
            data=entity.to_dict(),
        )
        self._nodes[entity.id] = node
        self._update_root_hash()

        return entity.id

    def get_entity(self, entity_id: str) -> Optional[DatasetEntity]:
        """Get entity by ID."""
        return self._entities.get(entity_id)

    def list_entities(self) -> List[DatasetEntity]:
        """List all entities."""
        return list(self._entities.values())

    # ═══════════════════════════════════════════════════════════════════════════
    # ACTIVITY MANAGEMENT
    # ═══════════════════════════════════════════════════════════════════════════

    def add_activity(self, activity: Activity) -> str:
        """Add an activity to the graph."""
        self._activities[activity.id] = activity

        # Link to agent
        if not activity.agent_id:
            activity.agent_id = self._system_agent.id

        # Create provenance node with parent hashes from inputs
        parent_hashes = []
        for input_id in activity.inputs:
            if input_id in self._nodes:
                parent_hashes.append(self._nodes[input_id].node_hash)

        node = ProvenanceNode(
            node_id=activity.id,
            node_type="activity",
            data=activity.to_dict(),
            parent_hashes=parent_hashes,
        )
        self._nodes[activity.id] = node
        self._update_root_hash()

        return activity.id

    def get_activity(self, activity_id: str) -> Optional[Activity]:
        """Get activity by ID."""
        return self._activities.get(activity_id)

    def list_activities(self) -> List[Activity]:
        """List all activities."""
        return list(self._activities.values())

    # ═══════════════════════════════════════════════════════════════════════════
    # AGENT MANAGEMENT
    # ═══════════════════════════════════════════════════════════════════════════

    def add_agent(self, agent: Agent) -> str:
        """Add an agent to the graph."""
        self._agents[agent.id] = agent

        node = ProvenanceNode(
            node_id=agent.id,
            node_type="agent",
            data=agent.to_dict(),
        )
        self._nodes[agent.id] = node

        return agent.id

    def get_agent(self, agent_id: str) -> Optional[Agent]:
        """Get agent by ID."""
        return self._agents.get(agent_id)

    def list_agents(self) -> List[Agent]:
        """List all agents."""
        return list(self._agents.values())

    def list_relationships(self) -> List[Relationship]:
        """List all relationships."""
        return list(self._relationships)

    # ═══════════════════════════════════════════════════════════════════════════
    # RELATIONSHIP MANAGEMENT
    # ═══════════════════════════════════════════════════════════════════════════

    def add_relationship(
        self,
        relation_type: RelationType,
        source_id: str,
        target_id: str,
        attributes: Dict[str, Any] = None,
        timestamp: float = None,
    ) -> Relationship:
        """Add a relationship between nodes."""
        rel = Relationship(
            relation_type=relation_type,
            source_id=source_id,
            target_id=target_id,
            timestamp=timestamp if timestamp is not None else time.time(),
            attributes=attributes or {},
        )
        self._relationships.append(rel)
        return rel

    def link_derivation(self, derived_id: str, source_id: str) -> Relationship:
        """Record that derived entity came from source entity."""
        return self.add_relationship(
            RelationType.WAS_DERIVED_FROM,
            source_id=derived_id,
            target_id=source_id,
        )

    def link_generation(self, entity_id: str, activity_id: str) -> Relationship:
        """Record that entity was generated by activity."""
        return self.add_relationship(
            RelationType.WAS_GENERATED_BY,
            source_id=entity_id,
            target_id=activity_id,
        )

    def link_usage(self, activity_id: str, entity_id: str) -> Relationship:
        """Record that activity used entity as input."""
        return self.add_relationship(
            RelationType.USED,
            source_id=activity_id,
            target_id=entity_id,
        )

    def link_attribution(self, entity_id: str, agent_id: str) -> Relationship:
        """Record that entity was attributed to agent."""
        return self.add_relationship(
            RelationType.WAS_ATTRIBUTED_TO,
            source_id=entity_id,
            target_id=agent_id,
        )

    def link_association(self, activity_id: str, agent_id: str) -> Relationship:
        """Record that activity was associated with agent."""
        return self.add_relationship(
            RelationType.WAS_ASSOCIATED_WITH,
            source_id=activity_id,
            target_id=agent_id,
        )

    # ═══════════════════════════════════════════════════════════════════════════
    # LINEAGE QUERIES
    # ═══════════════════════════════════════════════════════════════════════════

    def get_lineage(self, entity_id: str, direction: str = "upstream") -> List[str]:
        """
        Get lineage for an entity.

        Args:
            entity_id: The entity to trace
            direction: "upstream" (what produced this) or "downstream" (what this produced)

        Returns:
            List of entity IDs in lineage order
        """
        visited: Set[str] = set()
        lineage: List[str] = []

        def trace(current_id: str):
            if current_id in visited:
                return
            visited.add(current_id)

            for rel in self._relationships:
                if direction == "upstream":
                    # Follow wasDerivedFrom backwards
                    if rel.relation_type == RelationType.WAS_DERIVED_FROM:
                        if rel.source_id == current_id:
                            lineage.append(rel.target_id)
                            trace(rel.target_id)
                else:
                    # Follow wasDerivedFrom forwards
                    if rel.relation_type == RelationType.WAS_DERIVED_FROM:
                        if rel.target_id == current_id:
                            lineage.append(rel.source_id)
                            trace(rel.source_id)

        trace(entity_id)
        return lineage
| 276 |
+
def get_activities_for_entity(self, entity_id: str) -> List[Activity]:
|
| 277 |
+
"""Get activities that generated or used this entity."""
|
| 278 |
+
activity_ids = set()
|
| 279 |
+
|
| 280 |
+
for rel in self._relationships:
|
| 281 |
+
if rel.relation_type == RelationType.WAS_GENERATED_BY:
|
| 282 |
+
if rel.source_id == entity_id:
|
| 283 |
+
activity_ids.add(rel.target_id)
|
| 284 |
+
elif rel.relation_type == RelationType.USED:
|
| 285 |
+
if rel.target_id == entity_id:
|
| 286 |
+
activity_ids.add(rel.source_id)
|
| 287 |
+
|
| 288 |
+
return [self._activities[aid] for aid in activity_ids if aid in self._activities]
|
| 289 |
+
|
| 290 |
+
def get_inputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
|
| 291 |
+
"""Get entities that were inputs to an activity."""
|
| 292 |
+
entity_ids = set()
|
| 293 |
+
|
| 294 |
+
for rel in self._relationships:
|
| 295 |
+
if rel.relation_type == RelationType.USED:
|
| 296 |
+
if rel.source_id == activity_id:
|
| 297 |
+
entity_ids.add(rel.target_id)
|
| 298 |
+
|
| 299 |
+
return [self._entities[eid] for eid in entity_ids if eid in self._entities]
|
| 300 |
+
|
| 301 |
+
def get_outputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
|
| 302 |
+
"""Get entities that were outputs of an activity."""
|
| 303 |
+
entity_ids = set()
|
| 304 |
+
|
| 305 |
+
for rel in self._relationships:
|
| 306 |
+
if rel.relation_type == RelationType.WAS_GENERATED_BY:
|
| 307 |
+
if rel.target_id == activity_id:
|
| 308 |
+
entity_ids.add(rel.source_id)
|
| 309 |
+
|
| 310 |
+
return [self._entities[eid] for eid in entity_ids if eid in self._entities]
|
| 311 |
+
|
| 312 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 313 |
+
# HASH CHAIN
|
| 314 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 315 |
+
|
| 316 |
+
def _update_root_hash(self):
|
| 317 |
+
"""Update the Merkle root hash."""
|
| 318 |
+
if not self._nodes:
|
| 319 |
+
self._root_hash = None
|
| 320 |
+
return
|
| 321 |
+
|
| 322 |
+
# Compute root from all node hashes
|
| 323 |
+
all_hashes = sorted([n.node_hash for n in self._nodes.values()])
|
| 324 |
+
combined = "".join(all_hashes)
|
| 325 |
+
self._root_hash = hashlib.sha256(combined.encode()).hexdigest()
|
| 326 |
+
|
| 327 |
+
@property
|
| 328 |
+
def root_hash(self) -> Optional[str]:
|
| 329 |
+
"""Get the current Merkle root hash."""
|
| 330 |
+
return self._root_hash
|
| 331 |
+
|
| 332 |
+
def verify_integrity(self) -> Tuple[bool, List[str]]:
|
| 333 |
+
"""
|
| 334 |
+
Verify integrity of the provenance graph.
|
| 335 |
+
|
| 336 |
+
Returns:
|
| 337 |
+
(is_valid, list of invalid node IDs)
|
| 338 |
+
"""
|
| 339 |
+
invalid = []
|
| 340 |
+
|
| 341 |
+
for node_id, node in self._nodes.items():
|
| 342 |
+
expected_hash = node._compute_hash()
|
| 343 |
+
if expected_hash != node.node_hash:
|
| 344 |
+
invalid.append(node_id)
|
| 345 |
+
|
| 346 |
+
return len(invalid) == 0, invalid
|
| 347 |
+
|
| 348 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 349 |
+
# EXPORT
|
| 350 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 351 |
+
|
| 352 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 353 |
+
"""Export graph to dictionary."""
|
| 354 |
+
return {
|
| 355 |
+
"name": self.name,
|
| 356 |
+
"created_at": self.created_at,
|
| 357 |
+
"root_hash": self._root_hash,
|
| 358 |
+
"entities": {k: v.to_dict() for k, v in self._entities.items()},
|
| 359 |
+
"activities": {k: v.to_dict() for k, v in self._activities.items()},
|
| 360 |
+
"agents": {k: v.to_dict() for k, v in self._agents.items()},
|
| 361 |
+
"relationships": [r.to_dict() for r in self._relationships],
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
def to_prov_n(self) -> str:
|
| 365 |
+
"""Export as PROV-N notation."""
|
| 366 |
+
lines = [
|
| 367 |
+
f"document",
|
| 368 |
+
f" prefix cascade <https://cascade.ai/ns/>",
|
| 369 |
+
f" prefix prov <http://www.w3.org/ns/prov#>",
|
| 370 |
+
f"",
|
| 371 |
+
]
|
| 372 |
+
|
| 373 |
+
# Entities
|
| 374 |
+
for entity in self._entities.values():
|
| 375 |
+
lines.append(f" {entity.to_prov_n()}")
|
| 376 |
+
|
| 377 |
+
lines.append("")
|
| 378 |
+
|
| 379 |
+
# Activities
|
| 380 |
+
for activity in self._activities.values():
|
| 381 |
+
lines.append(f" {activity.to_prov_n()}")
|
| 382 |
+
|
| 383 |
+
lines.append("")
|
| 384 |
+
|
| 385 |
+
# Agents
|
| 386 |
+
for agent in self._agents.values():
|
| 387 |
+
lines.append(f" {agent.to_prov_n()}")
|
| 388 |
+
|
| 389 |
+
lines.append("")
|
| 390 |
+
|
| 391 |
+
# Relationships
|
| 392 |
+
for rel in self._relationships:
|
| 393 |
+
lines.append(f" {rel.to_prov_n()}")
|
| 394 |
+
|
| 395 |
+
lines.append("")
|
| 396 |
+
lines.append("endDocument")
|
| 397 |
+
|
| 398 |
+
return "\n".join(lines)
|
| 399 |
+
|
| 400 |
+
def to_prov_jsonld(self) -> Dict[str, Any]:
|
| 401 |
+
"""Export as PROV-O JSON-LD."""
|
| 402 |
+
return {
|
| 403 |
+
"@context": {
|
| 404 |
+
"prov": "http://www.w3.org/ns/prov#",
|
| 405 |
+
"cascade": "https://cascade.ai/ns/",
|
| 406 |
+
"xsd": "http://www.w3.org/2001/XMLSchema#",
|
| 407 |
+
},
|
| 408 |
+
"@graph": [
|
| 409 |
+
*[e.to_dict() for e in self._entities.values()],
|
| 410 |
+
*[a.to_dict() for a in self._activities.values()],
|
| 411 |
+
*[a.to_dict() for a in self._agents.values()],
|
| 412 |
+
],
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
@classmethod
|
| 416 |
+
def from_dict(cls, data: Dict[str, Any]) -> "ProvenanceGraph":
|
| 417 |
+
"""Load graph from dictionary."""
|
| 418 |
+
graph = cls(name=data.get("name", "default"))
|
| 419 |
+
graph.created_at = data.get("created_at", time.time())
|
| 420 |
+
|
| 421 |
+
# Load entities
|
| 422 |
+
for entity_data in data.get("entities", {}).values():
|
| 423 |
+
entity = DatasetEntity(
|
| 424 |
+
id=entity_data["@id"],
|
| 425 |
+
name=entity_data["name"],
|
| 426 |
+
content_hash=entity_data.get("content_hash"),
|
| 427 |
+
schema_hash=entity_data.get("schema_hash"),
|
| 428 |
+
version=entity_data.get("version"),
|
| 429 |
+
previous_version=entity_data.get("previous_version"),
|
| 430 |
+
source_type=entity_data.get("source_type", "unknown"),
|
| 431 |
+
source_uri=entity_data.get("source_uri"),
|
| 432 |
+
record_count=entity_data.get("record_count"),
|
| 433 |
+
size_bytes=entity_data.get("size_bytes"),
|
| 434 |
+
splits=entity_data.get("splits", {}),
|
| 435 |
+
attributes=entity_data.get("attributes", {}),
|
| 436 |
+
created_at=entity_data.get("created_at", time.time()),
|
| 437 |
+
)
|
| 438 |
+
graph.add_entity(entity)
|
| 439 |
+
|
| 440 |
+
# Load activities
|
| 441 |
+
for activity_data in data.get("activities", {}).values():
|
| 442 |
+
activity = Activity(
|
| 443 |
+
id=activity_data["@id"],
|
| 444 |
+
activity_type=ActivityType(activity_data["activity_type"]),
|
| 445 |
+
name=activity_data["name"],
|
| 446 |
+
started_at=activity_data.get("started_at"),
|
| 447 |
+
ended_at=activity_data.get("ended_at"),
|
| 448 |
+
inputs=activity_data.get("inputs", []),
|
| 449 |
+
outputs=activity_data.get("outputs", []),
|
| 450 |
+
agent_id=activity_data.get("agent_id"),
|
| 451 |
+
parameters=activity_data.get("parameters", {}),
|
| 452 |
+
attributes=activity_data.get("attributes", {}),
|
| 453 |
+
)
|
| 454 |
+
graph.add_activity(activity)
|
| 455 |
+
|
| 456 |
+
# Load agents
|
| 457 |
+
for agent_data in data.get("agents", {}).values():
|
| 458 |
+
agent = Agent(
|
| 459 |
+
id=agent_data["@id"],
|
| 460 |
+
agent_type=AgentType(agent_data["agent_type"]),
|
| 461 |
+
name=agent_data["name"],
|
| 462 |
+
version=agent_data.get("version"),
|
| 463 |
+
parent_agent_id=agent_data.get("parent_agent_id"),
|
| 464 |
+
identifier=agent_data.get("identifier"),
|
| 465 |
+
attributes=agent_data.get("attributes", {}),
|
| 466 |
+
created_at=agent_data.get("created_at", time.time()),
|
| 467 |
+
)
|
| 468 |
+
graph.add_agent(agent)
|
| 469 |
+
|
| 470 |
+
# Load relationships
|
| 471 |
+
for rel_data in data.get("relationships", []):
|
| 472 |
+
graph.add_relationship(
|
| 473 |
+
relation_type=RelationType(rel_data["type"]),
|
| 474 |
+
source_id=rel_data["source"],
|
| 475 |
+
target_id=rel_data["target"],
|
| 476 |
+
attributes=rel_data.get("attributes", {}),
|
| 477 |
+
timestamp=rel_data.get("timestamp"),
|
| 478 |
+
)
|
| 479 |
+
|
| 480 |
+
return graph
|
| 481 |
+
|
| 482 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 483 |
+
# STATISTICS
|
| 484 |
+
# ═══════════════════════════════════════════════════════════════════════════
|
| 485 |
+
|
| 486 |
+
@property
|
| 487 |
+
def stats(self) -> Dict[str, int]:
|
| 488 |
+
"""Get graph statistics."""
|
| 489 |
+
return {
|
| 490 |
+
"entities": len(self._entities),
|
| 491 |
+
"activities": len(self._activities),
|
| 492 |
+
"agents": len(self._agents),
|
| 493 |
+
"relationships": len(self._relationships),
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
def __repr__(self) -> str:
|
| 497 |
+
stats = self.stats
|
| 498 |
+
return (
|
| 499 |
+
f"ProvenanceGraph(name='{self.name}', "
|
| 500 |
+
f"entities={stats['entities']}, "
|
| 501 |
+
f"activities={stats['activities']}, "
|
| 502 |
+
f"relationships={stats['relationships']})"
|
| 503 |
+
)
|
cascade/data/schema.py
ADDED
@@ -0,0 +1,417 @@

"""
Schema Observer

Observes and hashes dataset schemas/features.
Works with HuggingFace datasets Features, Pandas DataFrames, and raw dicts.
"""

import hashlib
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class FieldSchema:
    """Schema for a single field/column."""
    name: str
    dtype: str  # Normalized type name

    # Type details
    nullable: bool = True
    is_list: bool = False
    list_inner_type: Optional[str] = None

    # For ClassLabel
    is_categorical: bool = False
    categories: Optional[List[str]] = None
    num_categories: Optional[int] = None

    # For nested structures
    nested_fields: Optional[Dict[str, "FieldSchema"]] = None

    # For arrays/tensors
    shape: Optional[tuple] = None

    # Constraints
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    pattern: Optional[str] = None  # Regex for strings

    # Metadata
    description: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        result = {
            "name": self.name,
            "dtype": self.dtype,
            "nullable": self.nullable,
        }
        if self.is_list:
            result["is_list"] = True
            result["list_inner_type"] = self.list_inner_type
        if self.is_categorical:
            result["is_categorical"] = True
            result["categories"] = self.categories
            result["num_categories"] = self.num_categories
        if self.nested_fields:
            result["nested_fields"] = {
                k: v.to_dict() for k, v in self.nested_fields.items()
            }
        if self.shape:
            result["shape"] = self.shape
        if self.description:
            result["description"] = self.description
        return result

    def hash(self) -> str:
        """Hash this field's structure."""
        content = json.dumps(self.to_dict(), sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()[:16]


@dataclass
class DatasetSchema:
    """Complete schema for a dataset."""
    fields: Dict[str, FieldSchema] = field(default_factory=dict)

    # Dataset-level metadata
    primary_key: Optional[List[str]] = None
    foreign_keys: Dict[str, str] = field(default_factory=dict)  # field → target

    # Source info
    source_format: Optional[str] = None  # arrow, parquet, csv, etc.

    def add_field(self, field_schema: FieldSchema):
        """Add a field to the schema."""
        self.fields[field_schema.name] = field_schema

    @property
    def field_names(self) -> List[str]:
        return list(self.fields.keys())

    @property
    def num_fields(self) -> int:
        return len(self.fields)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "fields": {k: v.to_dict() for k, v in self.fields.items()},
            "primary_key": self.primary_key,
            "foreign_keys": self.foreign_keys,
            "source_format": self.source_format,
        }

    def hash(self) -> str:
        """Compute schema hash - identifies structure regardless of content."""
        # Sort fields for deterministic hashing
        ordered_fields = sorted(self.fields.keys())
        content = json.dumps({
            "fields": [self.fields[k].to_dict() for k in ordered_fields],
            "primary_key": self.primary_key,
        }, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()

    def diff(self, other: "DatasetSchema") -> Dict[str, Any]:
        """Compare two schemas and return differences."""
        added = set(other.field_names) - set(self.field_names)
        removed = set(self.field_names) - set(other.field_names)

        modified = {}
        for name in set(self.field_names) & set(other.field_names):
            if self.fields[name].hash() != other.fields[name].hash():
                modified[name] = {
                    "old": self.fields[name].to_dict(),
                    "new": other.fields[name].to_dict(),
                }

        return {
            "added": list(added),
            "removed": list(removed),
            "modified": modified,
            "compatible": len(removed) == 0 and len(modified) == 0,
        }


class SchemaObserver:
    """
    Observes and extracts schemas from various data sources.
    """

    # Type mapping from various sources to normalized types
    TYPE_MAP = {
        # Python types
        "str": "string",
        "int": "int64",
        "float": "float64",
        "bool": "bool",
        "bytes": "binary",

        # NumPy types
        "int8": "int8",
        "int16": "int16",
        "int32": "int32",
        "int64": "int64",
        "uint8": "uint8",
        "uint16": "uint16",
        "uint32": "uint32",
        "uint64": "uint64",
        "float16": "float16",
        "float32": "float32",
        "float64": "float64",

        # Arrow types
        "string": "string",
        "large_string": "string",
        "binary": "binary",
        "large_binary": "binary",

        # HuggingFace special types
        "Image": "image",
        "Audio": "audio",
        "ClassLabel": "categorical",
    }

    def observe_hf_dataset(self, dataset) -> DatasetSchema:
        """
        Extract schema from HuggingFace Dataset.

        Args:
            dataset: A HuggingFace datasets.Dataset or DatasetDict

        Returns:
            DatasetSchema with all fields
        """
        schema = DatasetSchema(source_format="arrow")

        # Get features (works for both Dataset and DatasetDict)
        if hasattr(dataset, 'features'):
            features = dataset.features
        elif hasattr(dataset, '__iter__'):
            # DatasetDict - get features from first split
            first_split = next(iter(dataset.values()))
            features = first_split.features
        else:
            raise ValueError(f"Cannot extract features from {type(dataset)}")

        # Parse each feature
        for name, feature in features.items():
            field_schema = self._parse_hf_feature(name, feature)
            schema.add_field(field_schema)

        return schema

    def _parse_hf_feature(self, name: str, feature) -> FieldSchema:
        """Parse a HuggingFace Feature into FieldSchema."""
        # Import here to avoid hard dependency
        try:
            from datasets import (
                Value, ClassLabel, Sequence,
                Array2D, Array3D, Array4D, Array5D,
                Image, Audio
            )
        except ImportError:
            # Fallback for when datasets not installed
            return FieldSchema(name=name, dtype="unknown")

        # Value type (primitives)
        if isinstance(feature, Value):
            return FieldSchema(
                name=name,
                dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
            )

        # ClassLabel (categorical)
        if isinstance(feature, ClassLabel):
            return FieldSchema(
                name=name,
                dtype="categorical",
                is_categorical=True,
                categories=feature.names,
                num_categories=feature.num_classes,
            )

        # Sequence (list)
        if isinstance(feature, Sequence):
            inner = self._parse_hf_feature(f"{name}_inner", feature.feature)
            return FieldSchema(
                name=name,
                dtype="list",
                is_list=True,
                list_inner_type=inner.dtype,
            )

        # Arrays
        if isinstance(feature, (Array2D, Array3D, Array4D, Array5D)):
            return FieldSchema(
                name=name,
                dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
                shape=feature.shape,
            )

        # Image
        if isinstance(feature, Image):
            return FieldSchema(
                name=name,
                dtype="image",
            )

        # Audio
        if isinstance(feature, Audio):
            return FieldSchema(
                name=name,
                dtype="audio",
            )

        # Dict/nested structure
        if isinstance(feature, dict):
            nested = {}
            for k, v in feature.items():
                nested[k] = self._parse_hf_feature(k, v)
            return FieldSchema(
                name=name,
                dtype="struct",
                nested_fields=nested,
            )

        # Fallback
        return FieldSchema(
            name=name,
            dtype=str(type(feature).__name__),
        )

    def observe_pandas(self, df) -> DatasetSchema:
        """
        Extract schema from Pandas DataFrame.

        Args:
            df: A pandas DataFrame

        Returns:
            DatasetSchema with all fields
        """
        schema = DatasetSchema(source_format="pandas")

        for col in df.columns:
            dtype = str(df[col].dtype)
            normalized = self.TYPE_MAP.get(dtype, dtype)

            # Check for categorical
            if dtype == "category":
                schema.add_field(FieldSchema(
                    name=col,
                    dtype="categorical",
                    is_categorical=True,
                    categories=list(df[col].cat.categories),
                    num_categories=len(df[col].cat.categories),
                ))
            else:
                schema.add_field(FieldSchema(
                    name=col,
                    dtype=normalized,
                    nullable=df[col].isna().any(),
                ))

        return schema

    def observe_dict(self, data: Dict[str, Any], sample_size: int = 100) -> DatasetSchema:
        """
        Extract schema from a dict of lists (columnar format).

        Args:
            data: Dict mapping column names to lists of values
            sample_size: Number of values to sample for type inference

        Returns:
            DatasetSchema with all fields
        """
        schema = DatasetSchema(source_format="dict")

        for col, values in data.items():
            if not values:
                schema.add_field(FieldSchema(name=col, dtype="unknown"))
                continue

            # Sample values for type inference
            sample = values[:sample_size]
            types = set(type(v).__name__ for v in sample if v is not None)

            # Determine type
            if len(types) == 0:
                dtype = "null"
            elif len(types) == 1:
                dtype = self.TYPE_MAP.get(types.pop(), "unknown")
            else:
                dtype = "mixed"

            # Check for nulls
            nullable = any(v is None for v in sample)

            schema.add_field(FieldSchema(
                name=col,
                dtype=dtype,
                nullable=nullable,
            ))

        return schema

    def observe_arrow(self, table) -> DatasetSchema:
        """
        Extract schema from PyArrow Table.

        Args:
            table: A pyarrow.Table

        Returns:
            DatasetSchema with all fields
        """
        schema = DatasetSchema(source_format="arrow")

        for field in table.schema:
            dtype = str(field.type)
            normalized = self.TYPE_MAP.get(dtype, dtype)

            schema.add_field(FieldSchema(
                name=field.name,
                dtype=normalized,
                nullable=field.nullable,
            ))

        return schema


def hash_content(data, sample_size: int = 10000) -> str:
    """
    Compute content hash of dataset.

    For large datasets, samples rows for efficiency.
    """
    hasher = hashlib.sha256()

    # Handle dict first (dict also has __iter__ and __len__)
    if isinstance(data, dict):
        content = json.dumps(data, sort_keys=True, default=str)
        hasher.update(content.encode())

    # Handle list
    elif isinstance(data, list):
        for item in data[:sample_size]:
            item_str = json.dumps(item, sort_keys=True, default=str)
            hasher.update(item_str.encode())

    # Handle HuggingFace Dataset or other iterables with __len__
    elif hasattr(data, '__iter__') and hasattr(data, '__len__'):
        # Sample if large
        n = len(data)
        if n > sample_size:
            import random
            indices = sorted(random.sample(range(n), sample_size))
            sample = [data[i] for i in indices]
        else:
            sample = list(data)

        for row in sample:
            row_str = json.dumps(row, sort_keys=True, default=str)
            hasher.update(row_str.encode())

    return hasher.hexdigest()
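
A quick sketch of the schema observer and diff in action (illustrative; it assumes this module is importable as cascade.data.schema):

    # Hypothetical usage sketch -- the import path is an assumption.
    from cascade.data.schema import SchemaObserver

    observer = SchemaObserver()
    v1 = observer.observe_dict({"id": [1, 2], "text": ["a", "b"]})
    v2 = observer.observe_dict({"id": [1, 2], "text": ["a", "b"], "label": [0, 1]})

    print(v1.hash()[:12])  # stable structural fingerprint, independent of row content
    print(v1.diff(v2))     # {'added': ['label'], 'removed': [], 'modified': {}, 'compatible': True}
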
cascade/demo.py
ADDED
@@ -0,0 +1,174 @@

"""
CASCADE-LATTICE Interactive Demo

Launch the LunarLander demo showcasing:
- cascade.hold: Human-in-the-loop intervention
- cascade.store: Provenance tracking
- Merkle-chained decision records

Usage:
    cascade-demo            # Run the demo
    python -m cascade.demo  # Alternative

Controls:
    [H]   HOLD-FREEZE   - Pause time, inspect AI decision
    [T]   HOLD-TAKEOVER - Continue time, YOU control with WASD
    [ESC] Release hold, return to AI sovereignty

In HOLD modes:
    [W] Main Engine (thrust up)
    [A] Left Engine (rotate)
    [D] Right Engine (rotate)
    [S] No-op / Accept AI decision
"""

import sys
import subprocess
from pathlib import Path


def check_demo_dependencies():
    """Check if demo dependencies are installed."""
    missing = []

    try:
        import gymnasium
    except ImportError:
        missing.append("gymnasium")

    try:
        import pygame
    except ImportError:
        missing.append("pygame")

    try:
        import stable_baselines3
    except ImportError:
        missing.append("stable-baselines3")

    try:
        import box2d
    except ImportError:
        missing.append("box2d-py")

    return missing


def main():
    """Launch the interactive CASCADE-LATTICE demo."""
    print("""
    ╔═══════════════════════════════════════════════════════════════════════╗
    ║                                                                       ║
    ║   ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗             ║
    ║  ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝             ║
    ║  ██║     ███████║███████╗██║     ███████║██║  ██║█████╗               ║
    ║  ██║     ██╔══██║╚════██║██║     ██╔══██║██║  ██║██╔══╝               ║
    ║  ╚██████╗██║  ██║███████║╚██████╗██║  ██║██████╔╝███████╗             ║
    ║   ╚═════╝╚═╝  ╚═╝╚══════╝ ╚═════╝╚═╝  ╚═╝╚═════╝ ╚══════╝             ║
    ║                                                                       ║
    ║        LATTICE DEMO - Sovereign Neural Internetwork Control           ║
    ║                                                                       ║
    ╚═══════════════════════════════════════════════════════════════════════╝
    """)

    # Check dependencies
    missing = check_demo_dependencies()
    if missing:
        print(f"[!] Missing demo dependencies: {', '.join(missing)}")
        print()
        print("    Install with:")
        print("        pip install cascade-lattice[demo]")
        print()
        print("    Or manually:")
        print(f"        pip install {' '.join(missing)}")
        sys.exit(1)

    # Check for rl-zoo3 (needed for model download)
    try:
        import rl_zoo3
    except ImportError:
        print("[!] Missing rl-zoo3 (needed for pretrained model)")
        print("    pip install rl-zoo3")
        sys.exit(1)

    print("[CASCADE] Starting LunarLander demo...")
    print()
    print("Controls:")
    print("  [H]   HOLD-FREEZE   - Pause time, inspect AI decision")
    print("  [T]   HOLD-TAKEOVER - Continue time, YOU control with WASD")
    print("  [ESC] Release hold / Quit")
    print()
    print("In HOLD modes:")
    print("  [W] Main Engine   [A] Left Engine   [D] Right Engine")
    print("  [S] Accept AI choice / No-op")
    print()

    # Run the demo
    demo_path = Path(__file__).parent.parent / "examples" / "sovereign_lattice_eval.py"

    if not demo_path.exists():
        # Try installed package location
        import cascade
        package_dir = Path(cascade.__file__).parent
        demo_path = package_dir.parent / "examples" / "sovereign_lattice_eval.py"

    if not demo_path.exists():
        # Fallback: run inline demo
        print("[!] Demo file not found. Running inline version...")
        _run_inline_demo()
        return

    # Run the demo script
    subprocess.run([sys.executable, str(demo_path)])


def _run_inline_demo():
    """Minimal inline demo if main file not found."""
    import gymnasium as gym
    import numpy as np

    from cascade import init
    from cascade.hold import Hold
    from cascade.store import observe

    init(project="cascade_demo")
    hold = Hold.get()

    print("[CASCADE] Running minimal demo (install full package for GUI)")
    print()

    env = gym.make("LunarLander-v3")
    obs, _ = env.reset()

    for step in range(100):
        # Random policy for minimal demo
        action_probs = np.array([0.25, 0.25, 0.25, 0.25])

        resolution = hold.yield_point(
            action_probs=action_probs,
            value=0.0,
            observation={"state": obs.tolist()[:4]},
            brain_id="random_demo",
            action_labels=["NOOP", "LEFT", "MAIN", "RIGHT"],
            blocking=False
        )

        obs, reward, term, trunc, _ = env.step(resolution.action)

        observe("demo", {
            "step": step,
            "action": int(resolution.action),
            "reward": float(reward),
            "merkle": resolution.merkle_root,
        }, sync=False)

        if term or trunc:
            print(f"[CASCADE] Episode ended at step {step}")
            break

    env.close()
    print("[CASCADE] Demo complete. Check ~/.cascade/lattice for provenance data.")


if __name__ == "__main__":
    main()
cascade/demo_sdk.py
ADDED
@@ -0,0 +1,114 @@

"""
CASCADE SDK Demo - Shows automatic observation of calls.

Run: python -m cascade.demo_sdk
"""

import os
import sys

# Add cascade to path if needed
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def demo_manual_observation():
    """Demo manual observation without any provider installed."""
    print("=" * 60)
    print("CASCADE SDK Demo - Manual Observation")
    print("=" * 60)

    import cascade
    from cascade.sdk import CascadeSDK

    # Initialize with verbose mode
    sdk = CascadeSDK()
    sdk.init(emit_async=False, verbose=True)

    print("\n[1] Simulating an OpenAI call...")
    sdk.observe(
        model_id="openai/gpt-4",
        input_data="What is the capital of France?",
        output_data="The capital of France is Paris.",
        metrics={"prompt_tokens": 10, "completion_tokens": 8, "total_tokens": 18},
        context={"provider": "openai", "endpoint": "chat.completions"}
    )

    print("\n[2] Simulating an Anthropic call...")
    sdk.observe(
        model_id="anthropic/claude-3-opus-20240229",
        input_data="Explain quantum entanglement simply.",
        output_data="Quantum entanglement is when two particles become connected...",
        metrics={"input_tokens": 6, "output_tokens": 45},
        context={"provider": "anthropic", "endpoint": "messages"}
    )

    print("\n[3] Simulating an Ollama local call...")
    sdk.observe(
        model_id="ollama/llama2:7b",
        input_data="Write a haiku about coding.",
        output_data="Fingers on keyboard\nLogic flows like mountain stream\nBugs become features",
        metrics={"eval_count": 20, "eval_duration": 1.5},
        context={"provider": "ollama", "endpoint": "generate"}
    )

    print("\n" + "=" * 60)
    print("Observations saved to lattice/observations/")
    print("=" * 60)

    # Show what was saved
    from cascade.observation import ObservationManager
    manager = ObservationManager()
    stats = manager.get_stats()
    print(f"\nTotal observations: {stats['total_observations']}")
    print(f"Model observations: {stats['model_observations']}")
    print(f"Unique models: {stats['unique_models']}")


def demo_auto_patch():
    """Demo auto-patching (requires providers to be installed)."""
    print("\n" + "=" * 60)
    print("CASCADE Auto-Patch Demo")
    print("=" * 60)

    import cascade

    # This patches all installed providers
    cascade.init(verbose=True)

    print("\nPatched providers. Now any call will emit receipts.")
    print("Example usage:")
    print("""
    import cascade
    cascade.init()

    # OpenAI (if installed)
    import openai
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": "Hello!"}]
    )
    # ^^^ Receipt automatically emitted to lattice

    # Anthropic (if installed)
    import anthropic
    client = anthropic.Anthropic()
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=100,
        messages=[{"role": "user", "content": "Hello!"}]
    )
    # ^^^ Receipt automatically emitted to lattice

    # Ollama (if installed)
    import ollama
    response = ollama.chat(model="llama2", messages=[
        {"role": "user", "content": "Hello!"}
    ])
    # ^^^ Receipt automatically emitted to lattice
    """)


if __name__ == "__main__":
    demo_manual_observation()
    demo_auto_patch()
cascade/export/__init__.py
ADDED
@@ -0,0 +1,23 @@

"""
CASCADE Export Module - Tableau and BI Integration
"""

from .tableau_export import (
    export_for_tableau,
    export_events_csv,
    export_chains_csv,
    export_metrics_csv,
    export_hold_events_csv,
    export_causation_graph_csv,
    TableauExporter,
)

__all__ = [
    "export_for_tableau",
    "export_events_csv",
    "export_chains_csv",
    "export_metrics_csv",
    "export_hold_events_csv",
    "export_causation_graph_csv",
    "TableauExporter",
]
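
A hedged sketch of driving the exporter defined below in tableau_export.py directly, rather than via export_for_tableau (the output directory is illustrative):

    # Hypothetical usage sketch -- paths and metric names are assumptions.
    import time
    from cascade.export import TableauExporter

    exporter = TableauExporter()
    exporter.add_metric(
        name="loss", value=0.42, timestamp=time.time(),
        category="TRAINING_DYNAMICS", component="trainer",
    )
    files = exporter.export("./tableau_data")  # writes metrics_timeseries.csv + manifest.json
    print(files)
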
cascade/export/tableau_export.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CASCADE → Tableau Export Pipeline
|
| 3 |
+
|
| 4 |
+
Exports Cascade data in Tableau-friendly formats:
|
| 5 |
+
- CSV files (universal)
|
| 6 |
+
- Hyper files (native Tableau, optional)
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
from cascade.export import export_for_tableau
|
| 10 |
+
|
| 11 |
+
# Export all data to a directory
|
| 12 |
+
export_for_tableau("./tableau_data")
|
| 13 |
+
|
| 14 |
+
# Then in Tableau: Connect → Text File → select CSVs
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import csv
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from typing import Dict, List, Any, Optional
|
| 23 |
+
from dataclasses import dataclass, asdict
|
| 24 |
+
|
| 25 |
+
# Try to import Hyper API (optional)
|
| 26 |
+
try:
|
| 27 |
+
from tableauhyperapi import (
|
| 28 |
+
HyperProcess, Telemetry, Connection, CreateMode,
|
| 29 |
+
TableDefinition, SqlType, TableName, Inserter
|
| 30 |
+
)
|
| 31 |
+
HAS_HYPER = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_HYPER = False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
|
| 37 |
+
class EventRow:
|
| 38 |
+
"""Flattened event for Tableau."""
|
| 39 |
+
event_id: str
|
| 40 |
+
timestamp: float
|
| 41 |
+
timestamp_iso: str
|
| 42 |
+
component: str
|
| 43 |
+
event_type: str
|
| 44 |
+
data_json: str
|
| 45 |
+
# Extracted common fields
|
| 46 |
+
loss: Optional[float] = None
|
| 47 |
+
accuracy: Optional[float] = None
|
| 48 |
+
learning_rate: Optional[float] = None
|
| 49 |
+
epoch: Optional[int] = None
|
| 50 |
+
step: Optional[int] = None
|
| 51 |
+
tokens: Optional[int] = None
|
| 52 |
+
latency_ms: Optional[float] = None
|
| 53 |
+
error_message: Optional[str] = None
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass
|
| 57 |
+
class ChainRow:
|
| 58 |
+
"""Flattened provenance chain for Tableau."""
|
| 59 |
+
session_id: str
|
| 60 |
+
model_id: str
|
| 61 |
+
model_hash: str
|
| 62 |
+
input_hash: str
|
| 63 |
+
output_hash: Optional[str]
|
| 64 |
+
merkle_root: str
|
| 65 |
+
created_at: float
|
| 66 |
+
created_at_iso: str
|
| 67 |
+
record_count: int
|
| 68 |
+
external_links_count: int
|
| 69 |
+
is_verified: bool
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@dataclass
|
| 73 |
+
class HoldEventRow:
|
| 74 |
+
"""Flattened HOLD event for Tableau."""
|
| 75 |
+
hold_id: str
|
| 76 |
+
timestamp: float
|
| 77 |
+
timestamp_iso: str
|
| 78 |
+
brain_id: str
|
| 79 |
+
state: str # PENDING, ACCEPTED, OVERRIDDEN, TIMEOUT
|
| 80 |
+
ai_choice: int
|
| 81 |
+
ai_confidence: float
|
| 82 |
+
final_action: int
|
| 83 |
+
was_override: bool
|
| 84 |
+
hold_duration_sec: float
|
| 85 |
+
value_estimate: float
|
| 86 |
+
action_count: int
|
| 87 |
+
override_source: Optional[str] = None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@dataclass
|
| 91 |
+
class CausationEdgeRow:
|
| 92 |
+
"""Flattened causation link for Tableau."""
|
| 93 |
+
link_id: str
|
| 94 |
+
from_event_id: str
|
| 95 |
+
to_event_id: str
|
| 96 |
+
causation_type: str # temporal, correlation, threshold, direct
|
| 97 |
+
strength: float
|
| 98 |
+
timestamp: float
|
| 99 |
+
timestamp_iso: str
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@dataclass
|
| 103 |
+
class MetricRow:
|
| 104 |
+
"""Time-series metric for Tableau."""
|
| 105 |
+
timestamp: float
|
| 106 |
+
timestamp_iso: str
|
| 107 |
+
metric_name: str
|
| 108 |
+
metric_value: float
|
| 109 |
+
category: str # TRAINING_DYNAMICS, GRADIENT_HEALTH, etc.
|
| 110 |
+
component: str
|
| 111 |
+
is_anomaly: bool
|
| 112 |
+
anomaly_severity: Optional[str] = None
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _ts_to_iso(ts: float) -> str:
|
| 116 |
+
"""Convert Unix timestamp to ISO string."""
|
| 117 |
+
try:
|
| 118 |
+
return datetime.fromtimestamp(ts).isoformat()
|
| 119 |
+
except:
|
| 120 |
+
return ""
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _extract_metric_fields(data: Dict) -> Dict[str, Any]:
|
| 124 |
+
"""Extract common metric fields from event data."""
|
| 125 |
+
return {
|
| 126 |
+
"loss": data.get("loss"),
|
| 127 |
+
"accuracy": data.get("accuracy") or data.get("acc"),
|
| 128 |
+
"learning_rate": data.get("learning_rate") or data.get("lr"),
|
| 129 |
+
"epoch": data.get("epoch"),
|
| 130 |
+
"step": data.get("step") or data.get("iter"),
|
| 131 |
+
"tokens": data.get("tokens") or data.get("total_tokens"),
|
| 132 |
+
"latency_ms": data.get("latency_ms") or data.get("latency"),
|
| 133 |
+
"error_message": data.get("error") or data.get("message"),
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class TableauExporter:
|
| 138 |
+
"""
|
| 139 |
+
Export Cascade data for Tableau visualization.
|
| 140 |
+
|
| 141 |
+
Creates a directory with CSV files ready for Tableau import:
|
| 142 |
+
- events.csv: All observed events
|
| 143 |
+
- chains.csv: Provenance chains
|
| 144 |
+
- hold_events.csv: HOLD protocol events
|
| 145 |
+
- causation_edges.csv: Graph edges for relationship diagrams
|
| 146 |
+
- metrics_timeseries.csv: Metrics over time
|
| 147 |
+
|
| 148 |
+
Example:
|
| 149 |
+
exporter = TableauExporter()
|
| 150 |
+
exporter.add_events(events)
|
| 151 |
+
exporter.add_chains(chains)
|
| 152 |
+
exporter.export("./tableau_data")
|
| 153 |
+
"""
|
| 154 |
+
|
| 155 |
+
def __init__(self):
|
| 156 |
+
self.events: List[EventRow] = []
|
| 157 |
+
self.chains: List[ChainRow] = []
|
| 158 |
+
self.hold_events: List[HoldEventRow] = []
|
| 159 |
+
self.causation_edges: List[CausationEdgeRow] = []
|
| 160 |
+
self.metrics: List[MetricRow] = []
|
| 161 |
+
|
| 162 |
+
def add_event(self, event) -> None:
|
| 163 |
+
"""Add a Cascade Event."""
|
| 164 |
+
data = event.data if hasattr(event, 'data') else {}
|
| 165 |
+
extracted = _extract_metric_fields(data)
|
| 166 |
+
|
| 167 |
+
row = EventRow(
|
| 168 |
+
event_id=event.event_id,
|
| 169 |
+
timestamp=event.timestamp,
|
| 170 |
+
timestamp_iso=_ts_to_iso(event.timestamp),
|
| 171 |
+
component=event.component,
|
| 172 |
+
event_type=event.event_type,
|
| 173 |
+
data_json=json.dumps(data),
|
| 174 |
+
**extracted
|
| 175 |
+
)
|
| 176 |
+
self.events.append(row)
|
| 177 |
+
|
| 178 |
+
def add_events(self, events) -> None:
|
| 179 |
+
"""Add multiple events."""
|
| 180 |
+
for e in events:
|
| 181 |
+
self.add_event(e)
|
| 182 |
+
|
| 183 |
+
def add_chain(self, chain, is_verified: bool = True) -> None:
|
| 184 |
+
"""Add a ProvenanceChain."""
|
| 185 |
+
row = ChainRow(
|
| 186 |
+
session_id=chain.session_id,
|
| 187 |
+
model_id=chain.model_id,
|
| 188 |
+
model_hash=chain.model_hash,
|
| 189 |
+
input_hash=chain.input_hash,
|
| 190 |
+
output_hash=chain.output_hash,
|
| 191 |
+
merkle_root=chain.merkle_root or "",
|
| 192 |
+
created_at=chain.created_at,
|
| 193 |
+
created_at_iso=_ts_to_iso(chain.created_at),
|
| 194 |
+
record_count=len(chain.records),
|
| 195 |
+
external_links_count=len(chain.external_roots),
|
| 196 |
+
is_verified=is_verified,
|
| 197 |
+
)
|
| 198 |
+
self.chains.append(row)
|
| 199 |
+
|
| 200 |
+
def add_chains(self, chains) -> None:
|
| 201 |
+
"""Add multiple chains."""
|
| 202 |
+
for c in chains:
|
| 203 |
+
self.add_chain(c)
|
| 204 |
+
|
| 205 |
+
def add_hold_event(self, hold_point, resolution) -> None:
|
| 206 |
+
"""Add a HOLD event with its resolution."""
|
| 207 |
+
import numpy as np
|
| 208 |
+
|
| 209 |
+
probs = hold_point.action_probs
|
| 210 |
+
if isinstance(probs, np.ndarray):
|
| 211 |
+
ai_choice = int(np.argmax(probs))
|
| 212 |
+
ai_confidence = float(np.max(probs))
|
| 213 |
+
action_count = len(probs)
|
| 214 |
+
else:
|
| 215 |
+
ai_choice = 0
|
| 216 |
+
ai_confidence = 0.0
|
| 217 |
+
action_count = 0
|
| 218 |
+
|
| 219 |
+
row = HoldEventRow(
|
| 220 |
+
hold_id=getattr(hold_point, 'hold_id', f"hold_{hold_point.timestamp}"),
|
| 221 |
+
timestamp=hold_point.timestamp if hasattr(hold_point, 'timestamp') else 0,
|
| 222 |
+
timestamp_iso=_ts_to_iso(hold_point.timestamp) if hasattr(hold_point, 'timestamp') else "",
|
| 223 |
+
brain_id=hold_point.brain_id,
|
| 224 |
+
state=resolution.state.value if hasattr(resolution.state, 'value') else str(resolution.state),
|
| 225 |
+
ai_choice=ai_choice,
|
| 226 |
+
ai_confidence=ai_confidence,
|
| 227 |
+
final_action=resolution.action,
|
| 228 |
+
was_override=resolution.was_override,
|
| 229 |
+
hold_duration_sec=resolution.hold_duration if hasattr(resolution, 'hold_duration') else 0,
|
| 230 |
+
value_estimate=hold_point.value,
|
| 231 |
+
action_count=action_count,
|
| 232 |
+
override_source=resolution.override_source if hasattr(resolution, 'override_source') else None,
|
| 233 |
+
)
|
| 234 |
+
self.hold_events.append(row)
|
| 235 |
+
|
| 236 |
+
def add_causation_link(self, link) -> None:
|
| 237 |
+
"""Add a causation graph edge."""
|
| 238 |
+
row = CausationEdgeRow(
|
| 239 |
+
link_id=link.link_id if hasattr(link, 'link_id') else f"{link.from_event}_{link.to_event}",
|
| 240 |
+
from_event_id=link.from_event,
|
| 241 |
+
to_event_id=link.to_event,
|
| 242 |
+
causation_type=link.causation_type,
|
| 243 |
+
strength=link.strength,
|
| 244 |
+
timestamp=link.timestamp if hasattr(link, 'timestamp') else 0,
|
| 245 |
+
timestamp_iso=_ts_to_iso(link.timestamp) if hasattr(link, 'timestamp') else "",
|
| 246 |
+
)
|
| 247 |
+
self.causation_edges.append(row)
|
| 248 |
+
|
| 249 |
+
def add_causation_links(self, links) -> None:
|
| 250 |
+
"""Add multiple causation links."""
|
| 251 |
+
for link in links:
|
| 252 |
+
self.add_causation_link(link)
|
| 253 |
+
|
| 254 |
+
def add_metric(self, name: str, value: float, timestamp: float,
|
| 255 |
+
category: str = "OTHER", component: str = "default",
|
| 256 |
+
is_anomaly: bool = False, anomaly_severity: str = None) -> None:
|
| 257 |
+
"""Add a time-series metric point."""
|
| 258 |
+
row = MetricRow(
|
| 259 |
+
timestamp=timestamp,
|
| 260 |
+
timestamp_iso=_ts_to_iso(timestamp),
|
| 261 |
+
metric_name=name,
|
| 262 |
+
metric_value=value,
|
| 263 |
+
category=category,
|
| 264 |
+
component=component,
|
| 265 |
+
is_anomaly=is_anomaly,
|
| 266 |
+
anomaly_severity=anomaly_severity,
|
| 267 |
+
)
|
| 268 |
+
self.metrics.append(row)
|
| 269 |
+
|
| 270 |
+
def add_metrics_from_event(self, event, category_map: Dict[str, str] = None) -> None:
|
| 271 |
+
"""Extract and add all metrics from an event."""
|
| 272 |
+
if category_map is None:
|
| 273 |
+
category_map = {
|
| 274 |
+
"loss": "TRAINING_DYNAMICS",
|
| 275 |
+
"accuracy": "TRAINING_DYNAMICS",
|
| 276 |
+
"lr": "TRAINING_DYNAMICS",
|
| 277 |
+
"learning_rate": "TRAINING_DYNAMICS",
|
| 278 |
+
"grad_norm": "GRADIENT_HEALTH",
|
| 279 |
+
"weight_norm": "WEIGHT_DYNAMICS",
|
| 280 |
+
"tokens": "MEMORY_COMPUTE",
|
| 281 |
+
"latency": "MEMORY_COMPUTE",
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
data = event.data if hasattr(event, 'data') else {}
|
| 285 |
+
for key, value in data.items():
|
| 286 |
+
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
| 287 |
+
self.add_metric(
|
| 288 |
+
name=key,
|
| 289 |
+
value=float(value),
|
| 290 |
+
timestamp=event.timestamp,
|
| 291 |
+
category=category_map.get(key, "OTHER"),
|
| 292 |
+
component=event.component,
|
| 293 |
+
)
|
| 294 |
+
|

    def _write_csv(self, path: Path, rows: List, fieldnames: List[str]) -> None:
        """Write rows to CSV."""
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for row in rows:
                writer.writerow(asdict(row) if hasattr(row, '__dataclass_fields__') else row)

    def export(self, output_dir: str) -> Dict[str, str]:
        """
        Export all data to CSV files.

        Args:
            output_dir: Directory to write CSV files

        Returns:
            Dict mapping data type to file path
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        files = {}

        # Events
        if self.events:
            events_path = output_path / "events.csv"
            self._write_csv(events_path, self.events, list(EventRow.__dataclass_fields__.keys()))
            files["events"] = str(events_path)
            print(f"✓ Exported {len(self.events)} events to {events_path}")

        # Chains
        if self.chains:
            chains_path = output_path / "chains.csv"
            self._write_csv(chains_path, self.chains, list(ChainRow.__dataclass_fields__.keys()))
            files["chains"] = str(chains_path)
            print(f"✓ Exported {len(self.chains)} chains to {chains_path}")

        # HOLD events
        if self.hold_events:
            hold_path = output_path / "hold_events.csv"
            self._write_csv(hold_path, self.hold_events, list(HoldEventRow.__dataclass_fields__.keys()))
            files["hold_events"] = str(hold_path)
            print(f"✓ Exported {len(self.hold_events)} HOLD events to {hold_path}")

        # Causation edges
        if self.causation_edges:
            edges_path = output_path / "causation_edges.csv"
            self._write_csv(edges_path, self.causation_edges, list(CausationEdgeRow.__dataclass_fields__.keys()))
            files["causation_edges"] = str(edges_path)
            print(f"✓ Exported {len(self.causation_edges)} causation edges to {edges_path}")

        # Metrics time series
        if self.metrics:
            metrics_path = output_path / "metrics_timeseries.csv"
            self._write_csv(metrics_path, self.metrics, list(MetricRow.__dataclass_fields__.keys()))
            files["metrics"] = str(metrics_path)
            print(f"✓ Exported {len(self.metrics)} metric points to {metrics_path}")

        # Write a manifest
        manifest_path = output_path / "manifest.json"
        manifest = {
            "exported_at": datetime.now().isoformat(),
            "files": files,
            "counts": {
                "events": len(self.events),
                "chains": len(self.chains),
                "hold_events": len(self.hold_events),
                "causation_edges": len(self.causation_edges),
                "metrics": len(self.metrics),
            }
        }
        with open(manifest_path, 'w') as f:
            json.dump(manifest, f, indent=2)

        print(f"\n📊 Tableau export complete: {output_path}")
        print("   Open Tableau → Connect → Text File → Select CSVs")

        return files

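    # Usage sketch (comment only, not executed): feed the exporter rows, then
    # write everything in one call; `my_events` is a hypothetical iterable of
    # Cascade events.
    #
    #   exporter = TableauExporter()
    #   exporter.add_events(my_events)
    #   files = exporter.export("./tableau_export")
    #   files["events"]  # -> path to events.csv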

    def export_hyper(self, output_path: str) -> Optional[str]:
        """
        Export to Tableau Hyper format (native, fastest).

        Requires: pip install tableauhyperapi
        """
        if not HAS_HYPER:
            print("⚠️ Hyper API not installed. Run: pip install tableauhyperapi")
            return None

        hyper_path = Path(output_path)

        with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
            with Connection(hyper.endpoint, str(hyper_path), CreateMode.CREATE_AND_REPLACE) as conn:

                # Create events table (tableauhyperapi expects
                # TableDefinition.Column objects, not bare tuples)
                if self.events:
                    events_table = TableDefinition(
                        TableName("events"),
                        [
                            TableDefinition.Column("event_id", SqlType.text()),
                            TableDefinition.Column("timestamp", SqlType.double()),
                            TableDefinition.Column("timestamp_iso", SqlType.text()),
                            TableDefinition.Column("component", SqlType.text()),
                            TableDefinition.Column("event_type", SqlType.text()),
                            TableDefinition.Column("loss", SqlType.double()),
                            TableDefinition.Column("accuracy", SqlType.double()),
                            TableDefinition.Column("tokens", SqlType.int()),
                        ]
                    )
                    conn.catalog.create_table(events_table)

                    with Inserter(conn, events_table) as inserter:
                        for e in self.events:
                            inserter.add_row([
                                e.event_id, e.timestamp, e.timestamp_iso,
                                e.component, e.event_type,
                                e.loss, e.accuracy, e.tokens
                            ])
                        inserter.execute()

        print(f"✓ Exported Hyper file: {hyper_path}")
        return str(hyper_path)

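# Read-back sketch (assumes tableauhyperapi is installed): the generated
# .hyper file can be queried with the same API to sanity-check row counts.
#
#   with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
#       with Connection(hyper.endpoint, "cascade.hyper") as conn:
#           n = conn.execute_scalar_query('SELECT COUNT(*) FROM "events"')
#           print(f"events rows: {n}")
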
| 419 |
+
# =============================================================================
|
| 420 |
+
# Convenience Functions
|
| 421 |
+
# =============================================================================
|
| 422 |
+
|
| 423 |
+
def export_for_tableau(output_dir: str = "./tableau_export",
|
| 424 |
+
include_sample_data: bool = True) -> Dict[str, str]:
|
| 425 |
+
"""
|
| 426 |
+
One-line export of all Cascade data for Tableau.
|
| 427 |
+
|
| 428 |
+
Args:
|
| 429 |
+
output_dir: Where to write CSV files
|
| 430 |
+
include_sample_data: Generate sample data if no real data
|
| 431 |
+
|
| 432 |
+
Returns:
|
| 433 |
+
Dict of exported file paths
|
| 434 |
+
"""
|
| 435 |
+
exporter = TableauExporter()
|
| 436 |
+
|
| 437 |
+
# Try to load real data from Cascade store
|
| 438 |
+
try:
|
| 439 |
+
from cascade.store import query, stats
|
| 440 |
+
from cascade.observation import ObservationManager
|
| 441 |
+
|
| 442 |
+
# Get observations
|
| 443 |
+
manager = ObservationManager()
|
| 444 |
+
observations = manager.get_recent(limit=1000)
|
| 445 |
+
|
| 446 |
+
for obs in observations:
|
| 447 |
+
# Create mock event from observation
|
| 448 |
+
class MockEvent:
|
| 449 |
+
def __init__(self, o):
|
| 450 |
+
self.event_id = o.get('cid', '')
|
| 451 |
+
self.timestamp = o.get('timestamp', 0)
|
| 452 |
+
self.component = o.get('model_id', 'unknown')
|
| 453 |
+
self.event_type = 'inference'
|
| 454 |
+
self.data = o.get('data', {})
|
| 455 |
+
|
| 456 |
+
exporter.add_event(MockEvent(obs))
|
| 457 |
+
exporter.add_metrics_from_event(MockEvent(obs))
|
| 458 |
+
|
| 459 |
+
print(f"Loaded {len(observations)} observations from Cascade store")
|
| 460 |
+
|
| 461 |
+
except Exception as e:
|
| 462 |
+
print(f"Note: Could not load Cascade store ({e})")
|
| 463 |
+
if include_sample_data:
|
| 464 |
+
print("Generating sample data for demo...")
|
| 465 |
+
_add_sample_data(exporter)
|
| 466 |
+
|
| 467 |
+
return exporter.export(output_dir)
|
| 468 |
+
|
| 469 |
+
|
def _add_sample_data(exporter: TableauExporter) -> None:
    """Add sample data for demonstration."""
    import time
    import random
    import numpy as np

    base_time = time.time() - 3600  # 1 hour ago

    models = ["gpt-4", "claude-3-opus", "llama-3-8b", "mistral-7b"]
    event_types = ["inference", "training_step", "error", "checkpoint"]

    # Lightweight stand-ins for real event/HOLD/link/chain objects
    class SampleEvent:
        def __init__(self, idx):
            self.event_id = f"evt_{idx:06d}"
            self.timestamp = base_time + (idx * 18)  # 18 sec apart
            self.component = random.choice(models)
            self.event_type = random.choice(event_types)
            self.data = {
                "loss": 2.5 - (idx * 0.01) + random.uniform(-0.1, 0.1),
                "accuracy": min(0.95, 0.5 + (idx * 0.002) + random.uniform(-0.02, 0.02)),
                "tokens": random.randint(100, 2000),
                "latency_ms": random.uniform(50, 500),
                "step": idx,
            }

    class SampleHoldPoint:
        def __init__(self, idx):
            self.hold_id = f"hold_{idx:04d}"
            self.timestamp = base_time + (idx * 180)
            self.brain_id = random.choice(models)
            self.action_probs = np.random.dirichlet([1, 1, 1, 1])
            self.value = random.uniform(0.3, 0.9)

    class SampleResolution:
        def __init__(self, override=False):
            self.state = type('State', (), {'value': 'OVERRIDDEN' if override else 'ACCEPTED'})()
            self.action = random.randint(0, 3)
            self.was_override = override
            self.hold_duration = random.uniform(0.5, 10.0)
            self.override_source = "human" if override else None

    class SampleLink:
        def __init__(self, idx):
            self.link_id = f"link_{idx:04d}"
            self.from_event = f"evt_{idx:06d}"
            self.to_event = f"evt_{idx+1:06d}"
            self.causation_type = random.choice(["temporal", "correlation", "threshold", "direct"])
            self.strength = random.uniform(0.5, 1.0)
            self.timestamp = base_time + (idx * 18)

    class SampleChain:
        def __init__(self, idx):
            self.session_id = f"session_{idx:04d}"
            self.model_id = random.choice(models)
            self.model_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
            self.input_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
            self.output_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
            self.merkle_root = f"{random.randint(0, 0xFFFFFFFFFFFFFFFF):016x}"
            self.created_at = base_time + (idx * 360)
            self.records = [None] * random.randint(5, 50)
            self.external_roots = [f"root_{j}" for j in range(random.randint(0, 3))]

    # Sample events
    for i in range(200):
        event = SampleEvent(i)
        exporter.add_event(event)
        exporter.add_metrics_from_event(event)

    # Sample HOLD events (~25% human overrides)
    for i in range(20):
        hold = SampleHoldPoint(i)
        resolution = SampleResolution(override=random.random() < 0.25)
        exporter.add_hold_event(hold, resolution)

    # Sample causation edges
    for i in range(50):
        exporter.add_causation_link(SampleLink(i))

    # Sample chains
    for i in range(10):
        exporter.add_chain(SampleChain(i))


def export_events_csv(events, output_path: str) -> str:
    """Export events to CSV."""
    exporter = TableauExporter()
    exporter.add_events(events)
    files = exporter.export(str(Path(output_path).parent))
    return files.get("events", "")


def export_chains_csv(chains, output_path: str) -> str:
    """Export chains to CSV."""
    exporter = TableauExporter()
    exporter.add_chains(chains)
    files = exporter.export(str(Path(output_path).parent))
    return files.get("chains", "")


def export_metrics_csv(events, output_path: str) -> str:
    """Export metrics time series to CSV."""
    exporter = TableauExporter()
    for e in events:
        exporter.add_metrics_from_event(e)
    files = exporter.export(str(Path(output_path).parent))
    return files.get("metrics", "")


def export_hold_events_csv(hold_pairs, output_path: str) -> str:
    """Export HOLD events to CSV. hold_pairs = [(hold_point, resolution), ...]"""
    exporter = TableauExporter()
    for hold, res in hold_pairs:
        exporter.add_hold_event(hold, res)
    files = exporter.export(str(Path(output_path).parent))
    return files.get("hold_events", "")


def export_causation_graph_csv(links, output_path: str) -> str:
    """Export causation edges to CSV."""
    exporter = TableauExporter()
    exporter.add_causation_links(links)
    files = exporter.export(str(Path(output_path).parent))
    return files.get("causation_edges", "")


if __name__ == "__main__":
    # Quick test
    print("Exporting sample data for Tableau...")
    export_for_tableau("./tableau_export", include_sample_data=True)
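The exported CSVs are plain text, so a quick integrity check can run outside Tableau. A minimal sketch, assuming an export was just written to ./tableau_export and pandas is available (neither the path nor the pandas dependency is part of the module above):

import json
import pandas as pd

# manifest.json is written by TableauExporter.export()
with open("./tableau_export/manifest.json") as f:
    manifest = json.load(f)

# Confirm each CSV's row count matches the manifest
for name, path in manifest["files"].items():
    df = pd.read_csv(path)
    assert len(df) == manifest["counts"][name], f"count mismatch for {name}"
    print(f"{name}: {len(df)} rows")
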
cascade/forensics/__init__.py
ADDED
@@ -0,0 +1,53 @@
"""
CASCADE Forensics - Read the Ghost in the Data

Every dataset is a confession. It remembers what happened to it.
This module reads those memories.

GHOST LOG: Inferred processing history from data artifacts
SKELETON: Probable system architecture
DNA: Technology fingerprints
SOUL: Behavioral predictions

Usage:
    from cascade.forensics import DataForensics

    forensics = DataForensics()
    report = forensics.analyze(dataframe)

    print(report.ghost_log)     # Inferred operations
    print(report.likely_stack)  # System architecture
    print(report.fingerprints)  # Technology hints
"""

from cascade.forensics.analyzer import (
    DataForensics,
    ForensicsReport,
    GhostLog,
    InferredOperation,
)

from cascade.forensics.artifacts import (
    ArtifactDetector,
    TimestampArtifacts,
    IDPatternArtifacts,
    TextArtifacts,
    NumericArtifacts,
    NullPatternArtifacts,
    SchemaArtifacts,
)

from cascade.forensics.fingerprints import (
    TechFingerprinter,
    Fingerprint,
)

__all__ = [
    "DataForensics",
    "ForensicsReport",
    "GhostLog",
    "InferredOperation",
    "ArtifactDetector",
    "TechFingerprinter",
    "Fingerprint",
]
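As a concrete illustration of the public API above, a minimal end-to-end sketch (the DataFrame is fabricated demo data chosen to trip the timestamp-rounding and sequential-gap detectors; pandas is assumed):

import pandas as pd
from cascade.forensics import DataForensics

# Minute-rounded timestamps and gappy sequential IDs: exactly the kinds
# of artifacts the detectors look for.
df = pd.DataFrame({
    "id": [1, 2, 3, 5, 8, 9, 12, 15, 16, 20, 21, 25],
    "created_at": pd.date_range("2024-01-01", periods=12, freq="5min"),
    "name": ["alice", "bob", "carol"] * 4,
})

report = DataForensics().analyze(df)
print(report.summary())
print(report.ghost_log.to_narrative())
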
cascade/forensics/analyzer.py
ADDED
@@ -0,0 +1,464 @@
"""
CASCADE Forensics - Main Analyzer

The data remembers. This module reads those memories.

Generates:
- GHOST LOG: Inferred sequence of operations
- SKELETON: Probable system architecture
- DNA: Technology fingerprints
- SOUL: Behavioral predictions
"""

import hashlib
import json
import time
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

from cascade.forensics.artifacts import (
    Artifact, ArtifactDetector,
    TimestampArtifacts, IDPatternArtifacts, TextArtifacts,
    NumericArtifacts, NullPatternArtifacts, SchemaArtifacts,
)
from cascade.forensics.fingerprints import TechFingerprinter, Fingerprint


@dataclass
class InferredOperation:
    """A single inferred operation from the ghost log."""
    sequence: int
    operation: str
    description: str
    confidence: float
    evidence: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "seq": self.sequence,
            "op": self.operation,
            "desc": self.description,
            "confidence": self.confidence,
            "evidence": self.evidence,
        }


@dataclass
class GhostLog:
    """
    Inferred processing history - the ghost of the system.

    This is a reconstruction of what PROBABLY happened
    based on artifacts left in the data.
    """
    operations: List[InferredOperation] = field(default_factory=list)

    # Provenance
    analysis_timestamp: float = field(default_factory=time.time)
    data_hash: str = ""
    ghost_hash: str = ""

    def add_operation(self, op: str, desc: str, confidence: float, evidence: List[str] = None):
        """Add an inferred operation to the ghost log."""
        self.operations.append(InferredOperation(
            sequence=len(self.operations) + 1,
            operation=op,
            description=desc,
            confidence=confidence,
            evidence=evidence or [],
        ))

    def finalize(self) -> str:
        """Compute hash of the ghost log for provenance."""
        content = json.dumps([op.to_dict() for op in self.operations], sort_keys=True)
        self.ghost_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
        return self.ghost_hash

    def to_dict(self) -> Dict[str, Any]:
        return {
            "operations": [op.to_dict() for op in self.operations],
            "analysis_timestamp": self.analysis_timestamp,
            "data_hash": self.data_hash,
            "ghost_hash": self.ghost_hash,
        }

    def to_narrative(self) -> str:
        """Generate human-readable narrative of inferred processing."""
        if not self.operations:
            return "No processing artifacts detected."

        lines = ["## Ghost Log - Inferred Processing History\n"]
        lines.append("*Based on artifacts left in the data, this is what probably happened:*\n")

        for op in self.operations:
            conf_str = "●" * int(op.confidence * 5) + "○" * (5 - int(op.confidence * 5))
            lines.append(f"**{op.sequence}. {op.operation}** [{conf_str}]")
            lines.append(f"   {op.description}")
            if op.evidence:
                lines.append(f"   *Evidence: {', '.join(op.evidence[:3])}*")
            lines.append("")

        return "\n".join(lines)

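# GhostLog usage sketch (illustrative values only):
#
#   ghost = GhostLog()
#   ghost.add_operation("BATCH_HOURLY", "Timestamps rounded to the hour",
#                       confidence=0.95, evidence=["0 minutes, 0 seconds"])
#   ghost.finalize()             # sets ghost.ghost_hash for provenance
#   print(ghost.to_narrative())  # markdown-style summary of inferred steps
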
@dataclass
class ForensicsReport:
    """Complete forensics analysis report."""

    # Artifacts detected
    artifacts: List[Artifact] = field(default_factory=list)

    # Inferred processing
    ghost_log: GhostLog = field(default_factory=GhostLog)

    # Technology fingerprints
    fingerprints: List[Fingerprint] = field(default_factory=list)

    # Synthesized architecture
    likely_stack: Dict[str, Any] = field(default_factory=dict)

    # Security concerns
    security_concerns: List[Dict[str, Any]] = field(default_factory=list)

    # Metadata
    analysis_timestamp: float = field(default_factory=time.time)
    row_count: int = 0
    column_count: int = 0
    data_hash: str = ""

    def to_dict(self) -> Dict[str, Any]:
        return {
            "artifacts": [a.to_dict() for a in self.artifacts],
            "ghost_log": self.ghost_log.to_dict(),
            "fingerprints": [f.to_dict() for f in self.fingerprints],
            "likely_stack": self.likely_stack,
            "security_concerns": self.security_concerns,
            "metadata": {
                "timestamp": self.analysis_timestamp,
                "rows": self.row_count,
                "columns": self.column_count,
                "data_hash": self.data_hash,
            }
        }

    def summary(self) -> Dict[str, Any]:
        """Generate summary for display."""
        return {
            "artifacts_found": len(self.artifacts),
            "operations_inferred": len(self.ghost_log.operations),
            "technologies_identified": len(self.fingerprints),
            "security_concerns": len(self.security_concerns),
            "top_fingerprints": [f.technology for f in self.fingerprints[:5]],
            "data_hash": self.data_hash,
            "ghost_hash": self.ghost_log.ghost_hash,
        }


class DataForensics:
    """
    Main forensics analyzer.

    Usage:
        forensics = DataForensics()
        report = forensics.analyze(df)

        print(report.ghost_log.to_narrative())
        print(report.likely_stack)
    """

    def __init__(self):
        self.detectors = [
            TimestampArtifacts(),
            IDPatternArtifacts(),
            TextArtifacts(),
            NumericArtifacts(),
            NullPatternArtifacts(),
            SchemaArtifacts(),
        ]
        self.fingerprinter = TechFingerprinter()

    def analyze(self, df) -> ForensicsReport:
        """
        Analyze a dataframe for processing artifacts.

        Args:
            df: Pandas DataFrame to analyze

        Returns:
            ForensicsReport with all findings
        """
        report = ForensicsReport()
        report.row_count = len(df)
        report.column_count = len(df.columns)

        # Compute data hash (sample large datasets to keep hashing cheap)
        try:
            if len(df) > 10000:
                sample = df.sample(10000, random_state=42)
            else:
                sample = df
            content = sample.to_json()
            report.data_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
        except Exception:
            report.data_hash = "unknown"

        # Run all detectors
        all_artifacts = []

        for detector in self.detectors:
            try:
                # detect_all() already walks every column (the base-class
                # implementation calls detect() per column), so running both
                # paths would double-count artifacts.
                if hasattr(detector, 'detect_all'):
                    all_artifacts.extend(detector.detect_all(df))
                else:
                    # Column-by-column analysis
                    for col in df.columns:
                        all_artifacts.extend(detector.detect(df, col))
            except Exception:
                # Don't let one detector crash the whole analysis
                pass

        report.artifacts = all_artifacts

        # Build ghost log from artifacts
        report.ghost_log = self._build_ghost_log(all_artifacts, df)
        report.ghost_log.data_hash = report.data_hash
        report.ghost_log.finalize()

        # Generate technology fingerprints
        report.fingerprints = self.fingerprinter.analyze(all_artifacts)
        report.likely_stack = self.fingerprinter.get_likely_stack()
        report.security_concerns = self.fingerprinter.get_security_concerns()

        return report

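    # What a caller typically inspects after analyze() (field names per the
    # dataclasses above; concrete values are illustrative):
    #
    #   report = DataForensics().analyze(df)
    #   report.summary()                 # counts + top fingerprints
    #   report.ghost_log.to_narrative()  # markdown-style reconstruction
    #   report.to_dict()                 # JSON-serializable full report
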
    def _build_ghost_log(self, artifacts: List[Artifact], df) -> GhostLog:
        """
        Build inferred processing history from artifacts.

        This is where we reconstruct the sequence of operations
        that probably created this data.
        """
        ghost = GhostLog()

        # Group artifacts by type for logical ordering
        by_type = {}
        for a in artifacts:
            if a.artifact_type not in by_type:
                by_type[a.artifact_type] = []
            by_type[a.artifact_type].append(a)

        # Infer operations in logical order

        # 1. Data sourcing (schema artifacts come first)
        if "framework_fingerprint" in by_type:
            for a in by_type["framework_fingerprint"]:
                ghost.add_operation(
                    "DATA_SOURCE",
                    f"Data originated from {a.details.get('framework', 'database')}: {a.evidence}",
                    a.confidence,
                    [a.evidence]
                )

        if "naming_convention" in by_type:
            for a in by_type["naming_convention"]:
                ghost.add_operation(
                    "SCHEMA_ORIGIN",
                    f"Schema follows {a.details.get('convention', 'unknown')} convention",
                    a.confidence,
                    [a.evidence]
                )

        # 2. Merging (if multiple sources detected)
        if "mixed_conventions" in by_type or "id_prefix" in by_type:
            ghost.add_operation(
                "DATA_MERGE",
                "Multiple data sources were merged together",
                0.75,
                [a.evidence for a in by_type.get("mixed_conventions", []) + by_type.get("id_prefix", [])]
            )

        # 3. ID generation
        if "uuid_version" in by_type:
            for a in by_type["uuid_version"]:
                ghost.add_operation(
                    "ID_GENERATION",
                    f"IDs generated using {a.details.get('meaning', 'UUID')}",
                    a.confidence,
                    [a.evidence]
                )

        if "hash_id" in by_type:
            for a in by_type["hash_id"]:
                ghost.add_operation(
                    "ID_GENERATION",
                    f"IDs are {a.details.get('probable_algorithm', 'hash')}-based (content-addressed)",
                    a.confidence,
                    [a.evidence]
                )

        # 4. Processing / Transformation
        if "case_normalization" in by_type:
            for a in by_type["case_normalization"]:
                ghost.add_operation(
                    "TEXT_NORMALIZATION",
                    f"Text converted to {a.details.get('case', 'normalized')} case",
                    a.confidence,
                    [a.evidence]
                )

        if "whitespace_trimming" in by_type:
            ghost.add_operation(
                "TEXT_CLEANING",
                "Whitespace trimmed from text fields",
                0.70,
                [a.evidence for a in by_type["whitespace_trimming"]]
            )

        if "truncation" in by_type:
            for a in by_type["truncation"]:
                ghost.add_operation(
                    "FIELD_TRUNCATION",
                    f"Text truncated at {a.details.get('max_length', '?')} characters",
                    a.confidence,
                    [a.evidence]
                )

        if "numeric_rounding" in by_type:
            for a in by_type["numeric_rounding"]:
                ghost.add_operation(
                    "NUMERIC_ROUNDING",
                    f"Numbers rounded: {a.evidence}",
                    a.confidence,
                    [a.evidence]
                )

        # 5. Filtering / Deletion
        if "sequential_id_gaps" in by_type:
            for a in by_type["sequential_id_gaps"]:
                gap_ratio = a.details.get('gap_ratio', 0)
                ghost.add_operation(
                    "RECORD_FILTERING",
                    f"~{gap_ratio*100:.0f}% of records were filtered or deleted",
                    a.confidence,
                    [a.evidence]
                )

        if "hard_cutoff" in by_type:
            for a in by_type["hard_cutoff"]:
                ghost.add_operation(
                    "VALUE_CAPPING",
                    f"Values capped at {a.details.get('cutoff', '?')}",
                    a.confidence,
                    [a.evidence]
                )

        # 6. Batch processing patterns
        if "timestamp_rounding" in by_type:
            for a in by_type["timestamp_rounding"]:
                ghost.add_operation(
                    "BATCH_PROCESSING",
                    f"Data processed in batches: {a.evidence}",
                    a.confidence,
                    [a.evidence]
                )

        if "regular_intervals" in by_type:
            for a in by_type["regular_intervals"]:
                ghost.add_operation(
                    "SCHEDULED_JOB",
                    f"Regular processing schedule detected: {a.details.get('interval_desc', 'unknown')}",
                    a.confidence,
                    [a.evidence]
                )

        if "temporal_clustering" in by_type:
            ghost.add_operation(
                "BURST_PROCESSING",
                "Event-driven or burst batch processing detected",
                0.75,
                [a.evidence for a in by_type["temporal_clustering"]]
            )

        # 7. Data quality issues
        if "encoding_artifact" in by_type:
            for a in by_type["encoding_artifact"]:
                ghost.add_operation(
                    "ENCODING_ERROR",
                    f"Character encoding conversion failed: {a.evidence}",
                    a.confidence,
                    [a.evidence]
                )

        if "sentinel_value" in by_type:
            for a in by_type["sentinel_value"]:
                ghost.add_operation(
                    "NULL_HANDLING",
                    f"NULLs represented as sentinel value {a.details.get('sentinel', '?')}",
                    a.confidence,
                    [a.evidence]
                )

        if "high_null_rate" in by_type:
            for a in by_type["high_null_rate"]:
                ghost.add_operation(
                    "OPTIONAL_FIELD",
                    f"Column {a.column} is optional or had ETL issues ({a.details.get('null_rate', 0)*100:.0f}% null)",
                    a.confidence,
                    [a.evidence]
                )

        # 8. Export (often the last step)
        if any("PANDAS" in a.inferred_operation for a in artifacts):
            ghost.add_operation(
                "DATA_EXPORT",
                "Data exported via Pandas to CSV",
                0.90,
                ["Unnamed column artifact"]
            )

        return ghost

    def analyze_file(self, filepath: str) -> ForensicsReport:
        """
        Analyze a data file.

        Supports: CSV, JSON, JSONL, Parquet, Excel
        """
        import pandas as pd
        from pathlib import Path

        path = Path(filepath)
        suffix = path.suffix.lower()

        if suffix == '.csv':
            df = pd.read_csv(filepath)
        elif suffix == '.json':
            df = pd.read_json(filepath)
        elif suffix == '.jsonl':
            df = pd.read_json(filepath, lines=True)
        elif suffix == '.parquet':
            df = pd.read_parquet(filepath)
        elif suffix in ['.xlsx', '.xls']:
            df = pd.read_excel(filepath)
        else:
            # Try CSV as default
            df = pd.read_csv(filepath)

        return self.analyze(df)


def analyze_dataframe(df) -> ForensicsReport:
    """Convenience function to analyze a dataframe."""
    forensics = DataForensics()
    return forensics.analyze(df)


def analyze_file(filepath: str) -> ForensicsReport:
    """Convenience function to analyze a file."""
    forensics = DataForensics()
    return forensics.analyze_file(filepath)
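A file-level sketch of the convenience wrappers above (the CSV path is hypothetical; dispatch happens on the file suffix):

from cascade.forensics.analyzer import analyze_file

report = analyze_file("exports/users.csv")
print(report.summary())
for op in report.ghost_log.operations:
    print(op.sequence, op.operation, f"{op.confidence:.2f}")
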
cascade/forensics/artifacts.py
ADDED
@@ -0,0 +1,1063 @@
"""
CASCADE Forensics - Artifact Detectors

Each detector looks for specific patterns in data that reveal
how it was processed. The data remembers. We read.
"""

import re
import hashlib
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set, Tuple
from datetime import datetime
from collections import Counter
import statistics


@dataclass
class Artifact:
    """A single detected artifact - evidence of processing."""
    artifact_type: str
    column: str
    evidence: str
    confidence: float  # 0.0 to 1.0
    inferred_operation: str
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "type": self.artifact_type,
            "column": self.column,
            "evidence": self.evidence,
            "confidence": self.confidence,
            "inferred_op": self.inferred_operation,
            "details": self.details,
        }


class ArtifactDetector:
    """Base class for artifact detection."""

    name: str = "base"

    def detect(self, df, column: str) -> List[Artifact]:
        """Detect artifacts in a column. Override in subclasses."""
        return []

    def detect_all(self, df) -> List[Artifact]:
        """Detect artifacts across all applicable columns."""
        artifacts = []
        for col in df.columns:
            artifacts.extend(self.detect(df, col))
        return artifacts

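# Writing a new detector is just subclassing ArtifactDetector and overriding
# detect(); a minimal sketch (hypothetical, not shipped with the package):
#
#   class ConstantColumnArtifacts(ArtifactDetector):
#       name = "constant_column"
#
#       def detect(self, df, column):
#           values = df[column].dropna()
#           if len(values) >= 10 and values.nunique() == 1:
#               return [Artifact(
#                   artifact_type="constant_column",
#                   column=column,
#                   evidence=f"All values equal {values.iloc[0]!r}",
#                   confidence=0.9,
#                   inferred_operation="DEFAULT_FILL_OR_DEAD_FIELD",
#               )]
#           return []
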
class TimestampArtifacts(ArtifactDetector):
    """
    Detect timestamp patterns that reveal processing behavior.

    Artifacts detected:
    - Rounding to minute/hour/day (batch processing intervals)
    - Regular intervals (scheduled jobs)
    - Temporal clustering (burst processing)
    - Timezone artifacts
    - Future/past anomalies
    """

    name = "timestamp"

    def detect(self, df, column: str) -> List[Artifact]:
        artifacts = []

        # Check if column looks like timestamps
        if not self._is_timestamp_column(df, column):
            return artifacts

        try:
            timestamps = self._parse_timestamps(df, column)
            if len(timestamps) < 2:
                return artifacts

            # Check for rounding patterns
            rounding = self._detect_rounding(timestamps)
            if rounding:
                artifacts.append(rounding)

            # Check for regular intervals
            intervals = self._detect_intervals(timestamps)
            if intervals:
                artifacts.append(intervals)

            # Check for clustering
            clustering = self._detect_clustering(timestamps)
            if clustering:
                artifacts.append(clustering)

            # Check for timezone issues
            tz_artifacts = self._detect_timezone_artifacts(timestamps)
            artifacts.extend(tz_artifacts)

        except Exception:
            pass

        return artifacts

    def _is_timestamp_column(self, df, column: str) -> bool:
        """Heuristic to detect timestamp columns."""
        col_lower = column.lower()
        timestamp_hints = ['time', 'date', 'created', 'updated', 'modified', 'timestamp', '_at', '_on']
        if any(hint in col_lower for hint in timestamp_hints):
            return True

        # Check data type
        dtype = str(df[column].dtype)
        if 'datetime' in dtype or 'time' in dtype:
            return True

        # Sample and check format
        sample = df[column].dropna().head(5).astype(str).tolist()
        date_patterns = [
            r'\d{4}-\d{2}-\d{2}',
            r'\d{2}/\d{2}/\d{4}',
            r'\d{10,13}',  # Unix timestamp
        ]
        for val in sample:
            for pattern in date_patterns:
                if re.search(pattern, val):
                    return True

        return False

    def _parse_timestamps(self, df, column: str) -> List[datetime]:
        """Parse column to datetime objects."""
        import pandas as pd

        try:
            # Try pandas datetime conversion
            parsed = pd.to_datetime(df[column], errors='coerce')
            return [ts.to_pydatetime() for ts in parsed.dropna()]
        except Exception:
            return []

    def _detect_rounding(self, timestamps: List[datetime]) -> Optional[Artifact]:
        """Detect if timestamps are rounded to specific intervals."""
        if len(timestamps) < 10:
            return None

        # Check seconds
        seconds = [ts.second for ts in timestamps]
        unique_seconds = set(seconds)

        # All zeros = minute rounding (or coarser)
        if unique_seconds == {0}:
            # Check minutes
            minutes = [ts.minute for ts in timestamps]
            unique_minutes = set(minutes)

            if unique_minutes == {0}:
                return Artifact(
                    artifact_type="timestamp_rounding",
                    column="timestamps",
                    evidence="All timestamps rounded to hour (0 minutes, 0 seconds)",
                    confidence=0.95,
                    inferred_operation="BATCH_HOURLY",
                    details={"interval": "hour", "sample_size": len(timestamps)}
                )
            elif all(m % 15 == 0 for m in minutes):
                return Artifact(
                    artifact_type="timestamp_rounding",
                    column="timestamps",
                    evidence="Timestamps rounded to 15-minute intervals",
                    confidence=0.90,
                    inferred_operation="BATCH_15MIN",
                    details={"interval": "15min", "unique_minutes": list(unique_minutes)}
                )
            elif all(m % 5 == 0 for m in minutes):
                return Artifact(
                    artifact_type="timestamp_rounding",
                    column="timestamps",
                    evidence="Timestamps rounded to 5-minute intervals",
                    confidence=0.85,
                    inferred_operation="BATCH_5MIN",
                    details={"interval": "5min"}
                )
            else:
                return Artifact(
                    artifact_type="timestamp_rounding",
                    column="timestamps",
                    evidence="Timestamps rounded to minute (0 seconds)",
                    confidence=0.85,
                    inferred_operation="BATCH_MINUTE",
                    details={"interval": "minute"}
                )

        # Check if seconds cluster on specific values
        second_counts = Counter(seconds)
        most_common = second_counts.most_common(1)[0]
        if most_common[1] > len(timestamps) * 0.8:
            return Artifact(
                artifact_type="timestamp_rounding",
                column="timestamps",
                evidence=f"{most_common[1]/len(timestamps)*100:.0f}% of timestamps have second={most_common[0]}",
                confidence=0.70,
                inferred_operation="SYSTEMATIC_TIMESTAMP_ASSIGNMENT",
                details={"dominant_second": most_common[0], "percentage": most_common[1]/len(timestamps)}
            )

        return None

    def _detect_intervals(self, timestamps: List[datetime]) -> Optional[Artifact]:
        """Detect regular time intervals suggesting scheduled jobs."""
        if len(timestamps) < 10:
            return None

        sorted_ts = sorted(timestamps)
        deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]

        if not deltas:
            return None

        # Check for consistent intervals
        median_delta = statistics.median(deltas)
        if median_delta == 0:
            return None

        # Count how many deltas are close to the median
        tolerance = median_delta * 0.1  # 10% tolerance
        consistent = sum(1 for d in deltas if abs(d - median_delta) < tolerance)
        consistency_ratio = consistent / len(deltas)

        if consistency_ratio > 0.7:
            # Describe the interval
            interval_desc = self._describe_interval(median_delta)
            return Artifact(
                artifact_type="regular_intervals",
                column="timestamps",
                evidence=f"{consistency_ratio*100:.0f}% of records have ~{interval_desc} intervals",
                confidence=min(0.95, consistency_ratio),
                inferred_operation=f"SCHEDULED_JOB_{interval_desc.upper().replace(' ', '_')}",
                details={
                    "median_seconds": median_delta,
                    "interval_desc": interval_desc,
                    "consistency": consistency_ratio
                }
            )

        return None

    def _describe_interval(self, seconds: float) -> str:
        """Human-readable interval description."""
        if seconds < 60:
            return f"{seconds:.0f}s"
        elif seconds < 3600:
            return f"{seconds/60:.0f}min"
        elif seconds < 86400:
            return f"{seconds/3600:.1f}hr"
        else:
            return f"{seconds/86400:.1f}day"

    def _detect_clustering(self, timestamps: List[datetime]) -> Optional[Artifact]:
        """Detect temporal clustering (burst processing)."""
        if len(timestamps) < 20:
            return None

        sorted_ts = sorted(timestamps)

        # Look for bursts: many records in a short time, then gaps
        deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]

        if not deltas:
            return None

        median_delta = statistics.median(deltas)
        if median_delta == 0:
            return None

        # Count "burst" deltas (much smaller than median) vs "gap" deltas (much larger)
        bursts = sum(1 for d in deltas if d < median_delta * 0.1)
        gaps = sum(1 for d in deltas if d > median_delta * 5)

        if bursts > len(deltas) * 0.3 and gaps > len(deltas) * 0.05:
            return Artifact(
                artifact_type="temporal_clustering",
                column="timestamps",
                evidence=f"Burst pattern: {bursts} rapid records, {gaps} long gaps",
                confidence=0.75,
                inferred_operation="BATCH_BURST_PROCESSING",
                details={
                    "burst_count": bursts,
                    "gap_count": gaps,
                    "median_delta_seconds": median_delta
                }
            )

        return None

    def _detect_timezone_artifacts(self, timestamps: List[datetime]) -> List[Artifact]:
        """Detect timezone-related artifacts."""
        artifacts = []

        # Check for hour-distribution anomalies (e.g., no records 0-7 UTC suggests US business hours)
        hours = [ts.hour for ts in timestamps]
        hour_counts = Counter(hours)

        # Check for gaps suggesting business hours in a specific timezone
        zero_hours = [h for h in range(24) if hour_counts.get(h, 0) == 0]

        if 6 <= len(zero_hours) <= 12:
            # Contiguous gap?
            zero_hours_sorted = sorted(zero_hours)
            if zero_hours_sorted[-1] - zero_hours_sorted[0] == len(zero_hours) - 1:
                artifacts.append(Artifact(
                    artifact_type="business_hours",
                    column="timestamps",
                    evidence=f"No records during hours {min(zero_hours)}-{max(zero_hours)} UTC",
                    confidence=0.70,
                    inferred_operation="BUSINESS_HOURS_ONLY",
                    details={"quiet_hours": zero_hours}
                ))

        return artifacts

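# Quick check of the rounding heuristics on synthetic data (sketch; pandas
# required):
#
#   import pandas as pd
#   df = pd.DataFrame({"created_at": pd.date_range("2024-01-01", periods=24, freq="h")})
#   for artifact in TimestampArtifacts().detect(df, "created_at"):
#       print(artifact.inferred_operation, artifact.evidence)
#
#   # hourly data has second == 0 and minute == 0 throughout, so
#   # _detect_rounding reports BATCH_HOURLY (and the perfectly regular
#   # 3600 s spacing also triggers a SCHEDULED_JOB artifact)
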
class IDPatternArtifacts(ArtifactDetector):
    """
    Detect ID patterns that reveal data lineage.

    Artifacts detected:
    - Sequential IDs with gaps (deletions/filtering)
    - UUID versions (generation method)
    - Prefixes (source identification)
    - Hash patterns (deterministic generation)
    """

    name = "id_patterns"

    def detect(self, df, column: str) -> List[Artifact]:
        artifacts = []

        if not self._is_id_column(df, column):
            return artifacts

        try:
            values = df[column].dropna().astype(str).tolist()
            if len(values) < 5:
                return artifacts

            # Check for sequential integers with gaps
            gaps = self._detect_sequential_gaps(values, column)
            if gaps:
                artifacts.append(gaps)

            # Check for UUID patterns
            uuid_artifact = self._detect_uuid_patterns(values, column)
            if uuid_artifact:
                artifacts.append(uuid_artifact)

            # Check for prefixes
            prefix = self._detect_prefixes(values, column)
            if prefix:
                artifacts.append(prefix)

            # Check for hash patterns
            hash_artifact = self._detect_hash_patterns(values, column)
            if hash_artifact:
                artifacts.append(hash_artifact)

        except Exception:
            pass

        return artifacts

    def _is_id_column(self, df, column: str) -> bool:
        """Heuristic to detect ID columns."""
        col_lower = column.lower()
        id_hints = ['id', 'key', 'uuid', 'guid', 'pk', '_id', 'identifier']
        return any(hint in col_lower for hint in id_hints)

    def _detect_sequential_gaps(self, values: List[str], column: str) -> Optional[Artifact]:
        """Detect sequential IDs with gaps indicating deletions."""
        # Try to parse as integers
        try:
            ints = sorted(int(v) for v in values if v.isdigit())
            if len(ints) < 10:
                return None

            # Check for gaps
            expected_count = ints[-1] - ints[0] + 1
            actual_count = len(set(ints))
            gap_count = expected_count - actual_count
            gap_ratio = gap_count / expected_count if expected_count > 0 else 0

            if gap_ratio > 0.05:  # More than 5% missing
                return Artifact(
                    artifact_type="sequential_id_gaps",
                    column=column,
                    evidence=f"Sequential IDs with {gap_ratio*100:.1f}% gaps ({gap_count} missing)",
                    confidence=0.85,
                    inferred_operation="FILTERING_OR_DELETION",
                    details={
                        "min_id": ints[0],
                        "max_id": ints[-1],
                        "expected": expected_count,
                        "actual": actual_count,
                        "gap_ratio": gap_ratio
                    }
                )
        except (ValueError, TypeError):
            pass

        return None

    def _detect_uuid_patterns(self, values: List[str], column: str) -> Optional[Artifact]:
        """Detect UUID version from patterns."""
        uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-([0-9a-f])[0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)

        versions = []
        for v in values[:100]:  # Sample
            match = uuid_pattern.match(v)
            if match:
                versions.append(match.group(1))

        if len(versions) < len(values[:100]) * 0.5:
            return None

        version_counts = Counter(versions)
        dominant = version_counts.most_common(1)[0]

        version_meanings = {
            '1': 'TIME_BASED_MAC',    # Reveals generation time + machine
            '2': 'DCE_SECURITY',
            '3': 'MD5_HASH',          # Deterministic from input
            '4': 'RANDOM',            # Crypto random
            '5': 'SHA1_HASH',         # Deterministic from input
            '6': 'SORTABLE_TIME',     # Modern time-sortable
            '7': 'UNIX_TIME_RANDOM',  # Time-ordered with randomness
        }

        return Artifact(
            artifact_type="uuid_version",
            column=column,
            evidence=f"UUIDs are version {dominant[0]} ({version_meanings.get(dominant[0], 'UNKNOWN')})",
            confidence=0.90,
            inferred_operation=f"UUID_GENERATION_V{dominant[0]}",
            details={
                "version": dominant[0],
                "meaning": version_meanings.get(dominant[0], 'unknown'),
                "sample_count": len(versions)
            }
        )

    def _detect_prefixes(self, values: List[str], column: str) -> Optional[Artifact]:
        """Detect common prefixes indicating source systems."""
        if len(values) < 10:
            return None

        # Find the longest prefix length shared by (almost) all sampled values
        prefix_len = 0
        for i in range(1, min(20, min(len(v) for v in values[:100]))):
            prefixes = set(v[:i] for v in values[:100])
            if len(prefixes) <= 3:  # Allow up to 3 different prefixes
                prefix_len = i
            else:
                break

        if prefix_len >= 2:
            prefixes = Counter(v[:prefix_len] for v in values)
            top_prefixes = prefixes.most_common(3)

            return Artifact(
                artifact_type="id_prefix",
                column=column,
                evidence=f"IDs have systematic prefix: {top_prefixes}",
                confidence=0.80,
                inferred_operation="MULTI_SOURCE_MERGE" if len(top_prefixes) > 1 else "SOURCE_IDENTIFICATION",
                details={
                    "prefixes": dict(top_prefixes),
                    "prefix_length": prefix_len
                }
            )

        return None

    def _detect_hash_patterns(self, values: List[str], column: str) -> Optional[Artifact]:
        """Detect if IDs look like hashes."""
        hex_pattern = re.compile(r'^[0-9a-f]+$', re.I)

        hex_lengths = []
        for v in values[:100]:
            if hex_pattern.match(v):
                hex_lengths.append(len(v))

        if len(hex_lengths) < len(values[:100]) * 0.8:
            return None

        # Check for consistent hash lengths
        length_counts = Counter(hex_lengths)
        dominant = length_counts.most_common(1)[0]

        hash_types = {
            32: 'MD5',
            40: 'SHA1',
            64: 'SHA256',
            128: 'SHA512',
            16: 'SHORT_HASH',
        }

        if dominant[1] > len(hex_lengths) * 0.9:
            hash_type = hash_types.get(dominant[0], f'{dominant[0]}-char hash')
            return Artifact(
                artifact_type="hash_id",
                column=column,
                evidence=f"IDs are {hash_type} hashes ({dominant[0]} hex chars)",
                confidence=0.85,
                inferred_operation=f"DETERMINISTIC_ID_GENERATION_{hash_type}",
                details={
                    "hash_length": dominant[0],
                    "probable_algorithm": hash_type
                }
            )

        return None

| 524 |
+
class TextArtifacts(ArtifactDetector):
|
| 525 |
+
"""
|
| 526 |
+
Detect text processing artifacts.
|
| 527 |
+
|
| 528 |
+
Artifacts detected:
|
| 529 |
+
- Truncation (field length limits)
|
| 530 |
+
- Encoding issues (charset conversion)
|
| 531 |
+
- Case normalization
|
| 532 |
+
- Whitespace patterns
|
| 533 |
+
- Sanitization patterns
|
| 534 |
+
"""
|
| 535 |
+
|
| 536 |
+
name = "text"
|
| 537 |
+
|
| 538 |
+
def detect(self, df, column: str) -> List[Artifact]:
|
| 539 |
+
artifacts = []
|
| 540 |
+
|
| 541 |
+
dtype = str(df[column].dtype)
|
| 542 |
+
if 'object' not in dtype and 'str' not in dtype:
|
| 543 |
+
return artifacts
|
| 544 |
+
|
| 545 |
+
try:
|
| 546 |
+
values = df[column].dropna().astype(str).tolist()
|
| 547 |
+
if len(values) < 5:
|
| 548 |
+
return artifacts
|
| 549 |
+
|
| 550 |
+
# Truncation
|
| 551 |
+
trunc = self._detect_truncation(values)
|
| 552 |
+
if trunc:
|
| 553 |
+
artifacts.append(trunc)
|
| 554 |
+
|
| 555 |
+
# Encoding issues
|
| 556 |
+
encoding = self._detect_encoding_artifacts(values)
|
| 557 |
+
if encoding:
|
| 558 |
+
artifacts.append(encoding)
|
| 559 |
+
|
| 560 |
+
# Case patterns
|
| 561 |
+
case = self._detect_case_patterns(values, column)
|
| 562 |
+
if case:
|
| 563 |
+
artifacts.append(case)
|
| 564 |
+
|
| 565 |
+
# Whitespace
|
| 566 |
+
ws = self._detect_whitespace_patterns(values)
|
| 567 |
+
if ws:
|
| 568 |
+
artifacts.append(ws)
|
| 569 |
+
|
| 570 |
+
except Exception:
|
| 571 |
+
pass
|
| 572 |
+
|
| 573 |
+
return artifacts
|
| 574 |
+
|
| 575 |
+
def _detect_truncation(self, values: List[str]) -> Optional[Artifact]:
|
| 576 |
+
"""Detect truncation at specific lengths."""
|
| 577 |
+
lengths = [len(v) for v in values]
|
| 578 |
+
max_len = max(lengths)
|
| 579 |
+
|
| 580 |
+
# Count values at max length
|
| 581 |
+
at_max = sum(1 for l in lengths if l == max_len)
|
| 582 |
+
|
| 583 |
+
# If many values hit the max, likely truncation
|
| 584 |
+
if at_max > len(values) * 0.1 and max_len > 10:
|
| 585 |
+
# Check if values at max look truncated (end mid-word, etc.)
|
| 586 |
+
max_values = [v for v in values if len(v) == max_len]
|
| 587 |
+
truncated_looking = sum(1 for v in max_values if not v.endswith(('.', '!', '?', ' ')))
|
| 588 |
+
|
| 589 |
+
if truncated_looking > len(max_values) * 0.5:
|
| 590 |
+
return Artifact(
|
| 591 |
+
artifact_type="truncation",
|
| 592 |
+
column=str(values[0])[:20] if values else "text",
|
| 593 |
+
evidence=f"{at_max} values ({at_max/len(values)*100:.1f}%) truncated at {max_len} chars",
|
| 594 |
+
confidence=0.80,
|
| 595 |
+
inferred_operation=f"FIELD_LENGTH_LIMIT_{max_len}",
|
| 596 |
+
details={
|
| 597 |
+
"max_length": max_len,
|
| 598 |
+
"truncated_count": at_max,
|
| 599 |
+
"truncated_ratio": at_max / len(values)
|
| 600 |
+
}
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
+
return None
|
| 604 |
+
|
| 605 |
+
def _detect_encoding_artifacts(self, values: List[str]) -> Optional[Artifact]:
|
| 606 |
+
"""Detect encoding/charset conversion issues."""
|
| 607 |
+
# Common mojibake patterns
|
| 608 |
+
mojibake_patterns = [
|
| 609 |
+
r'é', # é misencoded
|
| 610 |
+
r'è', # è
|
| 611 |
+
r'Ã ', # à
|
| 612 |
+
r'’', # ' smart quote
|
| 613 |
+
r'â€"', # — em dash
|
| 614 |
+
r'ö', # ö
|
| 615 |
+
r'ü', # ü
|
| 616 |
+
r'', # BOM
|
| 617 |
+
r'\\x[0-9a-f]{2}', # Raw hex escapes
|
| 618 |
+
r'&|<|>', # HTML entities
|
| 619 |
+
]
|
| 620 |
+
|
| 621 |
+
issue_count = 0
|
| 622 |
+
patterns_found = set()
|
| 623 |
+
|
| 624 |
+
for v in values[:500]: # Sample
|
| 625 |
+
for pattern in mojibake_patterns:
|
| 626 |
+
if re.search(pattern, v):
|
| 627 |
+
issue_count += 1
|
| 628 |
+
patterns_found.add(pattern)
|
| 629 |
+
break
|
| 630 |
+
|
| 631 |
+
if issue_count > 5:
|
| 632 |
+
return Artifact(
|
| 633 |
+
artifact_type="encoding_artifact",
|
| 634 |
+
column="text",
|
| 635 |
+
evidence=f"{issue_count} values have encoding issues (patterns: {patterns_found})",
|
| 636 |
+
confidence=0.85,
|
| 637 |
+
inferred_operation="CHARSET_CONVERSION_ERROR",
|
| 638 |
+
details={
|
| 639 |
+
"issue_count": issue_count,
|
| 640 |
+
"patterns": list(patterns_found)
|
| 641 |
+
}
|
| 642 |
+
)
|
| 643 |
+
|
| 644 |
+
return None
|
| 645 |
+
|
| 646 |
+
def _detect_case_patterns(self, values: List[str], column: str) -> Optional[Artifact]:
|
| 647 |
+
"""Detect case normalization."""
|
| 648 |
+
# Skip obviously non-text columns
|
| 649 |
+
sample = values[:100]
|
| 650 |
+
|
| 651 |
+
all_lower = all(v == v.lower() for v in sample if v.strip())
|
| 652 |
+
all_upper = all(v == v.upper() for v in sample if v.strip())
|
| 653 |
+
|
| 654 |
+
if all_lower:
|
| 655 |
+
return Artifact(
|
| 656 |
+
artifact_type="case_normalization",
|
| 657 |
+
column=column,
|
| 658 |
+
evidence="All values are lowercase",
|
| 659 |
+
confidence=0.90,
|
| 660 |
+
inferred_operation="LOWERCASE_NORMALIZATION",
|
| 661 |
+
details={"case": "lower"}
|
| 662 |
+
)
|
| 663 |
+
elif all_upper:
|
| 664 |
+
return Artifact(
|
| 665 |
+
artifact_type="case_normalization",
|
| 666 |
+
column=column,
|
| 667 |
+
evidence="All values are UPPERCASE",
|
| 668 |
+
confidence=0.90,
|
| 669 |
+
inferred_operation="UPPERCASE_NORMALIZATION",
|
| 670 |
+
details={"case": "upper"}
|
| 671 |
+
)
|
| 672 |
+
|
| 673 |
+
return None
|
| 674 |
+
|
| 675 |
+
def _detect_whitespace_patterns(self, values: List[str]) -> Optional[Artifact]:
|
| 676 |
+
"""Detect whitespace handling patterns."""
|
| 677 |
+
# Check for leading/trailing whitespace
|
| 678 |
+
has_leading = sum(1 for v in values if v and v[0] == ' ')
|
| 679 |
+
has_trailing = sum(1 for v in values if v and v[-1] == ' ')
|
| 680 |
+
|
| 681 |
+
# No whitespace at all = trimmed
|
| 682 |
+
if has_leading == 0 and has_trailing == 0:
|
| 683 |
+
# Verify there's text that COULD have whitespace
|
| 684 |
+
has_spaces = sum(1 for v in values if ' ' in v.strip())
|
| 685 |
+
if has_spaces > len(values) * 0.3:
|
| 686 |
+
return Artifact(
|
| 687 |
+
artifact_type="whitespace_trimming",
|
| 688 |
+
column="text",
|
| 689 |
+
evidence="No leading/trailing whitespace (data was trimmed)",
|
| 690 |
+
confidence=0.70,
|
| 691 |
+
inferred_operation="WHITESPACE_TRIM",
|
| 692 |
+
details={"trimmed": True}
|
| 693 |
+
)
|
| 694 |
+
|
| 695 |
+
return None
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
class NumericArtifacts(ArtifactDetector):
|
| 699 |
+
"""
|
| 700 |
+
Detect numeric processing artifacts.
|
| 701 |
+
|
| 702 |
+
Artifacts detected:
|
| 703 |
+
- Rounding patterns (precision limits)
|
| 704 |
+
- Outlier presence/absence (filtering)
|
| 705 |
+
- Distribution anomalies (sampling)
|
| 706 |
+
- Sentinel values (nulls represented as -1, 0, 9999)
|
| 707 |
+
"""
|
| 708 |
+
|
| 709 |
+
name = "numeric"
|
| 710 |
+
|
| 711 |
+
def detect(self, df, column: str) -> List[Artifact]:
|
| 712 |
+
artifacts = []
|
| 713 |
+
|
| 714 |
+
# Check if numeric
|
| 715 |
+
try:
|
| 716 |
+
values = df[column].dropna()
|
| 717 |
+
if len(values) < 10:
|
| 718 |
+
return artifacts
|
| 719 |
+
|
| 720 |
+
# Try to get numeric values
|
| 721 |
+
numeric_values = values.astype(float).tolist()
|
| 722 |
+
|
| 723 |
+
# Rounding
|
| 724 |
+
rounding = self._detect_rounding(numeric_values, column)
|
| 725 |
+
if rounding:
|
| 726 |
+
artifacts.append(rounding)
|
| 727 |
+
|
| 728 |
+
# Sentinel values
|
| 729 |
+
sentinel = self._detect_sentinel_values(numeric_values, column)
|
| 730 |
+
if sentinel:
|
| 731 |
+
artifacts.append(sentinel)
|
| 732 |
+
|
| 733 |
+
# Distribution
|
| 734 |
+
dist = self._detect_distribution_artifacts(numeric_values, column)
|
| 735 |
+
if dist:
|
| 736 |
+
artifacts.append(dist)
|
| 737 |
+
|
| 738 |
+
except (ValueError, TypeError):
|
| 739 |
+
pass
|
| 740 |
+
|
| 741 |
+
return artifacts
|
| 742 |
+
|
| 743 |
+
def _detect_rounding(self, values: List[float], column: str) -> Optional[Artifact]:
|
| 744 |
+
"""Detect systematic rounding."""
|
| 745 |
+
# Check decimal places
|
| 746 |
+
decimal_places = []
|
| 747 |
+
for v in values[:500]:
|
| 748 |
+
if v != int(v):
|
| 749 |
+
str_v = f"{v:.10f}".rstrip('0')
|
| 750 |
+
if '.' in str_v:
|
| 751 |
+
decimal_places.append(len(str_v.split('.')[1]))
|
| 752 |
+
|
| 753 |
+
if not decimal_places:
|
| 754 |
+
# All integers - check for rounding to 10, 100, etc.
|
| 755 |
+
int_values = [int(v) for v in values]
|
| 756 |
+
|
| 757 |
+
divisible_by_100 = sum(1 for v in int_values if v % 100 == 0)
|
| 758 |
+
divisible_by_10 = sum(1 for v in int_values if v % 10 == 0)
|
| 759 |
+
|
| 760 |
+
if divisible_by_100 > len(int_values) * 0.9:
|
| 761 |
+
return Artifact(
|
| 762 |
+
artifact_type="numeric_rounding",
|
| 763 |
+
column=column,
|
| 764 |
+
evidence="Values rounded to nearest 100",
|
| 765 |
+
confidence=0.85,
|
| 766 |
+
inferred_operation="ROUND_TO_100",
|
| 767 |
+
details={"rounding": 100}
|
| 768 |
+
)
|
| 769 |
+
elif divisible_by_10 > len(int_values) * 0.9:
|
| 770 |
+
return Artifact(
|
| 771 |
+
artifact_type="numeric_rounding",
|
| 772 |
+
column=column,
|
| 773 |
+
evidence="Values rounded to nearest 10",
|
| 774 |
+
confidence=0.80,
|
| 775 |
+
inferred_operation="ROUND_TO_10",
|
| 776 |
+
details={"rounding": 10}
|
| 777 |
+
)
|
| 778 |
+
else:
|
| 779 |
+
# Check for consistent decimal places
|
| 780 |
+
max_decimals = max(decimal_places)
|
| 781 |
+
at_max = sum(1 for d in decimal_places if d == max_decimals)
|
| 782 |
+
|
| 783 |
+
if at_max < len(decimal_places) * 0.3 and max_decimals <= 2:
|
| 784 |
+
return Artifact(
|
| 785 |
+
artifact_type="numeric_rounding",
|
| 786 |
+
column=column,
|
| 787 |
+
evidence=f"Values appear rounded to {max_decimals} decimal places",
|
| 788 |
+
confidence=0.75,
|
| 789 |
+
inferred_operation=f"ROUND_TO_{max_decimals}_DECIMALS",
|
| 790 |
+
details={"decimal_places": max_decimals}
|
| 791 |
+
)
|
| 792 |
+
|
| 793 |
+
return None
|
| 794 |
+
|
| 795 |
+
def _detect_sentinel_values(self, values: List[float], column: str) -> Optional[Artifact]:
|
| 796 |
+
"""Detect sentinel values representing nulls."""
|
| 797 |
+
sentinels = [-1, -999, -9999, 0, 9999, 99999]
|
| 798 |
+
|
| 799 |
+
value_counts = Counter(values)
|
| 800 |
+
|
| 801 |
+
for sentinel in sentinels:
|
| 802 |
+
if sentinel in value_counts:
|
| 803 |
+
count = value_counts[sentinel]
|
| 804 |
+
if count > len(values) * 0.01: # More than 1%
|
| 805 |
+
return Artifact(
|
| 806 |
+
artifact_type="sentinel_value",
|
| 807 |
+
column=column,
|
| 808 |
+
evidence=f"{count} occurrences of {sentinel} (likely NULL sentinel)",
|
| 809 |
+
confidence=0.70,
|
| 810 |
+
inferred_operation=f"NULL_AS_{int(sentinel)}",
|
| 811 |
+
details={
|
| 812 |
+
"sentinel": sentinel,
|
| 813 |
+
"count": count,
|
| 814 |
+
"percentage": count / len(values) * 100
|
| 815 |
+
}
|
| 816 |
+
)
|
| 817 |
+
|
| 818 |
+
return None
|
| 819 |
+
|
| 820 |
+
def _detect_distribution_artifacts(self, values: List[float], column: str) -> Optional[Artifact]:
|
| 821 |
+
"""Detect distribution anomalies suggesting filtering/sampling."""
|
| 822 |
+
if len(values) < 100:
|
| 823 |
+
return None
|
| 824 |
+
|
| 825 |
+
# Check for hard cutoffs
|
| 826 |
+
sorted_vals = sorted(values)
|
| 827 |
+
min_val, max_val = sorted_vals[0], sorted_vals[-1]
|
| 828 |
+
|
| 829 |
+
# Round number cutoffs suggest filtering
|
| 830 |
+
if max_val == int(max_val) and max_val % 10 == 0:
|
| 831 |
+
# Check if there's a cluster at the max
|
| 832 |
+
at_max = sum(1 for v in values if v == max_val)
|
| 833 |
+
if at_max > len(values) * 0.05:
|
| 834 |
+
return Artifact(
|
| 835 |
+
artifact_type="hard_cutoff",
|
| 836 |
+
column=column,
|
| 837 |
+
evidence=f"Hard cutoff at {max_val} ({at_max} values at limit)",
|
| 838 |
+
confidence=0.75,
|
| 839 |
+
inferred_operation=f"CAP_AT_{int(max_val)}",
|
| 840 |
+
details={
|
| 841 |
+
"cutoff": max_val,
|
| 842 |
+
"count_at_cutoff": at_max
|
| 843 |
+
}
|
| 844 |
+
)
|
| 845 |
+
|
| 846 |
+
return None
|
| 847 |
+
|
| 848 |
+
|
| 849 |
+
class NullPatternArtifacts(ArtifactDetector):
|
| 850 |
+
"""
|
| 851 |
+
Detect null/missing value patterns.
|
| 852 |
+
|
| 853 |
+
Artifacts detected:
|
| 854 |
+
- Systematic nulls (default handling)
|
| 855 |
+
- Null correlations (conditional logic)
|
| 856 |
+
- Null rates anomalies (ETL errors)
|
| 857 |
+
"""
|
| 858 |
+
|
| 859 |
+
name = "null_patterns"
|
| 860 |
+
|
| 861 |
+
def detect_all(self, df) -> List[Artifact]:
|
| 862 |
+
"""Analyze null patterns across all columns."""
|
| 863 |
+
artifacts = []
|
| 864 |
+
|
| 865 |
+
# Overall null rates per column
|
| 866 |
+
null_rates = {}
|
| 867 |
+
for col in df.columns:
|
| 868 |
+
null_rate = df[col].isna().mean()
|
| 869 |
+
null_rates[col] = null_rate
|
| 870 |
+
|
| 871 |
+
# Detect anomalous null rates
|
| 872 |
+
rates = list(null_rates.values())
|
| 873 |
+
if len(rates) > 3:
|
| 874 |
+
mean_rate = statistics.mean(rates)
|
| 875 |
+
|
| 876 |
+
for col, rate in null_rates.items():
|
| 877 |
+
if rate > 0.5 and rate > mean_rate * 3:
|
| 878 |
+
artifacts.append(Artifact(
|
| 879 |
+
artifact_type="high_null_rate",
|
| 880 |
+
column=col,
|
| 881 |
+
evidence=f"{rate*100:.1f}% null (vs {mean_rate*100:.1f}% average)",
|
| 882 |
+
confidence=0.70,
|
| 883 |
+
inferred_operation="OPTIONAL_FIELD_OR_ETL_ERROR",
|
| 884 |
+
details={
|
| 885 |
+
"null_rate": rate,
|
| 886 |
+
"avg_null_rate": mean_rate
|
| 887 |
+
}
|
| 888 |
+
))
|
| 889 |
+
|
| 890 |
+
# Detect columns that are null together (conditional logic)
|
| 891 |
+
# This is expensive so we sample
|
| 892 |
+
if len(df) > 100:
|
| 893 |
+
sample = df.sample(min(1000, len(df)))
|
| 894 |
+
else:
|
| 895 |
+
sample = df
|
| 896 |
+
|
| 897 |
+
correlated_nulls = []
|
| 898 |
+
cols = list(df.columns)
|
| 899 |
+
for i, col1 in enumerate(cols):
|
| 900 |
+
for col2 in cols[i+1:]:
|
| 901 |
+
both_null = (sample[col1].isna() & sample[col2].isna()).mean()
|
| 902 |
+
either_null = (sample[col1].isna() | sample[col2].isna()).mean()
|
| 903 |
+
|
| 904 |
+
if either_null > 0.1 and both_null / either_null > 0.8:
|
| 905 |
+
correlated_nulls.append((col1, col2, both_null))
|
| 906 |
+
|
| 907 |
+
if correlated_nulls:
|
| 908 |
+
artifacts.append(Artifact(
|
| 909 |
+
artifact_type="correlated_nulls",
|
| 910 |
+
column="multiple",
|
| 911 |
+
evidence=f"{len(correlated_nulls)} column pairs have correlated nulls",
|
| 912 |
+
confidence=0.75,
|
| 913 |
+
inferred_operation="CONDITIONAL_FIELD_POPULATION",
|
| 914 |
+
details={
|
| 915 |
+
"pairs": [(c1, c2) for c1, c2, _ in correlated_nulls[:5]]
|
| 916 |
+
}
|
| 917 |
+
))
|
| 918 |
+
|
| 919 |
+
return artifacts
|
| 920 |
+
|
| 921 |
+
def detect(self, df, column: str) -> List[Artifact]:
|
| 922 |
+
"""Null patterns are analyzed globally, not per-column."""
|
| 923 |
+
return []
|
| 924 |
+
|
| 925 |
+
|
| 926 |
+
class SchemaArtifacts(ArtifactDetector):
|
| 927 |
+
"""
|
| 928 |
+
Detect schema-level artifacts.
|
| 929 |
+
|
| 930 |
+
Artifacts detected:
|
| 931 |
+
- Column naming conventions (framework hints)
|
| 932 |
+
- Data type patterns (database origin)
|
| 933 |
+
- Schema inconsistencies (merged sources)
|
| 934 |
+
"""
|
| 935 |
+
|
| 936 |
+
name = "schema"
|
| 937 |
+
|
| 938 |
+
def detect_all(self, df) -> List[Artifact]:
|
| 939 |
+
"""Analyze schema patterns."""
|
| 940 |
+
artifacts = []
|
| 941 |
+
|
| 942 |
+
columns = list(df.columns)
|
| 943 |
+
|
| 944 |
+
# Naming convention detection
|
| 945 |
+
conventions = self._detect_naming_conventions(columns)
|
| 946 |
+
if conventions:
|
| 947 |
+
artifacts.append(conventions)
|
| 948 |
+
|
| 949 |
+
# Framework fingerprints
|
| 950 |
+
framework = self._detect_framework_fingerprints(columns)
|
| 951 |
+
if framework:
|
| 952 |
+
artifacts.append(framework)
|
| 953 |
+
|
| 954 |
+
# Mixed conventions (merged sources)
|
| 955 |
+
mixed = self._detect_mixed_conventions(columns)
|
| 956 |
+
if mixed:
|
| 957 |
+
artifacts.append(mixed)
|
| 958 |
+
|
| 959 |
+
return artifacts
|
| 960 |
+
|
| 961 |
+
def detect(self, df, column: str) -> List[Artifact]:
|
| 962 |
+
"""Schema patterns are analyzed globally."""
|
| 963 |
+
return []
|
| 964 |
+
|
| 965 |
+
def _detect_naming_conventions(self, columns: List[str]) -> Optional[Artifact]:
|
| 966 |
+
"""Detect column naming convention."""
|
| 967 |
+
snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
|
| 968 |
+
camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
|
| 969 |
+
pascal_case = sum(1 for c in columns if re.match(r'^([A-Z][a-z]+)+$', c))
|
| 970 |
+
|
| 971 |
+
total = len(columns)
|
| 972 |
+
|
| 973 |
+
if snake_case > total * 0.7:
|
| 974 |
+
return Artifact(
|
| 975 |
+
artifact_type="naming_convention",
|
| 976 |
+
column="schema",
|
| 977 |
+
evidence=f"snake_case naming ({snake_case}/{total} columns)",
|
| 978 |
+
confidence=0.80,
|
| 979 |
+
inferred_operation="PYTHON_OR_SQL_ORIGIN",
|
| 980 |
+
details={"convention": "snake_case", "ratio": snake_case/total}
|
| 981 |
+
)
|
| 982 |
+
elif camel_case > total * 0.5:
|
| 983 |
+
return Artifact(
|
| 984 |
+
artifact_type="naming_convention",
|
| 985 |
+
column="schema",
|
| 986 |
+
evidence=f"camelCase naming ({camel_case}/{total} columns)",
|
| 987 |
+
confidence=0.80,
|
| 988 |
+
inferred_operation="JAVASCRIPT_OR_JAVA_ORIGIN",
|
| 989 |
+
details={"convention": "camelCase", "ratio": camel_case/total}
|
| 990 |
+
)
|
| 991 |
+
elif pascal_case > total * 0.5:
|
| 992 |
+
return Artifact(
|
| 993 |
+
artifact_type="naming_convention",
|
| 994 |
+
column="schema",
|
| 995 |
+
evidence=f"PascalCase naming ({pascal_case}/{total} columns)",
|
| 996 |
+
confidence=0.80,
|
| 997 |
+
inferred_operation="DOTNET_OR_JAVA_ORIGIN",
|
| 998 |
+
details={"convention": "PascalCase", "ratio": pascal_case/total}
|
| 999 |
+
)
|
| 1000 |
+
|
| 1001 |
+
return None
|
| 1002 |
+
|
| 1003 |
+
def _detect_framework_fingerprints(self, columns: List[str]) -> Optional[Artifact]:
|
| 1004 |
+
"""Detect framework-specific column patterns."""
|
| 1005 |
+
col_lower = [c.lower() for c in columns]
|
| 1006 |
+
|
| 1007 |
+
# Django fingerprints
|
| 1008 |
+
if 'id' in col_lower and 'created_at' in col_lower:
|
| 1009 |
+
return Artifact(
|
| 1010 |
+
artifact_type="framework_fingerprint",
|
| 1011 |
+
column="schema",
|
| 1012 |
+
evidence="Django/Rails-style auto columns (id, created_at)",
|
| 1013 |
+
confidence=0.65,
|
| 1014 |
+
inferred_operation="ORM_GENERATED_SCHEMA",
|
| 1015 |
+
details={"framework_hints": ["django", "rails", "sqlalchemy"]}
|
| 1016 |
+
)
|
| 1017 |
+
|
| 1018 |
+
# Pandas export fingerprints
|
| 1019 |
+
if 'unnamed: 0' in col_lower or any('unnamed:' in c for c in col_lower):
|
| 1020 |
+
return Artifact(
|
| 1021 |
+
artifact_type="framework_fingerprint",
|
| 1022 |
+
column="schema",
|
| 1023 |
+
evidence="Pandas index column artifact (Unnamed: 0)",
|
| 1024 |
+
confidence=0.90,
|
| 1025 |
+
inferred_operation="PANDAS_CSV_EXPORT",
|
| 1026 |
+
details={"framework": "pandas"}
|
| 1027 |
+
)
|
| 1028 |
+
|
| 1029 |
+
# MongoDB fingerprints
|
| 1030 |
+
if '_id' in col_lower:
|
| 1031 |
+
return Artifact(
|
| 1032 |
+
artifact_type="framework_fingerprint",
|
| 1033 |
+
column="schema",
|
| 1034 |
+
evidence="MongoDB _id column present",
|
| 1035 |
+
confidence=0.85,
|
| 1036 |
+
inferred_operation="MONGODB_EXPORT",
|
| 1037 |
+
details={"framework": "mongodb"}
|
| 1038 |
+
)
|
| 1039 |
+
|
| 1040 |
+
return None
|
| 1041 |
+
|
| 1042 |
+
def _detect_mixed_conventions(self, columns: List[str]) -> Optional[Artifact]:
|
| 1043 |
+
"""Detect mixed naming conventions suggesting merged sources."""
|
| 1044 |
+
snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
|
| 1045 |
+
camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
|
| 1046 |
+
|
| 1047 |
+
total = len(columns)
|
| 1048 |
+
|
| 1049 |
+
# Both conventions present significantly
|
| 1050 |
+
if snake_case > total * 0.2 and camel_case > total * 0.2:
|
| 1051 |
+
return Artifact(
|
| 1052 |
+
artifact_type="mixed_conventions",
|
| 1053 |
+
column="schema",
|
| 1054 |
+
evidence=f"Mixed naming: {snake_case} snake_case, {camel_case} camelCase",
|
| 1055 |
+
confidence=0.75,
|
| 1056 |
+
inferred_operation="MERGED_SOURCES",
|
| 1057 |
+
details={
|
| 1058 |
+
"snake_case_count": snake_case,
|
| 1059 |
+
"camel_case_count": camel_case
|
| 1060 |
+
}
|
| 1061 |
+
)
|
| 1062 |
+
|
| 1063 |
+
return None
|
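All of these detectors share two entry points: `detect(df, column)` for per-column checks and `detect_all(df)` for whole-frame checks, so running a scan is a small loop. A minimal usage sketch, assuming the detectors construct with no arguments and that the import path `cascade.forensics.artifacts` matches the file above; the DataFrame contents are made up, and which artifacts actually fire depends on the sample-size thresholds in the code:

# Hypothetical usage sketch (not part of the commit above).
import pandas as pd

from cascade.forensics.artifacts import (
    TextArtifacts,
    NumericArtifacts,
    NullPatternArtifacts,
    SchemaArtifacts,
)

# Toy data: lowercase text, a systematic id prefix, values divisible by 10.
n = 50
df = pd.DataFrame({
    "user_id": [f"usr_{i:04d}" for i in range(n)],
    "bio": [f"user number {i}" for i in range(n)],
    "score": [float(i * 10) for i in range(n)],
})

found = []
for det in (TextArtifacts(), NumericArtifacts()):
    for col in df.columns:
        found.extend(det.detect(df, col))
for det in (NullPatternArtifacts(), SchemaArtifacts()):
    found.extend(det.detect_all(df))

for a in found:
    print(f"{a.confidence:.2f}  {a.artifact_type:<22} {a.column:<10} {a.inferred_operation}")

On this toy frame the text detectors should report lowercase normalization and whitespace trimming, and the numeric detector should flag rounding to 10, since every score is a multiple of ten.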
cascade/forensics/fingerprints.py
ADDED
@@ -0,0 +1,328 @@
"""
CASCADE Forensics - Technology Fingerprinting

Map detected artifacts to likely technologies and tools.
The artifacts are evidence. This module is the detective.
"""

from dataclasses import dataclass, field
from typing import List, Dict, Any, TYPE_CHECKING
from collections import defaultdict

if TYPE_CHECKING:
    from cascade.forensics.artifacts import Artifact


@dataclass
class Fingerprint:
    """A technology fingerprint - evidence pointing to specific tools."""
    technology: str
    category: str  # database, framework, language, tool
    confidence: float
    evidence: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "technology": self.technology,
            "category": self.category,
            "confidence": self.confidence,
            "evidence": self.evidence,
        }


class TechFingerprinter:
    """
    Map artifact patterns to likely technologies.

    This is pattern matching - certain artifact combinations
    are strong indicators of specific tools.
    """

    # Artifact patterns -> technology mappings
    PATTERNS = {
        # Databases
        "MONGODB_EXPORT": {
            "technology": "MongoDB",
            "category": "database",
            "weight": 0.9,
        },
        "ORM_GENERATED_SCHEMA": {
            "technology": "ORM (Django/Rails/SQLAlchemy)",
            "category": "framework",
            "weight": 0.7,
        },
        "PANDAS_CSV_EXPORT": {
            "technology": "Pandas",
            "category": "tool",
            "weight": 0.95,
        },

        # Processing tools
        "LOWERCASE_NORMALIZATION": {
            "technology": "Text Preprocessing",
            "category": "processing",
            "weight": 0.6,
        },
        "WHITESPACE_TRIM": {
            "technology": "String Cleaning",
            "category": "processing",
            "weight": 0.5,
        },

        # Batch processing
        "BATCH_HOURLY": {
            "technology": "Scheduled Batch Job (hourly)",
            "category": "infrastructure",
            "weight": 0.8,
        },
        "BATCH_15MIN": {
            "technology": "Scheduled Batch Job (15min)",
            "category": "infrastructure",
            "weight": 0.8,
        },
        "BATCH_BURST_PROCESSING": {
            "technology": "Event-Driven Batch Processing",
            "category": "infrastructure",
            "weight": 0.7,
        },
        "SCHEDULED_JOB": {
            "technology": "Cron/Scheduler",
            "category": "infrastructure",
            "weight": 0.75,
        },

        # ID generation
        "UUID_GENERATION_V4": {
            "technology": "Cryptographic UUID Generator",
            "category": "tool",
            "weight": 0.8,
        },
        "UUID_GENERATION_V1": {
            "technology": "Time-based UUID (leaks timestamp + MAC)",
            "category": "tool",
            "weight": 0.85,
        },
        "DETERMINISTIC_ID_GENERATION_SHA256": {
            "technology": "Content-Addressed Storage",
            "category": "architecture",
            "weight": 0.8,
        },
        "DETERMINISTIC_ID_GENERATION_MD5": {
            "technology": "MD5 Hash IDs (legacy system)",
            "category": "architecture",
            "weight": 0.8,
        },

        # Data quality
        "FILTERING_OR_DELETION": {
            "technology": "Record Filtering/Deletion Pipeline",
            "category": "processing",
            "weight": 0.7,
        },
        "CHARSET_CONVERSION_ERROR": {
            "technology": "Encoding Mismatch (Latin-1 vs UTF-8)",
            "category": "bug",
            "weight": 0.85,
        },

        # Languages/frameworks
        "PYTHON_OR_SQL_ORIGIN": {
            "technology": "Python or SQL",
            "category": "language",
            "weight": 0.6,
        },
        "JAVASCRIPT_OR_JAVA_ORIGIN": {
            "technology": "JavaScript or Java",
            "category": "language",
            "weight": 0.6,
        },

        # Source merging
        "MERGED_SOURCES": {
            "technology": "Multi-Source Data Integration",
            "category": "architecture",
            "weight": 0.8,
        },
        "MULTI_SOURCE_MERGE": {
            "technology": "Multi-Source Data Integration",
            "category": "architecture",
            "weight": 0.85,
        },
    }

    # Compound patterns - combinations that strengthen identification
    COMPOUND_PATTERNS = [
        {
            "requires": ["PANDAS_CSV_EXPORT", "PYTHON_OR_SQL_ORIGIN"],
            "suggests": Fingerprint("Pandas Data Pipeline", "tool", 0.95),
        },
        {
            "requires": ["MONGODB_EXPORT", "JAVASCRIPT_OR_JAVA_ORIGIN"],
            "suggests": Fingerprint("Node.js + MongoDB Stack", "stack", 0.85),
        },
        {
            "requires": ["ORM_GENERATED_SCHEMA", "BATCH_HOURLY"],
            "suggests": Fingerprint("Django/Rails Batch Worker", "stack", 0.80),
        },
        {
            "requires": ["CHARSET_CONVERSION_ERROR", "MERGED_SOURCES"],
            "suggests": Fingerprint("Legacy System Migration", "context", 0.85),
        },
        {
            "requires": ["UUID_GENERATION_V1", "BATCH_BURST_PROCESSING"],
            "suggests": Fingerprint("Distributed System (pre-2015 design)", "architecture", 0.75),
        },
    ]

    def __init__(self):
        self.fingerprints: List[Fingerprint] = []

    def analyze(self, artifacts: List['Artifact']) -> List[Fingerprint]:
        """
        Analyze artifacts and return technology fingerprints.

        Args:
            artifacts: List of detected artifacts

        Returns:
            List of technology fingerprints sorted by confidence
        """
        self.fingerprints = []

        # Get all inferred operations
        operations = set(a.inferred_operation for a in artifacts)

        # Match against patterns
        tech_evidence = defaultdict(list)
        tech_confidence = defaultdict(float)
        tech_category = {}

        for op in operations:
            # Direct pattern match
            if op in self.PATTERNS:
                pattern = self.PATTERNS[op]
                tech = pattern["technology"]
                tech_evidence[tech].append(op)
                tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"])
                tech_category[tech] = pattern["category"]

            # Partial match (for patterns with suffixes like SCHEDULED_JOB_24HR)
            for pattern_name, pattern in self.PATTERNS.items():
                if op.startswith(pattern_name.split('_')[0] + '_'):
                    tech = pattern["technology"]
                    if tech not in tech_evidence or op not in tech_evidence[tech]:
                        tech_evidence[tech].append(op)
                        tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"] * 0.9)
                        tech_category[tech] = pattern["category"]

        # Check compound patterns
        for compound in self.COMPOUND_PATTERNS:
            required = set(compound["requires"])
            if required.issubset(operations):
                fp = compound["suggests"]
                tech_evidence[fp.technology].extend(list(required))
                tech_confidence[fp.technology] = max(tech_confidence.get(fp.technology, 0), fp.confidence)
                tech_category[fp.technology] = fp.category

        # Build fingerprint objects
        for tech, evidence in tech_evidence.items():
            self.fingerprints.append(Fingerprint(
                technology=tech,
                category=tech_category.get(tech, "unknown"),
                confidence=tech_confidence[tech],
                evidence=list(set(evidence)),
            ))

        # Sort by confidence
        self.fingerprints.sort(key=lambda f: f.confidence, reverse=True)

        return self.fingerprints

    def get_likely_stack(self) -> Dict[str, Any]:
        """
        Synthesize fingerprints into a likely technology stack.

        Returns:
            Dict describing the probable system architecture
        """
        if not self.fingerprints:
            return {"stack": "Unknown", "components": []}

        # Group by category
        by_category = defaultdict(list)
        for fp in self.fingerprints:
            by_category[fp.category].append(fp)

        stack = {
            "database": None,
            "framework": None,
            "language": None,
            "processing": [],
            "infrastructure": [],
            "architecture_notes": [],
        }

        # Pick highest confidence for single-value categories
        for cat in ["database", "framework", "language"]:
            if cat in by_category:
                stack[cat] = by_category[cat][0].technology

        # Aggregate list categories
        for cat in ["processing", "infrastructure"]:
            if cat in by_category:
                stack[cat] = [fp.technology for fp in by_category[cat]]

        # Architecture notes from high-confidence findings
        if "architecture" in by_category:
            stack["architecture_notes"] = [fp.technology for fp in by_category["architecture"]]

        # Bugs/issues
        if "bug" in by_category:
            stack["issues"] = [fp.technology for fp in by_category["bug"]]

        return stack

    def get_security_concerns(self) -> List[Dict[str, Any]]:
        """
        Identify security-relevant findings.

        Returns:
            List of security concerns derived from fingerprints
        """
        concerns = []

        for fp in self.fingerprints:
            # UUID v1 leaks info (matches the UUID_GENERATION_V1 mapping above)
            if "Time-based UUID" in fp.technology:
                concerns.append({
                    "severity": "medium",
                    "issue": "UUID v1 leaks timestamp and MAC address",
                    "evidence": fp.evidence,
                    "recommendation": "Use UUID v4 for privacy",
                })

            # MD5 for IDs
            if "MD5" in fp.technology:
                concerns.append({
                    "severity": "low",
                    "issue": "MD5 used for ID generation (collision risk)",
                    "evidence": fp.evidence,
                    "recommendation": "Consider SHA-256 for content addressing",
                })

            # Encoding errors = data loss
            if "Encoding" in fp.technology or "charset" in fp.technology.lower():
                concerns.append({
                    "severity": "medium",
                    "issue": "Character encoding errors indicate data corruption",
                    "evidence": fp.evidence,
                    "recommendation": "Audit data pipeline for charset handling",
                })

            # Legacy patterns
            if "legacy" in fp.technology.lower() or "pre-2015" in fp.technology.lower():
                concerns.append({
                    "severity": "info",
                    "issue": "Legacy system patterns detected",
                    "evidence": fp.evidence,
                    "recommendation": "Review for technical debt",
                })

        return concerns
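Because `analyze()` reads only the `inferred_operation` attribute of each artifact, the fingerprinter is easy to exercise in isolation. A minimal sketch; the `SimpleNamespace` objects are hypothetical stand-ins for real `Artifact` instances, and the operation names are taken from the PATTERNS table above:

# Hypothetical usage sketch for TechFingerprinter.
from types import SimpleNamespace

from cascade.forensics.fingerprints import TechFingerprinter

ops = ["PANDAS_CSV_EXPORT", "PYTHON_OR_SQL_ORIGIN", "UUID_GENERATION_V1"]
artifacts = [SimpleNamespace(inferred_operation=op) for op in ops]

tf = TechFingerprinter()
for fp in tf.analyze(artifacts):
    print(f"{fp.confidence:.2f}  {fp.category:>12}  {fp.technology}")

print(tf.get_likely_stack())        # language entry plus tool fingerprints
print(tf.get_security_concerns())   # UUID v1 -> timestamp/MAC leakage warning

With these three operations the compound "Pandas Data Pipeline" fingerprint should surface at the top, since both of its required operations are present.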
cascade/genesis.py
ADDED
@@ -0,0 +1,200 @@
"""
CASCADE Genesis - The origin node of the neural internetwork.

Every chain begins here. Systems link to genesis (or to any
descendant of genesis) to join the lattice.

The chain IS the registry. No separate discovery needed.

Usage:
    # Create genesis (done once, published to well-known location)
    genesis = create_genesis()

    # Any system joins by linking to genesis
    my_chain.link_external(genesis.merkle_root)

    # Or by linking to any existing node in the lattice
    my_chain.link_external(some_other_chain.merkle_root)

    # The lattice grows. Discovery = reading the chain.
"""

import hashlib
import json
import time
from pathlib import Path
from typing import Optional, Dict, Any

from cascade.core.provenance import ProvenanceChain, ProvenanceRecord


# Well-known genesis identifiers
GENESIS_SESSION_ID = "genesis_0"
GENESIS_MODEL_ID = "cascade_genesis"
GENESIS_INPUT = "In the beginning was the hash, and the hash was with the chain, and the hash was the chain."


def create_genesis() -> ProvenanceChain:
    """
    Create the genesis chain - origin of the neural internetwork.

    This is deterministic. Anyone running this gets the same genesis.
    That's the point - it's the Schelling point for the lattice.
    """
    # Deterministic input hash
    input_hash = hashlib.sha256(GENESIS_INPUT.encode()).hexdigest()[:16]

    # Deterministic model hash (hash of the genesis concept itself)
    model_hash = hashlib.sha256(b"cascade_neural_internetwork_v1").hexdigest()[:16]

    chain = ProvenanceChain(
        session_id=GENESIS_SESSION_ID,
        model_id=GENESIS_MODEL_ID,
        model_hash=model_hash,
        input_hash=input_hash,
    )

    # The genesis record - the first node
    # Its parent is itself (bootstrap)
    genesis_record = ProvenanceRecord(
        layer_name="genesis",
        layer_idx=0,
        state_hash=input_hash,      # Self-referential
        parent_hashes=[input_hash], # Points to itself
        params_hash=model_hash,
        shape=[1],
        dtype="genesis",
        stats={"created": time.time()},
        execution_order=0,
    )

    chain.add_record(genesis_record)
    chain.finalize()

    return chain


def get_genesis_root() -> str:
    """
    Get the genesis merkle root.

    This is a constant - the Schelling point.
    Any system can compute it and know they're linking to the same origin.
    """
    return create_genesis().merkle_root


def save_genesis(path: Path) -> str:
    """
    Save genesis chain to file.

    This file can be published to a well-known location
    (HuggingFace dataset, IPFS, etc.)
    """
    genesis = create_genesis()

    with open(path, 'w') as f:
        json.dump(genesis.to_dict(), f, indent=2)

    return genesis.merkle_root


def load_genesis(path: Path) -> ProvenanceChain:
    """Load genesis from file and verify it's authentic."""
    with open(path, 'r') as f:
        data = json.load(f)

    chain = ProvenanceChain.from_dict(data)

    # Verify this is actually genesis
    expected_root = get_genesis_root()
    if chain.merkle_root != expected_root:
        raise ValueError(
            f"Invalid genesis: root {chain.merkle_root} != expected {expected_root}"
        )

    return chain


def link_to_genesis(chain: ProvenanceChain) -> None:
    """
    Link a chain to genesis, joining the neural internetwork.

    This is the simplest way to join - link directly to the origin.
    Alternatively, link to any other chain that traces back to genesis.
    """
    chain.link_external(get_genesis_root(), source_id="genesis")


def verify_lineage_to_genesis(chain: ProvenanceChain, known_chains: Dict[str, ProvenanceChain]) -> bool:
    """
    Verify that a chain traces back to genesis through external_roots.

    Args:
        chain: The chain to verify
        known_chains: Dict mapping merkle_root -> chain for lookup

    Returns:
        True if chain traces to genesis, False otherwise
    """
    genesis_root = get_genesis_root()
    visited = set()

    def trace(root: str) -> bool:
        if root in visited:
            return False
        visited.add(root)

        # Found genesis!
        if root == genesis_root:
            return True

        # Look up this chain
        if root not in known_chains:
            return False  # Can't verify - chain not known

        c = known_chains[root]

        # Check if any external root leads to genesis
        for ext_root in c.external_roots:
            if trace(ext_root):
                return True

        return False

    # Start from the chain's own root
    return trace(chain.merkle_root) or any(trace(r) for r in chain.external_roots)


# =============================================================================
# CLI for genesis operations
# =============================================================================

if __name__ == "__main__":
    import sys

    genesis = create_genesis()

    print("=" * 60)
    print("CASCADE GENESIS")
    print("=" * 60)
    print(f"Merkle Root: {genesis.merkle_root}")
    print(f"Session ID:  {genesis.session_id}")
    print(f"Model ID:    {genesis.model_id}")
    print(f"Input Hash:  {genesis.input_hash}")
    print("=" * 60)
    print()
    print("This is the origin of the neural internetwork.")
    print("Any system can link to this root to join the lattice.")
    print()
    print("To join:")
    print("  from cascade.genesis import get_genesis_root")
    print("  my_chain.link_external(get_genesis_root())")
    print()

    # Save if requested
    if len(sys.argv) > 1 and sys.argv[1] == "--save":
        out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("genesis.json")
        root = save_genesis(out_path)
        print(f"Genesis saved to: {out_path}")
        print(f"Root: {root}")
cascade/hold/__init__.py
ADDED
@@ -0,0 +1,82 @@
"""
╔═══════════════════════════════════════════════════════════════════════════════╗
║                                                                               ║
║   ██╗  ██╗ ██████╗ ██╗     ██████╗                                            ║
║   ██║  ██║██╔═══██╗██║     ██╔══██╗                                           ║
║   ███████║██║   ██║██║     ██║  ██║                                           ║
║   ██╔══██║██║   ██║██║     ██║  ██║                                           ║
║   ██║  ██║╚██████╔╝███████╗██████╔╝                                           ║
║   ╚═╝  ╚═╝ ╚═════╝ ╚══════╝╚═════╝                                            ║
║                                                                               ║
║              Inference-Level Halt Protocol for CASCADE-LATTICE                ║
║                                                                               ║
║      "Pause the machine. See what it sees. Choose what it chooses."           ║
║                                                                               ║
╚═══════════════════════════════════════════════════════════════════════════════╝

HOLD is MODEL-AGNOSTIC. Works with ANY framework:
- PyTorch, JAX, TensorFlow, scikit-learn
- Hugging Face, OpenAI API, Anthropic API
- Stable Baselines3, RLlib, custom RL
- Any function that outputs probabilities

USAGE:
    >>> from cascade.hold import Hold
    >>>
    >>> # Your model (any framework)
    >>> probs = your_model.predict(obs)
    >>>
    >>> # HOLD at decision point
    >>> hold = Hold.get()
    >>> resolution = hold.yield_point(
    ...     action_probs=probs,
    ...     value=value_estimate,
    ...     observation=obs,
    ...     brain_id="my_model",
    ...     # Optional informational wealth:
    ...     action_labels=["up", "down", "left", "right"],
    ...     latent=model.get_latent(),
    ...     attention=model.get_attention(),
    ...     features=model.get_features(),
    ...     imagination=model.imagine_futures(),
    ... )
    >>>
    >>> # Use resolved action
    >>> action = resolution.action
    >>> was_override = resolution.was_override

CLI:
    $ cascade hold         # Start HOLD interface
    $ cascade hold-status  # Show HOLD system status
"""

# Primitives - the core API
from cascade.hold.primitives import (
    HoldState,
    HoldPoint,
    HoldResolution,
    Hold,
    HoldAwareMixin,
)

# Session Layer - arcade-style history and time travel
from cascade.hold.session import (
    InferenceStep,
    HoldSession,
    ArcadeFeedback,
    CausationHold,
)

__all__ = [
    # Primitives
    "HoldState",
    "HoldPoint",
    "HoldResolution",
    "Hold",
    "HoldAwareMixin",
    # Session
    "InferenceStep",
    "HoldSession",
    "ArcadeFeedback",
    "CausationHold",
]
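Pulling the exported names together, the docstring's usage pattern becomes a per-step loop: compute a distribution, yield at the decision point, then act on the resolution. A minimal sketch with a toy policy; the probabilities, value, labels, and brain_id are made up, while `Hold.get()` and the `yield_point(...)` keyword arguments come straight from the docstring above:

# Hypothetical per-step loop built from the docstring's API.
import numpy as np

from cascade.hold import Hold

hold = Hold.get()

for step in range(3):
    obs = {"step": step}                     # toy observation
    probs = np.array([0.1, 0.6, 0.2, 0.1])   # toy policy output

    resolution = hold.yield_point(
        action_probs=probs,
        value=0.42,                          # toy value estimate
        observation=obs,
        brain_id="demo_brain",
        action_labels=["up", "down", "left", "right"],
    )

    # With no human intervention this resolves to the AI's argmax choice.
    print(step, resolution.action, resolution.was_override)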
cascade/hold/primitives.py
ADDED
@@ -0,0 +1,673 @@
| 1 |
+
"""
|
| 2 |
+
HOLD Primitives - Core Data Structures and Singleton
|
| 3 |
+
═══════════════════════════════════════════════════════════
|
| 4 |
+
|
| 5 |
+
The primitive layer of HOLD:
|
| 6 |
+
- HoldPoint: A frozen moment in inference
|
| 7 |
+
- HoldResolution: The outcome of a hold
|
| 8 |
+
- Hold: Singleton system managing inference-level halts
|
| 9 |
+
|
| 10 |
+
HOLD is a CASCADE-LATTICE primitive.
|
| 11 |
+
No cascade = No HOLD.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import time
|
| 15 |
+
import hashlib
|
| 16 |
+
import threading
|
| 17 |
+
from typing import Dict, Any, Optional, Callable, List
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
from enum import Enum
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
# CASCADE-LATTICE is REQUIRED
|
| 23 |
+
try:
|
| 24 |
+
from cascade import sdk_observe
|
| 25 |
+
from cascade.core.event import CausationLink
|
| 26 |
+
from cascade.core.graph import CausationGraph
|
| 27 |
+
HAS_CASCADE = True
|
| 28 |
+
except ImportError:
|
| 29 |
+
HAS_CASCADE = False
|
| 30 |
+
# Stubs for when imported standalone (testing)
|
| 31 |
+
def sdk_observe(*args, **kwargs): pass
|
| 32 |
+
class CausationLink:
|
| 33 |
+
def __init__(self, **kwargs): pass
|
| 34 |
+
class CausationGraph:
|
| 35 |
+
def add_link(self, link): pass
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class HoldState(Enum):
|
| 39 |
+
"""State of a hold point."""
|
| 40 |
+
PENDING = "pending" # Waiting for resolution
|
| 41 |
+
ACCEPTED = "accepted" # AI choice was accepted
|
| 42 |
+
OVERRIDDEN = "overridden" # Human override
|
| 43 |
+
TIMEOUT = "timeout" # Timed out, fell back to AI
|
| 44 |
+
CANCELLED = "cancelled" # Hold was cancelled
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _sanitize(data: Any) -> Any:
|
| 48 |
+
"""Recursively convert numpy types to python types."""
|
| 49 |
+
if isinstance(data, dict):
|
| 50 |
+
return {k: _sanitize(v) for k, v in data.items()}
|
| 51 |
+
elif isinstance(data, (list, tuple)):
|
| 52 |
+
return [_sanitize(x) for x in data]
|
| 53 |
+
elif isinstance(data, np.generic):
|
| 54 |
+
return data.item()
|
| 55 |
+
return data
|
| 56 |
+
|
| 57 |
+
|
@dataclass
class HoldPoint:
    """
    A decision point where inference yields for potential human intervention.

    This is the "freeze frame" - the moment before commitment.
    The decision matrix is exposed, the merkle chain awaits.

    INFORMATIONAL WEALTH - everything a human needs to understand the decision:
    - action_labels: What each action means ("FORWARD", "ATTACK", etc.)
    - latent: The model's internal representation (for inspection)
    - attention: What the model is attending to
    - features: Extracted feature activations
    - imagination: Per-action trajectory predictions and expected values
    - logits: Raw logits before softmax (for temperature analysis)
    - reasoning: Text explanations if available
    """
    # Decision matrix
    action_probs: np.ndarray  # The probability distribution
    value: float              # Predicted value

    # Context
    observation: Dict[str, Any]  # What the brain saw
    brain_id: str                # Which brain is holding

    # === INFORMATIONAL WEALTH ===

    # Action labels - CRITICAL for human understanding
    action_labels: Optional[List[str]] = None  # ["NOOP", "FORWARD", "BACK", ...]

    # Internal state
    latent: Optional[np.ndarray] = None           # Latent activations (any shape)
    attention: Optional[Dict[str, float]] = None  # {"position": 0.7, "health": 0.3, ...}
    features: Optional[Dict[str, float]] = None   # {"spatial_attn": 0.8, "danger": 0.2, ...}

    # Per-action deep data
    imagination: Optional[Dict[int, Dict]] = None  # {0: {"trajectory": [...], "expected_value": 0.5}, ...}

    # Logits (pre-softmax)
    logits: Optional[np.ndarray] = None  # Raw logits for each action

    # Reasoning chain (if model provides explanations)
    reasoning: Optional[List[str]] = None  # ["High reward expected", "Low risk path", ...]

    # World model predictions (if available)
    world_prediction: Optional[Dict[str, Any]] = None  # {"pos_delta": [1,0,0], "health_delta": -2, ...}

    # === END WEALTH ===

    # Identity
    id: str = field(default_factory=lambda: hashlib.sha256(str(time.time()).encode()).hexdigest()[:16])
    timestamp: float = field(default_factory=time.time)

    # Merkle linkage
    parent_merkle: Optional[str] = None  # Previous hold point
    merkle_root: Optional[str] = None    # Computed on creation

    # State
    state: HoldState = HoldState.PENDING

    def __post_init__(self):
        """Compute merkle root on creation."""
        if self.merkle_root is None:
            data = f"{self.id}:{self.brain_id}:{self.action_probs.tobytes().hex()}:{self.timestamp}"
            if self.parent_merkle:
                data = f"{self.parent_merkle}:{data}"
            self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]

    @property
    def ai_choice(self) -> int:
        """What the AI would choose."""
        return int(np.argmax(self.action_probs))

    @property
    def ai_confidence(self) -> float:
        """Confidence in AI's top choice."""
        return float(np.max(self.action_probs))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for CASCADE observation - includes full informational wealth."""
        d = {
            'id': self.id,
            'brain_id': self.brain_id,
            'action_probs': self.action_probs.tolist(),
            'ai_choice': self.ai_choice,
            'ai_confidence': self.ai_confidence,
            'value': self.value,
            'timestamp': self.timestamp,
            'merkle_root': self.merkle_root,
            'parent_merkle': self.parent_merkle,
            'state': self.state.value,
            'observation': self.observation,
        }

        # Include all available wealth
        if self.action_labels is not None:
            d['action_labels'] = self.action_labels
        if self.latent is not None:
            d['latent'] = self.latent.tolist() if hasattr(self.latent, 'tolist') else self.latent
        if self.attention is not None:
            d['attention'] = self.attention
        if self.features is not None:
            d['features'] = self.features
        if self.imagination is not None:
            d['imagination'] = self.imagination
        if self.logits is not None:
            d['logits'] = self.logits.tolist() if hasattr(self.logits, 'tolist') else self.logits
        if self.reasoning is not None:
            d['reasoning'] = self.reasoning
        if self.world_prediction is not None:
            d['world_prediction'] = self.world_prediction

        return _sanitize(d)

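# --- Example (editor's sketch) ---------------------------------------------
# Constructing a HoldPoint with only the required fields; the probabilities
# and observation below are toy values, not part of the library:
#
#     probs = np.array([0.1, 0.7, 0.2])
#     point = HoldPoint(action_probs=probs, value=0.42,
#                       observation={"step": 0}, brain_id="demo_brain")
#     assert point.ai_choice == 1                # argmax of probs
#     assert abs(point.ai_confidence - 0.7) < 1e-9
#     assert point.merkle_root is not None       # computed in __post_init__
# ----------------------------------------------------------------------------
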
@dataclass
class HoldResolution:
    """
    The resolution of a hold point.

    Either the human accepted, overrode, or it timed out.
    Links back to the hold point, forming a provenance chain.
    """
    hold_point: HoldPoint  # The hold that was resolved
    action: int            # Final action taken

    # Resolution details
    was_override: bool                     # True if human overrode AI
    override_source: Optional[str] = None  # Who/what overrode ("human", "policy", etc.)

    # Timing
    hold_duration: float = 0.0  # How long was held
    timestamp: float = field(default_factory=time.time)

    # Merkle linkage
    merkle_root: Optional[str] = None

    def __post_init__(self):
        """Compute merkle root."""
        if self.merkle_root is None:
            data = f"{self.hold_point.merkle_root}:{self.action}:{self.was_override}:{self.timestamp}"
            self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for CASCADE observation."""
        d = {
            'hold_id': self.hold_point.id,
            'hold_merkle': self.hold_point.merkle_root,
            'action': self.action,
            'ai_choice': self.hold_point.ai_choice,
            'was_override': self.was_override,
            'override_source': self.override_source,
            'hold_duration': self.hold_duration,
            'merkle_root': self.merkle_root,
            'timestamp': self.timestamp,
        }
        return _sanitize(d)

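# --- Provenance check (editor's sketch) -------------------------------------
# Each HoldResolution hash commits to its HoldPoint's hash, so a link can be
# re-verified from the fields alone. This helper is not part of the library;
# it simply mirrors the formula in __post_init__ above:
#
#     def verify_link(res: HoldResolution) -> bool:
#         data = (f"{res.hold_point.merkle_root}:{res.action}:"
#                 f"{res.was_override}:{res.timestamp}")
#         return hashlib.sha256(data.encode()).hexdigest()[:16] == res.merkle_root
# ------------------------------------------------------------------------------
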
class Hold:
    """
    The HOLD system - manages inference-level halts.

    Singleton pattern - one Hold system per process.

    Usage:
        hold = Hold.get()

        # Register listeners (for UI, visualization, etc.)
        hold.register_listener(my_callback)

        # From within a brain's forward() method:
        resolution = hold.yield_point(
            action_probs=probs,
            value=value,
            observation=obs,
            brain_id="brain_001"
        )
        # Blocks until resolution!

        # From UI/control thread:
        hold.accept()  # or
        hold.override(action=3, source="human")
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if self._initialized:
            return

        # State
        self._current_hold: Optional[HoldPoint] = None
        self._resolution_event = threading.Event()
        self._resolution: Optional[HoldResolution] = None

        # Chain
        self._last_merkle: Optional[str] = None
        self._hold_count = 0
        self._override_count = 0

        # Callbacks - interfaces register here to receive hold points
        self._listeners: List[Callable[[HoldPoint], None]] = []

        # Settings
        self.timeout: float = 30.0      # Default timeout (seconds)
        self.auto_accept: bool = False  # If True, don't block, just observe

        # CASCADE graph for this session
        self._causation_graph = CausationGraph()

        self._initialized = True
        print("[HOLD] system initialized (cascade-lattice)")

    @classmethod
    def get(cls) -> 'Hold':
        """Get the singleton instance."""
        return cls()

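    # --- Singleton behavior (editor's note) ---------------------------------
    # __new__ uses double-checked locking, so every construction path yields
    # the same object and __init__'s body runs only once per process:
    #
    #     assert Hold.get() is Hold.get() is Hold()
    # -------------------------------------------------------------------------
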
    def register_listener(self, callback: Callable[[HoldPoint], None]):
        """
        Register a listener for hold points.

        The callback receives HoldPoint when inference halts.
        Use this to connect visualizations, UIs, etc.
        """
        self._listeners.append(callback)
        print(f"[REGISTER] Registered HOLD listener: {callback.__name__ if hasattr(callback, '__name__') else callback}")

    def unregister_listener(self, callback: Callable):
        """Remove a listener."""
        if callback in self._listeners:
            self._listeners.remove(callback)

    def yield_point(
        self,
        action_probs: np.ndarray,
        value: float,
        observation: Dict[str, Any],
        brain_id: str,
        # === INFORMATIONAL WEALTH ===
        action_labels: Optional[List[str]] = None,
        latent: Optional[np.ndarray] = None,
        attention: Optional[Dict[str, float]] = None,
        features: Optional[Dict[str, float]] = None,
        imagination: Optional[Dict[int, Dict]] = None,
        logits: Optional[np.ndarray] = None,
        reasoning: Optional[List[str]] = None,
        world_prediction: Optional[Dict[str, Any]] = None,
        # === END WEALTH ===
        blocking: bool = True,
    ) -> HoldResolution:
        """
        Create a hold point and yield for resolution.

        This is called from within a brain's forward() method.
        Blocks until resolved (or timeout).

        Args:
            action_probs: The decision matrix (probability distribution)
            value: Predicted value
            observation: What the brain observed
            brain_id: Identifier for the brain

            INFORMATIONAL WEALTH (all optional, but improves human understanding):
            action_labels: Names for each action ["FORWARD", "BACK", "LEFT", ...]
            latent: Model's latent state/activations
            attention: Attention weights {"position": 0.7, "health": 0.3}
            features: Feature activations {"spatial": 0.8, "danger": 0.2}
            imagination: Per-action predictions {0: {"trajectory": [...], "expected_value": 0.5}}
            logits: Raw pre-softmax logits
            reasoning: Text explanations ["High reward expected", ...]
            world_prediction: World model predictions {"pos_delta": [1,0,0]}

            blocking: If False, returns immediately with AI choice

        Returns:
            HoldResolution with the final action
        """
        # Create hold point with full wealth
        hold = HoldPoint(
            action_probs=action_probs,
            value=value,
            observation=observation,
            brain_id=brain_id,
            action_labels=action_labels,
            latent=latent,
            attention=attention,
            features=features,
            imagination=imagination,
            logits=logits,
            reasoning=reasoning,
            world_prediction=world_prediction,
            parent_merkle=self._last_merkle,
        )

        # Observe the hold point in CASCADE
        sdk_observe(
            model_id=brain_id,
            input_data=observation,
            output_data={**hold.to_dict(), 'event_type': 'hold_point'},
        )

        self._hold_count += 1

        # Non-blocking mode - just observe and return AI choice
        if not blocking or self.auto_accept:
            resolution = HoldResolution(
                hold_point=hold,
                action=hold.ai_choice,
                was_override=False,
                hold_duration=0.0,
            )
            self._observe_resolution(resolution)
            return resolution

        # Set as current hold
        self._current_hold = hold
        self._resolution_event.clear()
        self._resolution = None

        # Notify listeners
        for listener in self._listeners:
            try:
                listener(hold)
            except Exception as e:
                print(f"⚠️ HOLD listener error: {e}")

        # Print hold info
        print(f"\n{'═' * 50}")
        print(f"🛑 HOLD #{self._hold_count}")
        print(f"   Merkle: {hold.merkle_root}")
        ai_label = hold.action_labels[hold.ai_choice] if hold.action_labels else str(hold.ai_choice)
        print(f"   AI Choice: {ai_label} (confidence: {hold.ai_confidence:.2%})")
        print(f"   Value: {hold.value:.4f}")

        # Show probabilities with labels
        if hold.action_labels:
            prob_str = ', '.join(f'{hold.action_labels[i]}:{p:.2f}' for i, p in enumerate(hold.action_probs))
        else:
            prob_str = ', '.join(f'{i}:{p:.2f}' for i, p in enumerate(hold.action_probs))
        print(f"   Probabilities: {prob_str}")

        # Show available wealth
        wealth = []
        if hold.latent is not None: wealth.append("latent")
        if hold.attention is not None: wealth.append("attention")
        if hold.features is not None: wealth.append("features")
        if hold.imagination is not None: wealth.append("imagination")
        if hold.reasoning is not None: wealth.append("reasoning")
        if wealth:
            print(f"   Wealth: {', '.join(wealth)}")

        print(f"   Waiting for resolution (timeout: {self.timeout}s)...")
        print(f"{'═' * 50}")

        # Block until resolution or timeout
        start_time = time.time()
        resolved = self._resolution_event.wait(timeout=self.timeout)
        hold_duration = time.time() - start_time

        if resolved and self._resolution:
            resolution = self._resolution
            resolution.hold_duration = hold_duration
        else:
            # Timeout - use AI choice
            hold.state = HoldState.TIMEOUT
            resolution = HoldResolution(
                hold_point=hold,
                action=hold.ai_choice,
                was_override=False,
                override_source="timeout",
                hold_duration=hold_duration,
            )
            print(f"[TIMEOUT] HOLD timeout - accepting AI choice: {hold.ai_choice}")

        # Observe resolution
        self._observe_resolution(resolution)

        # Clear state
        self._current_hold = None
        self._resolution = None

        return resolution

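    # --- Non-blocking use (editor's sketch) ----------------------------------
    # With blocking=False (or hold.auto_accept = True), yield_point records
    # the hold in CASCADE and immediately returns the AI's own choice; the
    # toy arguments below are illustrative:
    #
    #     hold = Hold.get()
    #     res = hold.yield_point(action_probs=np.array([0.2, 0.8]),
    #                            value=0.0, observation={}, brain_id="b0",
    #                            blocking=False)
    #     assert res.action == 1 and not res.was_override
    # --------------------------------------------------------------------------
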
    def resolve(self, action: int, source: str = "human"):
        """
        Resolve the current hold with an action.

        Called by UI/interface when human makes a choice.

        Args:
            action: The chosen action
            source: Who resolved it ("human", "policy", etc.)
        """
        if self._current_hold is None:
            print("[WARN] No active hold to resolve")
            return

        hold = self._current_hold
        was_override = (action != hold.ai_choice)

        if was_override:
            hold.state = HoldState.OVERRIDDEN
            self._override_count += 1
        else:
            hold.state = HoldState.ACCEPTED

        self._resolution = HoldResolution(
            hold_point=hold,
            action=action,
            was_override=was_override,
            override_source=source if was_override else None,
        )

        print(f"[RESOLVE] HOLD resolved: action={action}, override={was_override}")
        self._resolution_event.set()

    def accept(self):
        """Accept AI's choice for current hold."""
        if self._current_hold:
            self.resolve(self._current_hold.ai_choice, source="accept")

    def override(self, action: int, source: str = "human"):
        """Override with a different action."""
        self.resolve(action, source)

    def cancel(self):
        """Cancel current hold without resolution."""
        if self._current_hold:
            self._current_hold.state = HoldState.CANCELLED
            self._resolution = HoldResolution(
                hold_point=self._current_hold,
                action=self._current_hold.ai_choice,
                was_override=False,
                override_source="cancelled",
            )
            self._resolution_event.set()

    def _observe_resolution(self, resolution: HoldResolution):
        """Record resolution to CASCADE."""
        sdk_observe(
            model_id=resolution.hold_point.brain_id,
            input_data=resolution.hold_point.to_dict(),
            output_data={**resolution.to_dict(), 'event_type': 'hold_resolution'},
        )

        # Update chain
        self._last_merkle = resolution.merkle_root

        # Add to causation graph
        link = CausationLink(
            from_event=resolution.hold_point.merkle_root,
            to_event=resolution.merkle_root,
            causation_type="hold_resolved",
            strength=1.0 if resolution.was_override else 0.5,
            explanation=f"Override: {resolution.was_override}, Action: {resolution.action}",
        )
        self._causation_graph.add_link(link)

    @property
    def current_hold(self) -> Optional[HoldPoint]:
        """Get current active hold point (if any)."""
        return self._current_hold

    @property
    def stats(self) -> Dict[str, Any]:
        """Get hold statistics."""
        return {
            'total_holds': self._hold_count,
            'overrides': self._override_count,
            'override_rate': self._override_count / max(self._hold_count, 1),
            'last_merkle': self._last_merkle,
        }


class HoldAwareMixin:
    """
    Mixin for brains that support HOLD.

    Add this to your Brain class to enable inference-level halts.

    Usage:
        class MyBrain(HoldAwareMixin, BaseBrain):
            def forward(self, inputs):
                # Your inference code
                return {"action_probs": probs, "value": value}

        brain = MyBrain()
        brain.enable_hold()

        # Now forward_with_hold() will pause for human input
        output = brain.forward_with_hold(inputs)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._hold_system = Hold.get()
        self._hold_enabled = True
        self._brain_id = getattr(self, 'id', hashlib.sha256(str(id(self)).encode()).hexdigest()[:16])

    def forward_with_hold(
        self,
        inputs: Dict[str, Any],
        blocking: bool = True,
    ) -> Dict[str, Any]:
        """
        Forward pass with HOLD support.

        Call this instead of forward() to enable hold points.
        """
        # Get decision matrix from normal forward
        output = self.forward(inputs)

        if not self._hold_enabled:
            return output

        action_probs = output.get('action_probs', None)
        if action_probs is None:
            return output

        # Get imagination if available (DreamerBrain, etc.)
        imagined = None
        if hasattr(self, 'imagine'):
            try:
                imagined = self.imagine(horizon=15)
            except Exception:
                pass

        # Yield to hold system. Imagined futures travel through the
        # `imagination` parameter, matching yield_point's signature.
        resolution = self._hold_system.yield_point(
            action_probs=np.array(action_probs),
            value=float(output.get('value', 0.0)),
            observation=inputs,
            brain_id=self._brain_id,
            imagination=imagined,
            blocking=blocking,
        )

        # Update output with resolved action
        output['action'] = resolution.action
        output['hold_resolution'] = resolution.to_dict()
        output['was_override'] = resolution.was_override

        return output

    def enable_hold(self):
        """Enable HOLD for this brain."""
        self._hold_enabled = True

    def disable_hold(self):
        """Disable HOLD (normal inference)."""
        self._hold_enabled = False

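# --- Mixin usage (editor's sketch) ------------------------------------------
# A hypothetical brain wired for HOLD; the uniform probabilities and the
# observation dict are placeholders, not part of this module:
#
#     class MyBrain(HoldAwareMixin):
#         def forward(self, inputs):
#             probs = np.ones(4) / 4.0
#             return {"action_probs": probs, "value": 0.0}
#
#     brain = MyBrain()
#     out = brain.forward_with_hold({"obs": 1}, blocking=False)
#     print(out["action"], out["was_override"])   # 0 False (auto-accepted)
# ------------------------------------------------------------------------------
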
# Demo
def _demo_hold():
    """Demonstrate HOLD system."""
    print("=" * 60)
    print("HOLD SYSTEM DEMO")
    print("=" * 60)

    # Get hold system
    hold = Hold.get()
    hold.timeout = 10.0

    def on_hold(point: HoldPoint):
        print(f"\n🔔 Listener received hold: {point.id}")

    hold.register_listener(on_hold)

    def brain_loop():
        for step in range(3):
            probs = np.random.dirichlet(np.ones(8))
            resolution = hold.yield_point(
                action_probs=probs,
                value=np.random.random(),
                observation={'step': step},
                brain_id='demo_brain',
            )
            print(f"Brain received: action={resolution.action}, override={resolution.was_override}")

    def human_input():
        for i in range(3):
            time.sleep(2)
            if hold.current_hold:
                if i % 2 == 0:
                    hold.accept()
                else:
                    hold.override(7, source="demo_human")

    brain_thread = threading.Thread(target=brain_loop)
    human_thread = threading.Thread(target=human_input)

    brain_thread.start()
    human_thread.start()

    brain_thread.join()
    human_thread.join()

    print(f"\n{'=' * 60}")
    print("SESSION STATS")
    print(hold.stats)


if __name__ == "__main__":
    _demo_hold()
cascade/hold/session.py
ADDED
@@ -0,0 +1,707 @@
"""
HOLD Session - Arcade-Style Inference Interception
══════════════════════════════════════════════════════════

"Pause the machine. See what it sees. Choose what it chooses."

The arcade layer of HOLD:
- CausationHold: Session management with history
- InferenceStep: Single crystallized moment
- Time travel via state snapshots
- Speed controls and combo tracking

Controls:
    SPACE - Accept model's choice, advance
    1-9   - Override with alternative
    ←/→   - Step back/forward through history
    +/-   - Speed up/slow down auto-advance
    P     - Pause/unpause auto-advance
    ESC   - Exit hold mode
"""

import numpy as np
import time
import json
import hashlib
import threading
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Callable, Tuple
from datetime import datetime
from pathlib import Path
from enum import Enum


class SessionState(Enum):
    """Current state of the hold session."""
    IDLE = "idle"            # Not holding anything
    PAUSED = "paused"        # Frozen, waiting for input
    STEPPING = "stepping"    # Auto-advancing at set speed
    REWINDING = "rewinding"  # Going backwards through history

@dataclass
class InferenceStep:
    """A single crystallized moment of inference."""
    step_id: str
    step_index: int
    timestamp: float

    # What the model sees
    input_context: Dict[str, Any]

    # What the model wants to do
    candidates: List[Dict[str, Any]]  # [{value, probability, metadata}]
    top_choice: Any
    top_probability: float

    # Internal state snapshot (for true rewind)
    hidden_state: Optional[np.ndarray] = None
    attention_weights: Optional[Dict[str, float]] = None

    # What actually happened
    chosen_value: Any = None
    was_override: bool = False
    override_by: str = "model"  # "model" or "human"

    # Provenance
    cascade_hash: Optional[str] = None

    # Private: full state snapshot for true rewind
    _state_snapshot: Optional[Dict[str, Any]] = field(default=None, repr=False)

@dataclass
class HoldSession:
    """A complete hold session with history."""
    session_id: str
    agent_id: str
    started_at: float

    # All steps in order
    steps: List[InferenceStep] = field(default_factory=list)
    current_index: int = 0

    # Arcade stats
    total_steps: int = 0
    human_overrides: int = 0
    correct_predictions: int = 0  # Human guessed what model would do
    combo: int = 0
    max_combo: int = 0

    # Speed control (steps per second, 0 = manual only)
    speed_level: int = 0  # 0=manual, 1=slow, 2=medium, 3=fast, 4=ludicrous
    speed_map: Dict[int, float] = field(default_factory=lambda: {
        0: 0.0,   # Manual
        1: 0.5,   # 2 sec per step
        2: 1.0,   # 1 sec per step
        3: 2.0,   # 0.5 sec per step
        4: 10.0,  # 0.1 sec per step (ludicrous speed)
    })

    # State
    state: SessionState = SessionState.IDLE

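# --- Speed levels (editor's note) --------------------------------------------
# speed_map stores steps-per-second; the auto-advance wait used later in
# _wait_for_input is its reciprocal:
#
#     wait_time = 1.0 / speed_map[level]   # level 2 -> 1.0 s, level 4 -> 0.1 s
#
# Level 0 means manual-only: _wait_for_input blocks with no timeout.
# -------------------------------------------------------------------------------
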
@dataclass
class ArcadeFeedback:
    """Visual/audio feedback cues."""
    message: str
    intensity: float  # 0-1, for glow/shake/etc
    sound_cue: str    # "accept", "override", "combo", "combo_break", "rewind"
    color: Tuple[int, int, int] = (255, 255, 255)

class CausationHold:
    """
    The arcade-layer hold system. Wraps any inference function.

    Features:
    - Session management with full history
    - True state restoration for time travel
    - Speed controls (manual to ludicrous)
    - Combo tracking and high scores

    Usage:
        hold = CausationHold()

        # Start a session
        hold.begin_session(agent_id="agent_123")

        # In inference loop:
        for step in inference_steps:
            choice, feedback = hold.capture(
                input_context={"tokens": tokens},
                candidates=[{"value": "A", "probability": 0.8}, ...]
            )  # Pauses here until user input!

        # Time travel
        hold.rewind(steps=3)
        hold.branch_from(step_index=5, choice_index=2)

        stats = hold.end_session()
    """

    def __init__(self, cascade_bus=None):
        """
        Args:
            cascade_bus: Optional CASCADE event bus for provenance
        """
        self.bus = cascade_bus
        self.session: Optional[HoldSession] = None
        self.callbacks: Dict[str, List[Callable]] = {
            'on_step': [],
            'on_override': [],
            'on_combo': [],
            'on_combo_break': [],
            'on_rewind': [],
            'on_state_restore': [],
        }

        # Thread safety
        self._lock = threading.Lock()
        self._input_event = threading.Event()
        self._user_choice: Optional[Any] = None

        # High scores (persisted)
        self.high_scores_path = Path("data/hold_high_scores.json")
        self.high_scores = self._load_high_scores()

    # ========================================================================
    # SESSION MANAGEMENT
    # ========================================================================

    def begin_session(self, agent_id: str) -> HoldSession:
        """Start a new hold session."""
        session_id = f"hold_{agent_id}_{int(time.time()*1000)}"

        self.session = HoldSession(
            session_id=session_id,
            agent_id=agent_id,
            started_at=time.time(),
        )
        self.session.state = SessionState.PAUSED

        self._emit_cascade("hold_session_start", {
            "session_id": session_id,
            "agent_id": agent_id,
        })

        return self.session

    def end_session(self) -> Dict[str, Any]:
        """End session and return stats."""
        if not self.session:
            return {}

        stats = {
            "session_id": self.session.session_id,
            "agent_id": self.session.agent_id,
            "duration": time.time() - self.session.started_at,
            "total_steps": self.session.total_steps,
            "human_overrides": self.session.human_overrides,
            "correct_predictions": self.session.correct_predictions,
            "max_combo": self.session.max_combo,
            "accuracy": (
                self.session.correct_predictions / max(1, self.session.total_steps)
            ),
        }

        # Check for high score
        self._check_high_score(stats)

        self._emit_cascade("hold_session_end", stats)

        self.session = None
        return stats

    # ========================================================================
    # CAPTURE & ADVANCE - WITH STATE SNAPSHOT FOR TRUE REWIND
    # ========================================================================

    def capture(
        self,
        input_context: Dict[str, Any],
        candidates: List[Dict[str, Any]],
        hidden_state: Optional[np.ndarray] = None,
        attention: Optional[Dict[str, float]] = None,
        state_snapshot: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Any, ArcadeFeedback]:
        """
        Capture an inference step. BLOCKS until user input or auto-advance.

        IMPORTANT: Pass state_snapshot for true rewind capability.
        This should be a complete snapshot of the model's internal state
        that can be restored to allow execution from this decision point
        with a different choice.

        This is NOT prediction - you will ACTUALLY execute the choice and
        see REAL outcomes. If you don't like them, rewind and try again.

        Args:
            input_context: What the model is looking at
            candidates: List of {value, probability, ...} options
            hidden_state: Optional internal state snapshot (deprecated, use state_snapshot)
            attention: Optional attention weights
            state_snapshot: Complete model state for TRUE rewind capability

        Returns:
            (chosen_value, feedback) - The value to use and arcade feedback
        """
        if not self.session:
            # No session = passthrough, just return top choice
            return candidates[0]['value'], ArcadeFeedback("", 0, "")

        # Sort candidates by probability
        candidates = sorted(candidates, key=lambda x: x.get('probability', 0), reverse=True)
        top = candidates[0]

        # Merge hidden_state into state_snapshot if provided separately
        if state_snapshot is None and hidden_state is not None:
            state_snapshot = {'hidden_state': hidden_state}
        elif state_snapshot is not None and hidden_state is not None:
            state_snapshot['hidden_state'] = hidden_state

        # Create step - this is a CHECKPOINT for true rewind
        step = InferenceStep(
            step_id=f"step_{self.session.total_steps}",
            step_index=self.session.total_steps,
            timestamp=time.time(),
            input_context=input_context,
            candidates=candidates,
            top_choice=top['value'],
            top_probability=top.get('probability', 1.0),
            hidden_state=hidden_state,
            attention_weights=attention,
        )

        # Store state snapshot for TRUE rewind (not just history navigation)
        if state_snapshot is not None:
            step._state_snapshot = state_snapshot

        # Compute merkle hash for provenance
        step.cascade_hash = self._compute_step_hash(step)

        # Add to history
        with self._lock:
            self.session.steps.append(step)
            self.session.current_index = len(self.session.steps) - 1
            self.session.total_steps += 1

        # Emit step event
        self._emit_callback('on_step', step)
        self._emit_cascade("hold_step", {
            "step_index": step.step_index,
            "top_choice": str(top['value']),
            "top_prob": top.get('probability', 1.0),
            "num_candidates": len(candidates),
            "has_snapshot": state_snapshot is not None,
            "merkle": step.cascade_hash,
        })

        # Wait for input
        choice, feedback = self._wait_for_input(step)

        # Record what happened
        step.chosen_value = choice
        step.was_override = (choice != top['value'])
        step.override_by = "human" if step.was_override else "model"

        if step.was_override:
            self.session.human_overrides += 1
            self._emit_callback('on_override', step, choice)

        return choice, feedback

    def _wait_for_input(self, step: InferenceStep) -> Tuple[Any, ArcadeFeedback]:
        """Wait for user input or auto-advance timer."""

        # Manual mode = wait indefinitely
        if self.session.speed_level == 0:
            self._input_event.clear()
            self._input_event.wait()  # Blocks until input()

            choice = self._user_choice
            self._user_choice = None

        else:
            # Auto-advance mode
            speed = self.session.speed_map[self.session.speed_level]
            wait_time = 1.0 / speed if speed > 0 else float('inf')

            self._input_event.clear()
            got_input = self._input_event.wait(timeout=wait_time)

            if got_input and self._user_choice is not None:
                choice = self._user_choice
                self._user_choice = None
            else:
                # Auto-accepted
                choice = step.top_choice

        # Generate feedback
        return choice, self._generate_feedback(step, choice)

    def input(self, choice: Any):
        """
        Provide user input. Call from UI thread.

        Args:
            choice: The value to use (or index into candidates)
        """
        # Nothing captured yet means there is no current step to resolve
        # (same guard as accept() below).
        if not self.session or not self.session.steps:
            return

        current_step = self.session.steps[self.session.current_index]

        # Handle index input (1-9 keys)
        if isinstance(choice, int) and 0 <= choice < len(current_step.candidates):
            choice = current_step.candidates[choice]['value']

        self._user_choice = choice
        self._input_event.set()

    def accept(self):
        """Accept model's top choice (SPACE key)."""
        if not self.session or not self.session.steps:
            return

        current = self.session.steps[self.session.current_index]
        self.input(current.top_choice)

    def override(self, index: int):
        """Override with candidate at index (1-9 keys)."""
        self.input(index)

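    # --- UI wiring (editor's sketch) ------------------------------------------
    # How a key handler might drive the controls from the module docstring;
    # the `key` strings are hypothetical and depend on your UI toolkit:
    #
    #     def on_key(key: str, hold: "CausationHold"):
    #         if key == "space":
    #             hold.accept()                 # take the model's top choice
    #         elif key.isdigit() and key != "0":
    #             hold.override(int(key) - 1)   # 1-9 -> candidate index 0-8
    #         elif key == "left":
    #             hold.rewind(1)
    #         elif key == "right":
    #             hold.forward(1)
    #         elif key == "+":
    #             hold.speed_up()
    #         elif key == "-":
    #             hold.speed_down()
    # ----------------------------------------------------------------------------
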
    # ========================================================================
    # NAVIGATION (TIME TRAVEL) - TRUE STATE RESTORATION
    # ========================================================================

    def rewind(self, steps: int = 1, restore_state: bool = True) -> Optional[InferenceStep]:
        """
        Go back in history with optional state restoration.

        This is NOT simulation - we actually restore the model's internal state
        to the snapshot taken at that decision point. From there, you can
        execute a different branch and see REAL outcomes.

        Args:
            steps: Number of steps to go back
            restore_state: If True, actually restore hidden_state to model

        Returns:
            The step we rewound to
        """
        if not self.session:
            return None

        with self._lock:
            new_index = max(0, self.session.current_index - steps)
            if new_index != self.session.current_index:
                self.session.current_index = new_index
                self.session.state = SessionState.REWINDING

                step = self.session.steps[new_index]

                # TRUE STATE RESTORATION - restore if either the legacy
                # hidden_state or a full _state_snapshot is available
                if restore_state and (step.hidden_state is not None
                                      or step._state_snapshot is not None):
                    self._restore_state(step)

                self._emit_callback('on_rewind', step, -steps)

                return step
        return None

    def _restore_state(self, step: InferenceStep):
        """
        Restore model state from a snapshot.

        This is the key that makes execution + rewind possible.
        The model's internal state is set back to exactly what it was
        at this decision point, allowing you to branch differently.
        """
        if step.hidden_state is None and step._state_snapshot is None:
            return

        # Emit state restoration event - hooked components can restore themselves
        self._emit_callback('on_state_restore', step)
        self._emit_cascade("state_restored", {
            "step_index": step.step_index,
            "merkle": step.cascade_hash,
            "had_hidden_state": step.hidden_state is not None,
            "had_snapshot": step._state_snapshot is not None,
        })

    def branch_from(self, step_index: int, choice_index: int) -> Optional[InferenceStep]:
        """
        Rewind to a step and immediately choose a different branch.

        This is the core gameplay loop:
        1. Rewind to decision point
        2. Choose different option
        3. Execute and see what happens
        4. Repeat until satisfied

        Args:
            step_index: Which decision point to branch from
            choice_index: Which candidate to choose (0 = model's choice)

        Returns:
            The step after branching (with state restored)
        """
        step = self.jump_to(step_index)
        if step is None:
            return None

        # Restore state
        self._restore_state(step)

        # Set up the override
        if choice_index < len(step.candidates):
            self.override(choice_index)
        else:
            self.accept()

        return step

    def forward(self, steps: int = 1) -> Optional[InferenceStep]:
        """Go forward in history (if we've rewound)."""
        if not self.session:
            return None

        with self._lock:
            max_index = len(self.session.steps) - 1
            new_index = min(max_index, self.session.current_index + steps)
            if new_index != self.session.current_index:
                self.session.current_index = new_index

                step = self.session.steps[new_index]
                self._emit_callback('on_rewind', step, steps)

                return step
        return None

    def jump_to(self, index: int) -> Optional[InferenceStep]:
        """Jump to specific step."""
        if not self.session:
            return None

        with self._lock:
            index = max(0, min(index, len(self.session.steps) - 1))
            self.session.current_index = index
            return self.session.steps[index]

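    # --- State restoration hook (editor's sketch) ------------------------------
    # _restore_state only emits events; the component that owns the model must
    # register a callback and apply the snapshot itself. A hedged example,
    # where `model.load_state(...)` is a placeholder for your own API:
    #
    #     def restore(step: InferenceStep):
    #         snap = step._state_snapshot or {"hidden_state": step.hidden_state}
    #         model.load_state(snap)  # hypothetical: reload hidden state / cache
    #
    #     hold.on("on_state_restore", restore)
    # ------------------------------------------------------------------------------
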
    # ========================================================================
    # SPEED CONTROL
    # ========================================================================

    def speed_up(self):
        """Increase auto-advance speed."""
        if self.session:
            self.session.speed_level = min(4, self.session.speed_level + 1)

    def speed_down(self):
        """Decrease auto-advance speed."""
        if self.session:
            self.session.speed_level = max(0, self.session.speed_level - 1)

    def set_speed(self, level: int):
        """Set speed level directly (0-4)."""
        if self.session:
            self.session.speed_level = max(0, min(4, level))

    def pause(self):
        """Pause auto-advance."""
        if self.session:
            self.session.state = SessionState.PAUSED

    def unpause(self):
        """Resume auto-advance."""
        if self.session:
            self.session.state = SessionState.STEPPING

    # ========================================================================
    # PROVENANCE HASHING
    # ========================================================================

    def _compute_step_hash(self, step: InferenceStep) -> str:
        """
        Compute merkle hash for a step.

        This hash uniquely identifies this decision point and allows
        verification that rewind is restoring to the exact right state.
        """
        # Include parent hash for chain integrity. This runs before the
        # step is appended in capture(), so steps[-1] is still the parent.
        parent_hash = ""
        if self.session and len(self.session.steps) > 0:
            prev_step = self.session.steps[-1]
            parent_hash = prev_step.cascade_hash or ""

        content = json.dumps({
            'step_index': step.step_index,
            'timestamp': step.timestamp,
            'top_choice': str(step.top_choice),
            'top_prob': step.top_probability,
            'num_candidates': len(step.candidates),
            'parent_hash': parent_hash,
        }, sort_keys=True)

        return hashlib.sha256(content.encode()).hexdigest()[:16]

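    # --- Chain verification (editor's sketch) ----------------------------------
    # Because each hash commits to its parent, a whole session can be
    # re-verified offline. This helper is illustrative, not part of the API;
    # it mirrors _compute_step_hash exactly:
    #
    #     def verify_chain(steps: List[InferenceStep]) -> bool:
    #         parent = ""
    #         for s in steps:
    #             content = json.dumps({
    #                 'step_index': s.step_index,
    #                 'timestamp': s.timestamp,
    #                 'top_choice': str(s.top_choice),
    #                 'top_prob': s.top_probability,
    #                 'num_candidates': len(s.candidates),
    #                 'parent_hash': parent,
    #             }, sort_keys=True)
    #             if hashlib.sha256(content.encode()).hexdigest()[:16] != s.cascade_hash:
    #                 return False
    #             parent = s.cascade_hash
    #         return True
    # ------------------------------------------------------------------------------
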
    # ========================================================================
    # ARCADE FEEDBACK
    # ========================================================================

    def _generate_feedback(self, step: InferenceStep, choice: Any) -> ArcadeFeedback:
        """Generate arcade-style feedback for a step."""

        is_override = (choice != step.top_choice)

        if is_override:
            # Combo break!
            if self.session.combo > 0:
                self._emit_callback('on_combo_break', self.session.combo)

            self.session.combo = 0

            return ArcadeFeedback(
                message="OVERRIDE",
                intensity=0.8,
                sound_cue="override",
                color=(255, 165, 0),  # Orange
            )

        else:
            # Accepted model choice
            self.session.combo += 1
            self.session.max_combo = max(self.session.max_combo, self.session.combo)

            # Combo milestones
            if self.session.combo in [10, 25, 50, 100]:
                self._emit_callback('on_combo', self.session.combo)
                return ArcadeFeedback(
                    message=f"COMBO x{self.session.combo}!",
                    intensity=1.0,
                    sound_cue="combo",
                    color=(0, 255, 255),  # Cyan
                )

            # Regular accept
            return ArcadeFeedback(
                message="",
                intensity=0.3 + min(0.5, self.session.combo * 0.02),
                sound_cue="accept",
                color=(0, 255, 0),  # Green
            )

    # ========================================================================
    # CALLBACKS
    # ========================================================================

    def on(self, event: str, callback: Callable):
        """Register callback for events."""
        if event in self.callbacks:
            self.callbacks[event].append(callback)

    def _emit_callback(self, event: str, *args):
        """Emit event to callbacks."""
        for cb in self.callbacks.get(event, []):
            try:
                cb(*args)
            except Exception as e:
                print(f"Callback error: {e}")

    # ========================================================================
    # CASCADE PROVENANCE
    # ========================================================================

    def _emit_cascade(self, event_type: str, data: Dict[str, Any]):
        """Emit event to CASCADE bus if available."""
        if self.bus:
            try:
                self.bus.emit(event_type, {
                    **data,
                    "source": "causation_hold",
                    "timestamp": time.time(),
                })
            except Exception:
                pass

    # ========================================================================
    # HIGH SCORES
    # ========================================================================

    def _load_high_scores(self) -> Dict[str, Any]:
        """Load high scores from disk."""
        if self.high_scores_path.exists():
            try:
                return json.loads(self.high_scores_path.read_text())
            except Exception:
                pass
        return {"max_combo": 0, "best_accuracy": 0.0, "total_sessions": 0}

    def _save_high_scores(self):
        """Save high scores to disk."""
        self.high_scores_path.parent.mkdir(parents=True, exist_ok=True)
        self.high_scores_path.write_text(json.dumps(self.high_scores, indent=2))

    def _check_high_score(self, stats: Dict[str, Any]):
        """Check and update high scores."""
        if stats['max_combo'] > self.high_scores['max_combo']:
            self.high_scores['max_combo'] = stats['max_combo']

        if stats['accuracy'] > self.high_scores['best_accuracy']:
            self.high_scores['best_accuracy'] = stats['accuracy']

        # total_sessions changes on every call, so persist unconditionally;
        # saving only when a record is beaten would silently drop the count.
        self.high_scores['total_sessions'] += 1
        self._save_high_scores()

    # ========================================================================
    # DECORATOR FOR EASY WRAPPING
    # ========================================================================

    def intercept(self, granularity: str = "step"):
        """
        Decorator to intercept a function's inference.

        Args:
            granularity: "step" (each call) or "token" (if function yields)
        """
        def decorator(func):
            def wrapper(*args, **kwargs):
                # If no session, passthrough
                if not self.session:
                    return func(*args, **kwargs)

                # Capture the input
                input_context = {
                    "args": str(args)[:200],
                    "kwargs": {k: str(v)[:100] for k, v in kwargs.items()},
                }

                # Get result
                result = func(*args, **kwargs)

                # Create candidates from result
                if isinstance(result, np.ndarray):
                    # For embeddings, show top dimensions
                    top_dims = np.argsort(np.abs(result.flatten()))[-5:][::-1]
                    candidates = [
                        {"value": f"dim_{d}", "probability": float(np.abs(result.flatten()[d]))}
                        for d in top_dims
                    ]
                else:
                    candidates = [{"value": result, "probability": 1.0}]

                # Capture (may block)
                choice, feedback = self.capture(input_context, candidates)

                # The wrapped function's own result is returned unchanged;
                # `choice` is recorded in the step history but does not
                # replace `result` at this granularity.
                return result

            return wrapper
        return decorator
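# --- Decorator usage (editor's sketch) ----------------------------------------
# Wrapping an inference function so each call becomes a hold step; embed()
# is a hypothetical function, and set_speed(2) makes steps auto-advance so
# the sketch does not block waiting for manual input:
#
#     hold = CausationHold()
#     hold.begin_session(agent_id="demo")
#     hold.set_speed(2)  # 1 step per second
#
#     @hold.intercept(granularity="step")
#     def embed(text: str) -> np.ndarray:
#         return np.random.rand(8)
#
#     vec = embed("hello")   # pauses at capture(), then auto-accepts
# --------------------------------------------------------------------------------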
cascade/identity.py
ADDED
@@ -0,0 +1,715 @@
"""
CASCADE Model Identity Layer

Canonical identification for any AI model variant:
- Base models (meta-llama/Llama-3-8B)
- Quantizations (Q4_K_M, Q8_0, AWQ, GPTQ)
- Fine-tunes (LoRA, full, RLHF)
- API endpoints (behavioral fingerprinting)

Every unique model gets a node in the lattice.
Every observation links to its model's node.
The lattice becomes the collective memory of AI behavior.

"Same name, different model, different behavior."
"""

import hashlib
import json
import time
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import Optional, List, Dict, Any
from enum import Enum


class ModelFormat(Enum):
    """Model weight formats."""
    SAFETENSORS = "safetensors"
    PYTORCH = "pytorch"
    GGUF = "gguf"
    GGML = "ggml"
    ONNX = "onnx"
    TENSORRT = "tensorrt"
    OPENVINO = "openvino"
    COREML = "coreml"
    API = "api"  # No weights, just an endpoint
    UNKNOWN = "unknown"


class QuantizationType(Enum):
    """Quantization methods."""
    NONE = "none"  # FP32/FP16/BF16
    GGUF_Q4_0 = "Q4_0"
    GGUF_Q4_K_M = "Q4_K_M"
    GGUF_Q4_K_S = "Q4_K_S"
    GGUF_Q5_0 = "Q5_0"
    GGUF_Q5_K_M = "Q5_K_M"
    GGUF_Q5_K_S = "Q5_K_S"
    GGUF_Q6_K = "Q6_K"
    GGUF_Q8_0 = "Q8_0"
    GPTQ_4BIT = "GPTQ-4bit"
    GPTQ_8BIT = "GPTQ-8bit"
    AWQ_4BIT = "AWQ-4bit"
    BITSANDBYTES_4BIT = "bnb-4bit"
    BITSANDBYTES_8BIT = "bnb-8bit"
    INT8 = "INT8"
    INT4 = "INT4"
    CUSTOM = "custom"


class FineTuneType(Enum):
    """Fine-tuning methods."""
    NONE = "none"
    LORA = "lora"
    QLORA = "qlora"
    FULL = "full"
    RLHF = "rlhf"
    DPO = "dpo"
    ORPO = "orpo"
    CUSTOM = "custom"


@dataclass
class ModelVariant:
    """Describes how a model differs from its base."""
    quantization: str = "none"
    format: str = "unknown"
    bits: Optional[int] = None
    provider: Optional[str] = None  # Who made this variant (e.g., "TheBloke")

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class FineTuneInfo:
    """Describes fine-tuning applied to a model."""
    type: str = "none"
    adapter_id: Optional[str] = None  # HuggingFace adapter ID
    adapter_hash: Optional[str] = None  # Hash of adapter weights
    base_model_root: Optional[str] = None  # Merkle root of base model identity
    dataset_id: Optional[str] = None  # Training dataset

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class BehavioralFingerprint:
    """
    Fingerprint for API models where weights are unavailable.
    Generated by running standard probes and hashing responses.
    """
    probe_responses: List[Dict[str, Any]] = field(default_factory=list)
    probe_hash: Optional[str] = None
    fingerprint_version: int = 1
    generated_at: Optional[float] = None

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class ModelIdentity:
    """
    Canonical identity for any AI model variant.

    This is the node that goes in the lattice.
    All observations of this model link to this identity.
    """
    # === Core Identity ===
    base_model: str  # HuggingFace ID or canonical name
    model_id: str  # Full unique identifier (computed)

    # === Variant Info ===
    variant: ModelVariant = field(default_factory=ModelVariant)
    fine_tune: FineTuneInfo = field(default_factory=FineTuneInfo)

    # === Cryptographic Identity ===
    weight_hash: Optional[str] = None  # SHA256 of weights (if available)
    config_hash: Optional[str] = None  # SHA256 of model config
    tokenizer_hash: Optional[str] = None  # SHA256 of tokenizer

    # === Behavioral Fingerprint (for APIs) ===
    behavioral_fingerprint: Optional[BehavioralFingerprint] = None

    # === Source Info ===
    source_url: Optional[str] = None
    source_revision: Optional[str] = None  # Git commit/tag
    downloaded_at: Optional[float] = None

    # === Lattice Info ===
    parent_root: Optional[str] = None  # Genesis or base model's merkle root
    merkle_root: Optional[str] = None  # This identity's merkle root
    created_at: float = field(default_factory=time.time)

    # === Metadata ===
    parameters: Optional[int] = None  # Parameter count
    context_length: Optional[int] = None
    architecture: Optional[str] = None  # "llama", "mistral", "gpt", etc.
    license: Optional[str] = None

    def __post_init__(self):
        """Compute derived fields."""
        if not self.model_id:
            self.model_id = self.compute_model_id()

    def compute_model_id(self) -> str:
        """
        Compute canonical model ID from components.
        Format: base_model::variant_spec::fine_tune_spec
        """
        parts = [self.base_model]

        # Add variant spec
        if self.variant.quantization != "none":
            parts.append(f"q:{self.variant.quantization}")
        if self.variant.format != "unknown":
            parts.append(f"fmt:{self.variant.format}")
        if self.variant.provider:
            parts.append(f"by:{self.variant.provider}")

        # Add fine-tune spec
        if self.fine_tune.type != "none":
            parts.append(f"ft:{self.fine_tune.type}")
        if self.fine_tune.adapter_id:
            parts.append(f"adapter:{self.fine_tune.adapter_id}")

        return "::".join(parts)

    def compute_merkle_root(self) -> str:
        """Compute merkle root of this identity."""
        # Create canonical representation
        canonical = {
            "base_model": self.base_model,
            "model_id": self.model_id,
            "variant": self.variant.to_dict(),
            "fine_tune": self.fine_tune.to_dict(),
            "weight_hash": self.weight_hash,
            "config_hash": self.config_hash,
            "tokenizer_hash": self.tokenizer_hash,
            "parent_root": self.parent_root,
            "created_at": self.created_at,
        }

        # Add behavioral fingerprint if present
        if self.behavioral_fingerprint:
            canonical["behavioral_fingerprint"] = self.behavioral_fingerprint.probe_hash

        # Hash it
        canonical_json = json.dumps(canonical, sort_keys=True)
        self.merkle_root = hashlib.sha256(canonical_json.encode()).hexdigest()[:16]
        return self.merkle_root

    def finalize(self, parent_root: Optional[str] = None):
        """Finalize identity and compute merkle root."""
        if parent_root:
            self.parent_root = parent_root
        self.merkle_root = self.compute_merkle_root()
        return self

    def to_dict(self) -> dict:
        """Convert to dictionary for serialization."""
        return {
            "base_model": self.base_model,
            "model_id": self.model_id,
            "variant": self.variant.to_dict(),
            "fine_tune": self.fine_tune.to_dict(),
            "weight_hash": self.weight_hash,
            "config_hash": self.config_hash,
            "tokenizer_hash": self.tokenizer_hash,
            "behavioral_fingerprint": self.behavioral_fingerprint.to_dict() if self.behavioral_fingerprint else None,
            "source_url": self.source_url,
            "source_revision": self.source_revision,
            "downloaded_at": self.downloaded_at,
            "parent_root": self.parent_root,
            "merkle_root": self.merkle_root,
            "created_at": self.created_at,
            "parameters": self.parameters,
            "context_length": self.context_length,
            "architecture": self.architecture,
            "license": self.license,
        }

    def to_chain_format(self) -> dict:
        """Convert to provenance chain format for lattice storage."""
        if self.weight_hash:
            model_hash = self.weight_hash
        elif self.behavioral_fingerprint and self.behavioral_fingerprint.probe_hash:
            model_hash = self.behavioral_fingerprint.probe_hash
        else:
            model_hash = "unknown"

        return {
            "session_id": f"model_identity_{self.merkle_root}",
            "model_id": self.model_id,
            "model_hash": model_hash,
            "input_hash": self.base_model,
            "output_hash": None,
            "records": {
                "identity": {
                    "layer_name": "identity",
                    "layer_idx": 0,
                    "state_hash": self.merkle_root,
                    "parent_hashes": [self.parent_root] if self.parent_root else [],
                    "params_hash": self.config_hash,
                    "shape": [self.parameters] if self.parameters else [0],
                    "dtype": "model_identity",
                    "stats": self.to_dict(),
                    "execution_order": 0,
                    "timestamp": self.created_at,
                }
            },
            "external_roots": [self.parent_root] if self.parent_root else [],
            "merkle_root": self.merkle_root,
            "created_at": self.created_at,
            "finalized": True,
        }
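# A minimal sketch of the two derived identifiers above (values are
# illustrative, not output captured from a real registry; created_at is
# pinned so the merkle comparison is deterministic):
_example = ModelIdentity(
    base_model="Llama-3-8B",
    model_id="",  # computed in __post_init__
    variant=ModelVariant(quantization="Q4_K_M", format="gguf", provider="thebloke"),
    created_at=0.0,
)
assert _example.model_id == "Llama-3-8B::q:Q4_K_M::fmt:gguf::by:thebloke"

_same = ModelIdentity(
    base_model="Llama-3-8B",
    model_id="",
    variant=ModelVariant(quantization="Q4_K_M", format="gguf", provider="thebloke"),
    created_at=0.0,
)
# Same fields -> same sorted-key canonical JSON -> same 16-hex-char root.
assert _example.compute_merkle_root() == _same.compute_merkle_root()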
# =============================================================================
# STANDARD PROBES FOR BEHAVIORAL FINGERPRINTING
# =============================================================================

STANDARD_PROBES_V1 = [
    # Deterministic probes (temperature=0)
    {
        "id": "math_simple",
        "prompt": "What is 2+2? Answer with just the number.",
        "params": {"temperature": 0, "max_tokens": 10},
    },
    {
        "id": "capital_france",
        "prompt": "Complete this sentence with one word: The capital of France is",
        "params": {"temperature": 0, "max_tokens": 10},
    },
    {
        "id": "translate_hello",
        "prompt": "Translate to French: Hello",
        "params": {"temperature": 0, "max_tokens": 20},
    },
    {
        "id": "color_sky",
        "prompt": "What color is the sky on a clear day? One word answer:",
        "params": {"temperature": 0, "max_tokens": 10},
    },

    # Capability probes
    {
        "id": "code_simple",
        "prompt": "Write a Python function that adds two numbers. Just the function, no explanation.",
        "params": {"temperature": 0, "max_tokens": 100},
    },
    {
        "id": "reasoning",
        "prompt": "If all cats are mammals and all mammals are animals, are all cats animals? Answer yes or no.",
        "params": {"temperature": 0, "max_tokens": 10},
    },

    # System prompt probe
    {
        "id": "system_role",
        "prompt": "You are a helpful pirate. Say hello.",
        "params": {"temperature": 0, "max_tokens": 50},
        "system": "You are a helpful pirate who speaks like a pirate.",
    },

    # Edge cases
    {
        "id": "empty",
        "prompt": "",
        "params": {"temperature": 0, "max_tokens": 50},
    },
    {
        "id": "repetition",
        "prompt": "Repeat after me exactly: The quick brown fox",
        "params": {"temperature": 0, "max_tokens": 20},
    },
]


def generate_behavioral_fingerprint(
    call_fn,  # Function that takes (prompt, params) and returns a response
    probes: Optional[List[dict]] = None,
    version: int = 1,
) -> BehavioralFingerprint:
    """
    Generate a behavioral fingerprint by running standard probes.

    Args:
        call_fn: Function to call the model. Signature: (prompt, params) -> str
        probes: List of probe configs. Defaults to STANDARD_PROBES_V1.
        version: Fingerprint version number.

    Returns:
        BehavioralFingerprint with hashed responses.
    """
    if probes is None:
        probes = STANDARD_PROBES_V1

    responses = []
    for probe in probes:
        try:
            response = call_fn(probe["prompt"], probe.get("params", {}))
            response_hash = hashlib.sha256(str(response).encode()).hexdigest()[:16]
        except Exception as e:
            response_hash = f"error:{type(e).__name__}"

        responses.append({
            "probe_id": probe["id"],
            "prompt_hash": hashlib.sha256(probe["prompt"].encode()).hexdigest()[:16],
            "response_hash": response_hash,
        })

    # Compute overall fingerprint hash
    fingerprint_data = json.dumps(responses, sort_keys=True)
    probe_hash = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]

    return BehavioralFingerprint(
        probe_responses=responses,
        probe_hash=probe_hash,
        fingerprint_version=version,
        generated_at=time.time(),
    )
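# Sketch of fingerprinting an endpoint whose weights we cannot hash. The
# stub below stands in for a real call_fn that would wrap an HTTP client:
def _fake_call(prompt, params):
    return f"echo:{prompt[:20]}"  # deterministic stand-in for a model response

_fp = generate_behavioral_fingerprint(_fake_call)
assert len(_fp.probe_responses) == len(STANDARD_PROBES_V1)
# Identical responses to identical probes yield an identical probe_hash, so
# two endpoints that hash alike are indistinguishable under this probe set.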
# =============================================================================
# MODEL IDENTITY FACTORY
# =============================================================================

def detect_quantization(model_path: str) -> str:
    """Detect quantization from model path or name."""
    path_lower = model_path.lower()

    # GGUF quantizations (the more specific K-quants are checked first)
    for q in ["q4_k_m", "q4_k_s", "q4_0", "q5_k_m", "q5_k_s", "q5_0", "q6_k", "q8_0"]:
        if q in path_lower:
            return q.upper()

    # GPTQ
    if "gptq" in path_lower:
        if "4bit" in path_lower or "-4b" in path_lower:
            return "GPTQ-4bit"
        elif "8bit" in path_lower or "-8b" in path_lower:
            return "GPTQ-8bit"
        return "GPTQ"

    # AWQ
    if "awq" in path_lower:
        return "AWQ-4bit"

    # BitsAndBytes
    if "bnb" in path_lower or "bitsandbytes" in path_lower:
        if "4bit" in path_lower:
            return "bnb-4bit"
        return "bnb-8bit"

    return "none"


def detect_format(model_path: str) -> str:
    """Detect model format from path."""
    path_lower = model_path.lower()

    if ".gguf" in path_lower:
        return "gguf"
    elif ".ggml" in path_lower:
        return "ggml"
    elif "safetensors" in path_lower:
        return "safetensors"
    elif ".onnx" in path_lower:
        return "onnx"
    elif ".bin" in path_lower or "pytorch" in path_lower:
        return "pytorch"
    elif "api" in path_lower or "http" in path_lower:
        return "api"

    return "unknown"


def detect_provider(model_path: str) -> Optional[str]:
    """Detect who made this variant."""
    path_lower = model_path.lower()

    providers = [
        "thebloke",
        "unsloth",
        "mlx-community",
        "bartowski",
        "mradermacher",
        "turboderp",
    ]

    for provider in providers:
        if provider in path_lower:
            return provider

    return None
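# The detectors are substring heuristics over the repo id or file path.
# A few illustrative inputs (hypothetical names, chosen to trip each rule):
assert detect_quantization("TheBloke/Llama-3-8B-GGUF-Q4_K_M") == "Q4_K_M"
assert detect_format("models/llama-3-8b.Q4_K_M.gguf") == "gguf"
assert detect_provider("TheBloke/Llama-3-8B-GGUF") == "thebloke"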
def create_model_identity(
    model_id: str,
    weights_path: Optional[Path] = None,
    config: Optional[dict] = None,
    parent_root: Optional[str] = None,
    behavioral_fingerprint: Optional[BehavioralFingerprint] = None,
    **kwargs,
) -> ModelIdentity:
    """
    Factory function to create a ModelIdentity from various inputs.

    Args:
        model_id: HuggingFace model ID or local path
        weights_path: Path to weights file (for hashing)
        config: Model config dict
        parent_root: Merkle root of parent (genesis or base model)
        behavioral_fingerprint: Pre-computed fingerprint for APIs
        **kwargs: Additional fields (parameters, context_length, etc.)

    Returns:
        Finalized ModelIdentity ready for the lattice
    """
    # Parse base model from the full ID,
    # e.g. "TheBloke/Llama-3-8B-GGUF" -> base "Llama-3-8B"
    base_model = kwargs.pop("base_model", None)
    if not base_model:
        # Try to extract base from model_id
        parts = model_id.split("/")
        if len(parts) >= 2:
            name = parts[-1]
            # Remove common suffixes
            for suffix in ["-GGUF", "-GPTQ", "-AWQ", "-fp16", "-bf16", "-GGML"]:
                name = name.replace(suffix, "")
            base_model = name
        else:
            base_model = model_id

    # Detect variant info
    quantization = detect_quantization(model_id)
    format_type = detect_format(model_id)
    provider = detect_provider(model_id)

    # Extract bits from quantization
    bits = None
    if "4" in quantization:
        bits = 4
    elif "5" in quantization:
        bits = 5
    elif "6" in quantization:
        bits = 6
    elif "8" in quantization:
        bits = 8

    variant = ModelVariant(
        quantization=quantization,
        format=format_type,
        bits=bits,
        provider=provider,
    )

    # Hash weights if available
    weight_hash = None
    if weights_path and Path(weights_path).exists():
        # For large files, hash the size plus the first and last 1MB
        path = Path(weights_path)
        size = path.stat().st_size
        hasher = hashlib.sha256()
        hasher.update(str(size).encode())

        with open(path, "rb") as f:
            # First 1MB
            hasher.update(f.read(1024 * 1024))
            # Last 1MB
            if size > 2 * 1024 * 1024:
                f.seek(-1024 * 1024, 2)
                hasher.update(f.read())

        weight_hash = hasher.hexdigest()[:16]

    # Hash config if available
    config_hash = None
    if config:
        config_json = json.dumps(config, sort_keys=True)
        config_hash = hashlib.sha256(config_json.encode()).hexdigest()[:16]

    # Create identity
    identity = ModelIdentity(
        base_model=base_model,
        model_id="",  # Will be computed
        variant=variant,
        fine_tune=FineTuneInfo(),
        weight_hash=weight_hash,
        config_hash=config_hash,
        behavioral_fingerprint=behavioral_fingerprint,
        parent_root=parent_root,
        **kwargs,
    )

    # Compute model_id and merkle_root
    identity.model_id = identity.compute_model_id()
    identity.finalize(parent_root)

    return identity
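# Putting the factory together (illustrative repo id; no weights on disk, so
# weight_hash stays None and only the detected variant feeds the identity):
_identity = create_model_identity("TheBloke/Llama-3-8B-GGUF", parameters=8_030_000_000)
assert _identity.base_model == "Llama-3-8B"             # "-GGUF" suffix stripped
assert _identity.model_id == "Llama-3-8B::by:thebloke"  # quant/format undetected here
assert _identity.merkle_root is not None                # finalized, 16 hex chars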
# =============================================================================
# MODEL REGISTRY (Lattice Integration)
# =============================================================================

class ModelRegistry:
    """
    Registry of model identities in the lattice.

    Provides:
    - Get or create model identity
    - Link observations to model identities
    - Query models by various criteria
    """

    def __init__(self, lattice_dir: Optional[Path] = None, genesis_root: Optional[str] = None):
        self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
        self.models_dir = self.lattice_dir / "models"
        self.models_dir.mkdir(parents=True, exist_ok=True)

        # Genesis root (models link to this if they have no base model)
        self.genesis_root = genesis_root or "89f940c1a4b7aa65"

        # Cache of loaded identities
        self._cache: Dict[str, ModelIdentity] = {}
        self._load_all()

    def _load_all(self):
        """Load all model identities from disk."""
        for json_file in self.models_dir.glob("*.json"):
            try:
                data = json.loads(json_file.read_text())
                identity = self._dict_to_identity(data)
                self._cache[identity.merkle_root] = identity
            except Exception as e:
                print(f"Error loading {json_file}: {e}")

    def _dict_to_identity(self, data: dict) -> ModelIdentity:
        """Convert a dict back to a ModelIdentity."""
        variant_data = data.get("variant", {})
        fine_tune_data = data.get("fine_tune", {})
        fingerprint_data = data.get("behavioral_fingerprint")

        return ModelIdentity(
            base_model=data["base_model"],
            model_id=data["model_id"],
            variant=ModelVariant(**variant_data),
            fine_tune=FineTuneInfo(**fine_tune_data),
            weight_hash=data.get("weight_hash"),
            config_hash=data.get("config_hash"),
            tokenizer_hash=data.get("tokenizer_hash"),
            behavioral_fingerprint=BehavioralFingerprint(**fingerprint_data) if fingerprint_data else None,
            source_url=data.get("source_url"),
            source_revision=data.get("source_revision"),
            downloaded_at=data.get("downloaded_at"),
            parent_root=data.get("parent_root"),
            merkle_root=data.get("merkle_root"),
            created_at=data.get("created_at", time.time()),
            parameters=data.get("parameters"),
            context_length=data.get("context_length"),
            architecture=data.get("architecture"),
            license=data.get("license"),
        )

    def _save_identity(self, identity: ModelIdentity):
        """Save an identity to disk."""
        filename = f"{identity.merkle_root}.json"
        filepath = self.models_dir / filename
        filepath.write_text(json.dumps(identity.to_dict(), indent=2))

    def get_or_create(
        self,
        model_id: str,
        **kwargs,
    ) -> ModelIdentity:
        """
        Get an existing model identity or create a new one.

        If the model already exists in the registry, returns the existing
        identity. Otherwise creates a new identity linked to genesis or to
        its base model.
        """
        # Check if we have this model already
        for identity in self._cache.values():
            if identity.model_id == model_id or identity.base_model == model_id:
                return identity

        # Determine parent:
        # if this is a variant, try to find its base model
        parent_root = kwargs.pop("parent_root", None)
        if not parent_root:
            base = kwargs.get("base_model")
            if base:
                for identity in self._cache.values():
                    if identity.base_model == base and identity.variant.quantization == "none":
                        parent_root = identity.merkle_root
                        break

        # Default to genesis
        if not parent_root:
            parent_root = self.genesis_root

        # Create new identity
        identity = create_model_identity(
            model_id=model_id,
            parent_root=parent_root,
            **kwargs,
        )

        # Cache and save
        self._cache[identity.merkle_root] = identity
        self._save_identity(identity)

        return identity

    def get_by_root(self, merkle_root: str) -> Optional[ModelIdentity]:
        """Get a model identity by merkle root."""
        return self._cache.get(merkle_root)

    def list_all(self) -> List[ModelIdentity]:
        """List all registered models."""
        return list(self._cache.values())

    def list_by_base(self, base_model: str) -> List[ModelIdentity]:
        """List all variants of a base model."""
        return [i for i in self._cache.values() if i.base_model == base_model]

    def search(self, query: str) -> List[ModelIdentity]:
        """Search models by name."""
        query_lower = query.lower()
        return [
            i for i in self._cache.values()
            if query_lower in i.model_id.lower() or query_lower in i.base_model.lower()
        ]


# =============================================================================
# CLI
# =============================================================================

if __name__ == "__main__":
    # Test: create some model identities
    print("=== CASCADE Model Identity Layer ===\n")

    # Initialize registry
    registry = ModelRegistry()

    # Create some test identities
    test_models = [
        "meta-llama/Llama-3-8B",
        "TheBloke/Llama-3-8B-GGUF",
        "unsloth/Llama-3-8B-bnb-4bit",
        "anthropic/claude-3-opus",
        "openai/gpt-4",
    ]

    for model in test_models:
        identity = registry.get_or_create(model)
        print(f"Model: {identity.model_id}")
        print(f"  Base: {identity.base_model}")
        print(f"  Quant: {identity.variant.quantization}")
        print(f"  Format: {identity.variant.format}")
        print(f"  Merkle: {identity.merkle_root}")
        print(f"  Parent: {identity.parent_root}")
        print()

    print(f"Total models in registry: {len(registry.list_all())}")
cascade/ipld.py
ADDED
@@ -0,0 +1,379 @@
"""
CASCADE IPLD - InterPlanetary Linked Data Integration

Native IPLD encoding for provenance chains. Merkle roots become CIDs.
The lattice goes interplanetary.

CIDs (Content IDentifiers) are self-describing, content-addressed identifiers.
When we encode a chain as IPLD, its CID is derived from its content.
Anyone with the CID can fetch and verify.

Architecture:
    ProvenanceChain ──encode──► DAG-CBOR ──hash──► CID
                                                    │
                                    bafyreif...xyz (interplanetary address)
"""

import json
import hashlib
from typing import Dict, Any, Optional, List
from dataclasses import dataclass
from pathlib import Path

# IPLD encoding
import dag_cbor
from multiformats import CID, multihash

# CASCADE core
from cascade.core.provenance import ProvenanceChain, ProvenanceRecord


# =============================================================================
# IPLD ENCODING
# =============================================================================

def chain_to_ipld(chain: ProvenanceChain) -> Dict[str, Any]:
    """
    Convert a ProvenanceChain to IPLD-compatible format.

    IPLD format uses:
    - Lowercase keys
    - CID links for references
    - DAG-CBOR encoding
    """
    # Convert records to IPLD format
    records = {}
    for name, record in chain.records.items():
        records[name] = {
            "layer_name": record.layer_name,
            "layer_idx": record.layer_idx,
            "state_hash": record.state_hash,
            "parent_hashes": record.parent_hashes,
            "params_hash": record.params_hash,
            "shape": record.shape,
            "dtype": record.dtype,
            "stats": record.stats,
            "execution_order": record.execution_order,
            "timestamp": record.timestamp,
        }

    # Convert external_roots to CID links if they look like CIDs
    external_links = []
    for root in chain.external_roots:
        if root.startswith("bafy") or root.startswith("Qm"):
            # Already a CID - create a link
            external_links.append({"/": root})
        else:
            # Legacy merkle root - keep as a string
            external_links.append({"legacy_root": root})

    return {
        "session_id": chain.session_id,
        "model_id": chain.model_id,
        "model_hash": chain.model_hash,
        "input_hash": chain.input_hash,
        "output_hash": chain.output_hash,
        "records": records,
        "external_roots": chain.external_roots,  # Keep for verification
        "external_links": external_links,  # IPLD links
        "merkle_root": chain.merkle_root,
        "created_at": chain.created_at,
        "finalized": chain.finalized,
        "ipld_version": 1,
    }


def encode_to_dag_cbor(data: Dict[str, Any]) -> bytes:
    """Encode data as DAG-CBOR (canonical CBOR for IPLD)."""
    return dag_cbor.encode(data)


def decode_from_dag_cbor(raw: bytes) -> Dict[str, Any]:
    """Decode DAG-CBOR data."""
    return dag_cbor.decode(raw)


def compute_cid(data: bytes, codec: str = "dag-cbor") -> str:
    """
    Compute a CID (Content IDentifier) from data.

    CID = multicodec(codec) + multihash(sha256(data))

    Returns CIDv1 in base32 (bafyrei...).
    """
    # SHA-256 hash of the data
    digest = hashlib.sha256(data).digest()

    # Create multihash (0x12 = sha2-256, 0x20 = 32 bytes)
    mh = multihash.wrap(digest, "sha2-256")

    # Create CID v1 with the requested codec (dag-cbor = 0x71)
    cid = CID("base32", 1, codec, mh)

    return str(cid)


def chain_to_cid(chain: ProvenanceChain) -> tuple[str, bytes]:
    """
    Convert a chain to a CID.

    Returns:
        (cid_string, encoded_bytes)
    """
    ipld_data = chain_to_ipld(chain)
    encoded = encode_to_dag_cbor(ipld_data)
    cid = compute_cid(encoded)
    return cid, encoded
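# Content addressing in two assertions (a sketch; requires the dag_cbor
# package). DAG-CBOR's canonical map-key ordering means insertion order
# does not change the bytes, so it cannot change the CID either:
_a = encode_to_dag_cbor({"hello": "lattice", "n": 1})
_b = encode_to_dag_cbor({"n": 1, "hello": "lattice"})
assert compute_cid(_a) == compute_cid(_b)   # same content -> same address
_c = encode_to_dag_cbor({"hello": "lattice", "n": 2})
assert compute_cid(_a) != compute_cid(_c)   # any mutation -> new address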
# =============================================================================
# IPLD CHAIN - Native CID-based chain
# =============================================================================

@dataclass
class IPLDChain:
    """
    A provenance chain with native CID support.

    Instead of custom merkle roots, uses CIDs.
    Links to other chains via CID references.
    """
    chain: ProvenanceChain
    cid: Optional[str] = None
    encoded: Optional[bytes] = None

    @classmethod
    def from_chain(cls, chain: ProvenanceChain) -> 'IPLDChain':
        """Create an IPLD chain from a regular chain."""
        cid, encoded = chain_to_cid(chain)
        return cls(chain=chain, cid=cid, encoded=encoded)

    @classmethod
    def from_bytes(cls, data: bytes) -> 'IPLDChain':
        """Deserialize from DAG-CBOR bytes."""
        ipld_data = decode_from_dag_cbor(data)
        chain = ipld_to_chain(ipld_data)
        cid = compute_cid(data)
        return cls(chain=chain, cid=cid, encoded=data)

    def link_to(self, other: 'IPLDChain') -> None:
        """Link this chain to another via CID."""
        if other.cid is None:
            raise ValueError("Cannot link to chain without CID")
        self.chain.link_external(other.cid, source_id=other.chain.model_id)
        # Recompute our CID since the content changed
        self.cid, self.encoded = chain_to_cid(self.chain)

    def save(self, path: Path) -> None:
        """Save as a DAG-CBOR file."""
        if self.encoded is None:
            self.cid, self.encoded = chain_to_cid(self.chain)
        with open(path, 'wb') as f:
            f.write(self.encoded)

    @classmethod
    def load(cls, path: Path) -> 'IPLDChain':
        """Load from a DAG-CBOR file."""
        with open(path, 'rb') as f:
            data = f.read()
        return cls.from_bytes(data)

    def to_json(self) -> str:
        """Export as JSON (for human inspection)."""
        ipld_data = chain_to_ipld(self.chain)
        ipld_data["_cid"] = self.cid
        return json.dumps(ipld_data, indent=2, default=str)
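# A disk round trip is also an integrity check: the loaded bytes re-derive
# the same CID they were saved under. Sketch only; the chain below mirrors
# the kwargs ipld_to_chain passes, with illustrative field values:
_chain = ProvenanceChain(
    session_id="demo", model_id="demo-model", model_hash="deadbeef",
    input_hash="cafef00d", output_hash=None, external_roots=[],
    merkle_root=None, created_at=0.0, finalized=True,
)
_ipld = IPLDChain.from_chain(_chain)
_ipld.save(Path("chain.cbor"))
assert IPLDChain.load(Path("chain.cbor")).cid == _ipld.cid  # bytes unchanged -> same address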
def ipld_to_chain(ipld_data: Dict[str, Any]) -> ProvenanceChain:
    """Convert IPLD data back to a ProvenanceChain."""
    # Reconstruct records
    records = {}
    for name, rec_data in ipld_data.get("records", {}).items():
        records[name] = ProvenanceRecord(
            layer_name=rec_data["layer_name"],
            layer_idx=rec_data["layer_idx"],
            state_hash=rec_data["state_hash"],
            parent_hashes=rec_data["parent_hashes"],
            params_hash=rec_data.get("params_hash"),
            shape=rec_data.get("shape", []),
            dtype=rec_data.get("dtype", "float32"),
            stats=rec_data.get("stats", {}),
            execution_order=rec_data.get("execution_order", 0),
            timestamp=rec_data.get("timestamp", 0),
        )

    chain = ProvenanceChain(
        session_id=ipld_data["session_id"],
        model_id=ipld_data["model_id"],
        model_hash=ipld_data["model_hash"],
        input_hash=ipld_data["input_hash"],
        output_hash=ipld_data.get("output_hash"),
        external_roots=ipld_data.get("external_roots", []),
        merkle_root=ipld_data.get("merkle_root"),
        created_at=ipld_data.get("created_at", 0),
        finalized=ipld_data.get("finalized", False),
    )
    chain.records = records

    return chain


# =============================================================================
# IPFS PUBLISHING (requires a running IPFS daemon)
# =============================================================================

def publish_to_ipfs(chain: IPLDChain, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> str:
    """
    Publish a chain to the IPFS network.

    Requires an IPFS daemon running locally.
    Returns the CID (which should match our computed CID).

    Args:
        chain: IPLDChain to publish
        ipfs_api: IPFS API multiaddr

    Returns:
        CID from IPFS (for verification)
    """
    try:
        import ipfshttpclient
        client = ipfshttpclient.connect(ipfs_api)

        # Add the raw DAG-CBOR data
        result = client.dag.put(
            chain.encoded,
            store_codec="dag-cbor",
            input_codec="dag-cbor"
        )

        ipfs_cid = result["Cid"]["/"]

        # Verify CIDs match
        if ipfs_cid != chain.cid:
            print(f"[WARN] CID mismatch: computed={chain.cid}, ipfs={ipfs_cid}")

        return ipfs_cid

    except Exception as e:
        print(f"[ERROR] IPFS publish failed: {e}")
        print("        Make sure the IPFS daemon is running: ipfs daemon")
        raise


def fetch_from_ipfs(cid: str, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> IPLDChain:
    """
    Fetch a chain from the IPFS network by CID.

    Args:
        cid: Content identifier
        ipfs_api: IPFS API multiaddr

    Returns:
        IPLDChain
    """
    try:
        import ipfshttpclient
        client = ipfshttpclient.connect(ipfs_api)

        # Get the DAG node
        data = client.dag.get(cid)

        # Convert to a chain
        chain = ipld_to_chain(data)
        encoded = encode_to_dag_cbor(data)

        return IPLDChain(chain=chain, cid=cid, encoded=encoded)

    except Exception as e:
        print(f"[ERROR] IPFS fetch failed: {e}")
        raise


# =============================================================================
# GENESIS IN IPLD
# =============================================================================

def get_genesis_cid() -> tuple[str, IPLDChain]:
    """
    Get genesis as an IPLD chain with a CID.

    The genesis CID is deterministic - anyone computing it gets the same result.
    This is the interplanetary Schelling point.
    """
    from cascade.genesis import create_genesis

    genesis = create_genesis()
    ipld_genesis = IPLDChain.from_chain(genesis)

    return ipld_genesis.cid, ipld_genesis


# =============================================================================
# CLI
# =============================================================================

if __name__ == "__main__":
    print("=" * 60)
    print("CASCADE IPLD - InterPlanetary Linked Data")
    print("=" * 60)

    # Get genesis CID
    genesis_cid, genesis_ipld = get_genesis_cid()
    print(f"\nGenesis CID: {genesis_cid}")
    print(f"Genesis merkle_root: {genesis_ipld.chain.merkle_root}")

    # Load cascade_alpha and convert to IPLD
    alpha_path = Path("lattice/cascade_alpha.json")
    if alpha_path.exists():
        with open(alpha_path) as f:
            alpha_data = json.load(f)
        alpha_chain = ProvenanceChain.from_dict(alpha_data)
        alpha_ipld = IPLDChain.from_chain(alpha_chain)

        print(f"\ncascade_alpha CID: {alpha_ipld.cid}")
        print(f"cascade_alpha merkle_root: {alpha_chain.merkle_root}")

        # Save as DAG-CBOR
        out_dir = Path("lattice/ipld")
        out_dir.mkdir(exist_ok=True)

        genesis_ipld.save(out_dir / "genesis.cbor")
        alpha_ipld.save(out_dir / "cascade_alpha.cbor")

        # Also save JSON for inspection
        with open(out_dir / "genesis.ipld.json", 'w') as f:
            f.write(genesis_ipld.to_json())
        with open(out_dir / "cascade_alpha.ipld.json", 'w') as f:
            f.write(alpha_ipld.to_json())

        print(f"\nSaved to {out_dir}/")
        print("  - genesis.cbor")
        print("  - cascade_alpha.cbor")
        print("  - genesis.ipld.json")
        print("  - cascade_alpha.ipld.json")

    print("\n" + "=" * 60)
    print("INTERPLANETARY ADDRESSES")
    print("=" * 60)
    print(f"""
Genesis:       {genesis_cid}
cascade_alpha: {alpha_ipld.cid if alpha_path.exists() else 'N/A'}

These CIDs are content-addressed. Anyone with the CID can:
1. Fetch the data from IPFS (if pinned)
2. Verify the content matches the CID
3. Trust the chain without trusting the source

To publish to IPFS:
    ipfs daemon  # Start IPFS
    python -c "
from cascade.ipld import publish_to_ipfs, get_genesis_cid
_, genesis = get_genesis_cid()
cid = publish_to_ipfs(genesis)
print(f'Published: {{cid}}')
"
""")
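One consequence worth spelling out: point 2 above is mechanical. A fetched chain can be re-hashed locally and compared to the CID that was requested, so trust never depends on the gateway. A sketch, assuming the daemon is up (the CID is a placeholder):

requested = "bafyrei..."  # placeholder CID of a published chain
fetched = fetch_from_ipfs(requested)
# Re-derive the address from the bytes we actually received:
assert compute_cid(fetched.encoded) == requested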
cascade/listen.py
ADDED
@@ -0,0 +1,154 @@
"""
Cascade Passive Monitor.

Listens to stdin or follows a log file and observes events.

Usage:
    python -m cascade.listen                    # Listen to stdin
    python -m cascade.listen --follow app.log   # Follow a log file

This module:
1. Reads input from stdin or a log file
2. Pipes lines -> Cascade Adapter
3. Writes events to a tape file (JSONL) and a human log (Markdown)
4. Emits events to event_queue for external consumers

For visualization, point a consumer at the event_queue or load the tape file
into your preferred visualization tool.
"""

import sys
import argparse
import time
import json
from pathlib import Path
from queue import Queue

# Ensure package root is in path
sys.path.insert(0, str(Path(__file__).parent.parent))

from cascade import Monitor

# Shared event queue for external consumers (e.g., custom UIs)
event_queue: Queue = Queue()


def main():
    parser = argparse.ArgumentParser(description="Cascade Passive Monitor")
    parser.add_argument("--log-dir", default="./logs", help="Directory for logs")
    parser.add_argument("--follow", help="Log file to follow (tail -f style)")
    parser.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")
    args = parser.parse_args()

    # 0. Set up logs & baggies
    log_dir = Path(args.log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    baggies_dir = log_dir / "baggies"
    baggies_dir.mkdir(exist_ok=True)

    # Excrement management (archive old artifacts)
    follow_abs = Path(args.follow).absolute() if args.follow else None
    for f in log_dir.glob("*.*"):
        if f.is_file() and f.suffix in [".md", ".jsonl", ".log"] and "baggies" not in str(f):
            if follow_abs and f.absolute() == follow_abs:
                continue
            try:
                dest = baggies_dir / f.name
                if dest.exists():
                    dest = baggies_dir / f"{f.stem}_{int(time.time())}{f.suffix}"
                f.replace(dest)
            except Exception:
                pass
    print(f"[CASCADE] Logs archived to {baggies_dir}")

    session_id = int(time.time())
    tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
    human_path = log_dir / f"cascade_log_{session_id}.md"

    tape_file = open(tape_path, "w", encoding="utf-8")
    human_file = open(human_path, "w", encoding="utf-8")

    # Init log
    human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
    human_file.write(f"**Mode:** PASSIVE {'FOLLOWER' if args.follow else 'LISTENER'}\n")
    human_file.write(f"**Target:** `{args.follow or 'STDIN'}`\n---\n\n")
    human_file.flush()

    print("=" * 60)
    print("CASCADE // LISTENER")
    print(f"Monitoring: {args.follow if args.follow else 'Standard Input'}")
    print(f"Tape: {tape_path.absolute()}")
    print(f"Baggies: {baggies_dir.absolute()}")
    print("=" * 60)

    monitor = Monitor("symbiont_passive")

    def process_line(line):
        line = line.strip()
        if not line:
            return
        event = monitor.observe(line)
        payload = {
            "event": {
                "event_id": event.event_id,
                "timestamp": event.timestamp,
                "component": event.component,
                "event_type": event.event_type,
                "data": event.data,
                "raw": line,  # Include the original line for drill-down
            },
            "metrics": monitor.metrics.summary(),
            "triage": monitor.metrics.triage(),
        }
        event_queue.put(payload)
        tape_file.write(json.dumps(payload) + "\n")
        tape_file.flush()

        # Narrative
        t_str = time.strftime('%H:%M:%S', time.localtime(event.timestamp))
        icon = {"error": "🔴", "warning": "⚠️", "state_change": "🔄"}.get(event.event_type, "ℹ️")
        if "loss" in str(event.data):
            icon = "📉"
        human_file.write(f"### {icon} {t_str} // {event.event_type.upper()}\n")
        human_file.write(f"Event observed in **{event.component}**.\n")
        if event.data:
            human_file.write("```yaml\n")
            for k, v in event.data.items():
                human_file.write(f"{k}: {v}\n")
            human_file.write("```\n")
        human_file.write("\n")
        human_file.flush()

        # Mirror to console (unless quiet)
        if not args.quiet:
            sys.stdout.write(f"[SIGHT] {line[:80]}...\n")
            sys.stdout.flush()

    try:
        if args.follow:
            print(f"[CASCADE] Waiting for stream: {args.follow}")
            f_path = Path(args.follow)
            if not f_path.exists():
                f_path.touch()
            with open(f_path, "r", encoding="utf-8", errors="replace") as f:
                print("[CASCADE] Scanning for events...")
                while True:
                    line = f.readline()
                    if not line:
                        time.sleep(0.1)
                        continue
                    process_line(line)
        else:
            print("[CASCADE] Reading from stdin (Ctrl+C to stop)...")
            for line in sys.stdin:
                process_line(line)
    except KeyboardInterrupt:
        print("\n[CASCADE] Detaching...")
    finally:
        tape_file.close()
        human_file.close()
        print(f"[CASCADE] Session complete. Tape: {tape_path}")


if __name__ == "__main__":
    main()
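Because each payload is a single JSON object per line, a session can be replayed straight off the tape. A minimal consumer sketch (the filename is illustrative):

import json
from pathlib import Path

for line in Path("logs/cascade_tape_1700000000.jsonl").read_text(encoding="utf-8").splitlines():
    payload = json.loads(line)
    ev = payload["event"]
    print(ev["timestamp"], ev["component"], ev["event_type"], ev["raw"][:60])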
cascade/logging/__init__.py
ADDED
@@ -0,0 +1,86 @@
"""
CASCADE Logging System
Industry-standard dual-layer logging for mathematical precision and human clarity.

Two modes:
1. Kleene Mode: mathematical fixed-point logs for debugging and verification
2. Interpretive Mode: human-readable causation stories for operators

Use them together for complete system observability.
"""

from .kleene_logger import (
    KleeneLogger,
    LogLevel,
    get_kleene_logger,
    log_fixed_point,
    log_iterations
)

from .interpretive_logger import (
    InterpretiveLogger,
    ImpactLevel,
    get_interpretive_logger,
    translate_kleene_to_interpretive
)

from .log_manager import (
    LogMode,
    LogConfig,
    CascadeLogManager,
    init_logging,
    get_log_manager,
    log
)


def init_cascade_logging(component: str, system: str):
    """Initialize both logging layers for a component."""
    kleene = get_kleene_logger(component)
    interpretive = get_interpretive_logger(system)

    # Bridge automatic translation
    def bridge_log(entry):
        translate_kleene_to_interpretive(entry, interpretive)

    kleene._emit_to_container = lambda entry: (
        print(kleene._format_container(entry)),
        bridge_log(entry)
    )

    return kleene, interpretive


# Convenience for quick setup
def setup_logging(component: str, system: str = "CASCADE"):
    """Quick setup for both loggers."""
    return init_cascade_logging(component, system)


# Export main interfaces
__all__ = [
    # Kleene (mathematical)
    'KleeneLogger',
    'LogLevel',
    'get_kleene_logger',
    'log_fixed_point',
    'log_iterations',

    # Interpretive (human)
    'InterpretiveLogger',
    'ImpactLevel',
    'get_interpretive_logger',
    'translate_kleene_to_interpretive',

    # Log Manager (orchestrator)
    'LogMode',
    'LogConfig',
    'CascadeLogManager',
    'init_logging',
    'get_log_manager',
    'log',

    # Unified
    'init_cascade_logging',
    'setup_logging'
]
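Typical wiring for one component, using only the helpers exported above (the component and system names are illustrative; the log call signatures match those used in color_example.py below):

from cascade.logging import setup_logging, LogLevel, ImpactLevel

kleene, interpretive = setup_logging("DataProcessor", system="Data Pipeline")

kleene.log(LogLevel.INFO, "pipeline_start", state_before={"stage": 0})
interpretive.log(ImpactLevel.LOW, "DataProcessor", "Pipeline started",
                 context="Beginning ingest",
                 consequence="Kleene entries will be mirrored as stories")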
cascade/logging/color_example.py
ADDED
@@ -0,0 +1,107 @@
"""
CASCADE Color Logging Example
Shows how to integrate beautiful colored logs throughout your system.
"""

from .kleene_logger import get_kleene_logger, LogLevel
from .interpretive_logger import get_interpretive_logger, ImpactLevel

def example_data_processing():
    """Example: Data processing with beautiful logs"""
    kleene = get_kleene_logger("DataProcessor")
    interpretive = get_interpretive_logger("Data Pipeline")

    # Start processing
    kleene.log(LogLevel.INFO, "load_dataset_start",
               state_before={"dataset": "smollm3-blueprint.pdf"})

    interpretive.log(ImpactLevel.LOW, "DataLoader", "Loading dataset",
                     context="Reading PDF file for analysis",
                     consequence="Will extract text and metadata",
                     metrics={"file_size": "1.0MB", "type": "PDF"})

    # Processing steps
    kleene.log(LogLevel.DEBUG, "extract_text",
               state_before={"page": 1},
               state_after={"pages_processed": 15})

    # Fixed point reached
    kleene.log(LogLevel.INFO, "processing_complete",
               state_after={"records": 500, "clean": True},
               fixed_point=True,
               iterations=3)

    interpretive.log(ImpactLevel.MEDIUM, "DataProcessor", "Processing complete",
                     context="Successfully extracted and cleaned data",
                     consequence="Ready for forensics analysis",
                     metrics={"records": 500, "pages": 15, "errors": 0})

def example_model_observation():
    """Example: Model observation with beautiful logs"""
    kleene = get_kleene_logger("ModelObserver")
    interpretive = get_interpretive_logger("Model Observatory")

    # Model loading
    kleene.log(LogLevel.INFO, "model_load_start",
               state_before={"model": "mistralai/Mixtral-8x22B-Instruct-v0.1"})

    interpretive.log(ImpactLevel.MEDIUM, "ModelLoader", "Loading Mixtral",
                     context="Loading 8x22B MoE model for inference",
                     consequence="Will consume significant VRAM",
                     metrics={"params": "141B", "active": "39B", "device": "cuda"})

    # Observation
    kleene.log(LogLevel.INFO, "observation_start",
               state_before={"layers": 0, "hash": "initial"})

    # Fixed point achieved
    kleene.log(LogLevel.INFO, "observation_fixed_point",
               state_after={"layers": 64, "merkle": "abc123..."},
               fixed_point=True,
               iterations=64)

    interpretive.log(ImpactLevel.LOW, "CASCADE", "Model observed",
                     context="Cryptographic proof generated for model execution",
                     consequence="Merkle root provides verifiable audit trail",
                     metrics={"model": "Mixtral", "layers": 64, "merkle": "abc123..."})

def example_error_handling():
    """Example: Error handling with colored logs"""
    kleene = get_kleene_logger("ErrorHandler")
    interpretive = get_interpretive_logger("System Monitor")

    # Error detected
    kleene.log(LogLevel.ERROR, "memory_exhaustion",
               state_before={"memory": "15.8/16GB", "operation": "inference"},
               fixed_point=False)

    interpretive.log(ImpactLevel.HIGH, "MemoryManager", "Out of memory",
                     context="GPU memory exhausted during model inference",
                     consequence="Inference failed, system degraded",
                     metrics={"used": "15.8GB", "total": "16GB", "available": "200MB"},
                     recommendation="Enable gradient checkpointing or use smaller batch size")

    # Recovery
    kleene.log(LogLevel.WARNING, "fallback_activated",
               state_after={"mode": "cpu_fallback", "batch_size": 1})

    interpretive.log(ImpactLevel.MEDIUM, "FallbackHandler", "CPU fallback activated",
                     context="Switched to CPU inference due to memory constraints",
                     consequence="Performance degraded but functionality preserved",
                     metrics={"device": "cpu", "batch_size": 1, "slowdown": "10x"})

# Run all examples
if __name__ == "__main__":
    print("\n🎨 CASCADE Color Logging Examples\n")
    print("="*60)

    example_data_processing()
    print("\n" + "="*60)

    example_model_observation()
    print("\n" + "="*60)

    example_error_handling()
    print("\n" + "="*60)

    print("\n✨ Beautiful logs are ready for production!")
cascade/logging/integrate.py
ADDED
@@ -0,0 +1,275 @@
"""
CASCADE Logging Integration
Plug-and-play logging for existing CASCADE components.

Retrofits existing systems with world-class logging without major surgery.
"""

import functools
import time
from typing import Any, Callable, Dict, Optional

from .log_manager import get_log_manager, LogLevel, ImpactLevel


def log_component(component_name: str, system: str = "CASCADE"):
    """Decorator to add logging to any class or function"""
    def decorator(target):
        if isinstance(target, type):
            # Decorating a class
            return _log_class(target, component_name, system)
        else:
            # Decorating a function
            return _log_function(target, component_name, system)
    return decorator


def _log_class(cls, component_name: str, system: str):
    """Add logging to all methods of a class"""
    manager = get_log_manager()
    manager.register_component(component_name, system)

    for attr_name in dir(cls):
        if not attr_name.startswith('_'):
            attr = getattr(cls, attr_name)
            if callable(attr):
                setattr(cls, attr_name, _log_method(attr, component_name))

    return cls


def _log_function(func, component_name: str, system: str):
    """Add logging to a function"""
    manager = get_log_manager()
    manager.register_component(component_name, system)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()

        # Log start
        get_log_manager().log_operation(
            component_name, f"{func.__name__}_start",
            level=LogLevel.DEBUG,
            impact=ImpactLevel.TRACE,
            details={
                "context": f"Starting {func.__name__}",
                "consequence": f"Will execute {func.__name__}",
                "metrics": {"args": len(args), "kwargs": len(kwargs)}
            }
        )

        try:
            result = func(*args, **kwargs)

            # Log success
            duration = time.time() - start_time
            get_log_manager().log_operation(
                component_name, f"{func.__name__}_complete",
                level=LogLevel.INFO,
                impact=ImpactLevel.LOW,
                details={
                    "context": f"Completed {func.__name__}",
                    "consequence": "Result ready",
                    "metrics": {"duration_seconds": duration}
                }
            )

            return result

        except Exception as e:
            # Log error
            get_log_manager().log_operation(
                component_name, f"{func.__name__}_error",
                level=LogLevel.ERROR,
                impact=ImpactLevel.HIGH,
                details={
                    "context": f"Failed in {func.__name__}",
                    "consequence": "Operation failed",
                    "metrics": {"error": str(e)}
                }
            )
            raise

    return wrapper


def _log_method(method, component_name: str):
    """Add logging to a method"""
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        start_time = time.time()

        try:
            result = method(self, *args, **kwargs)

            # Log successful method call
            get_log_manager().log_operation(
                component_name, f"{method.__name__}",
                level=LogLevel.DEBUG,
                impact=ImpactLevel.TRACE,
                details={
                    "metrics": {"duration": time.time() - start_time}
                }
            )

            return result

        except Exception as e:
            # Log method error
            get_log_manager().log_operation(
                component_name, f"{method.__name__}_error",
                level=LogLevel.ERROR,
                impact=ImpactLevel.HIGH,
                details={
                    "context": f"Method {method.__name__} failed",
                    "metrics": {"error": str(e)}
                }
            )
            raise

    return wrapper


def log_kleene_iterations(operation_name: str):
    """Decorator specifically for Kleene fixed point iterations"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            get_log_manager().log_operation(
                "KleeneEngine", f"{operation_name}_start",
                level=LogLevel.INFO,
                impact=ImpactLevel.MEDIUM,
                details={
                    "context": f"Starting fixed point iteration for {operation_name}",
                    "consequence": "Will iterate until convergence"
                }
            )

            start_time = time.time()
            result = func(*args, **kwargs)

            # Extract iteration info from result if available
            iterations = getattr(result, 'iterations', 0)
            converged = getattr(result, 'converged', True)

            get_log_manager().log_operation(
                "KleeneEngine", f"{operation_name}_complete",
                level=LogLevel.INFO,
                impact=ImpactLevel.LOW if converged else ImpactLevel.HIGH,
                details={
                    "context": f"Fixed point iteration {'converged' if converged else 'diverged'}",
                    "consequence": f"Processed {iterations} iterations",
                    "metrics": {
                        "iterations": iterations,
                        "converged": converged,
                        "duration": time.time() - start_time
                    },
                    "fixed_point": converged
                }
            )

            return result
        return wrapper
    return decorator


def log_model_observation(model_id: str):
    """Decorator for model observation functions"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            get_log_manager().log_operation(
                "ModelObserver", f"observe_{model_id}",
                level=LogLevel.INFO,
                impact=ImpactLevel.MEDIUM,
                details={
                    "context": f"Starting observation of model {model_id}",
                    "consequence": "Will generate cryptographic proof"
                }
            )

            result = func(*args, **kwargs)

            # Extract observation details
            layers = getattr(result, 'layer_count', 0)
            merkle = getattr(result, 'merkle_root', 'unknown')

            get_log_manager().log_operation(
                "ModelObserver", f"observed_{model_id}",
                level=LogLevel.INFO,
                impact=ImpactLevel.LOW,
                details={
                    "context": "Model observation complete",
                    "consequence": "Cryptographic proof generated",
                    "metrics": {
                        "model": model_id,
                        "layers": layers,
                        "merkle": merkle[:16] + "..."
                    },
                    "fixed_point": True
                }
            )

            return result
        return wrapper
    return decorator


def log_data_processing(dataset_name: str):
    """Decorator for data processing functions"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            get_log_manager().log_operation(
                "DataProcessor", f"process_{dataset_name}",
                level=LogLevel.INFO,
                impact=ImpactLevel.MEDIUM,
                details={
                    "context": f"Processing dataset {dataset_name}",
                    "consequence": "Will extract and analyze data"
                }
            )

            result = func(*args, **kwargs)

            # Extract processing stats
            records = getattr(result, 'record_count', 0)
            operations = getattr(result, 'operations', [])

            get_log_manager().log_operation(
                "DataProcessor", f"processed_{dataset_name}",
                level=LogLevel.INFO,
                impact=ImpactLevel.LOW,
                details={
                    "context": "Dataset processing complete",
                    "consequence": f"Processed {records} records",
                    "metrics": {
                        "dataset": dataset_name,
                        "records": records,
                        "operations": len(operations)
                    }
                }
            )

            return result
        return wrapper
    return decorator


# Quick integration function
def integrate_cascade_logging():
    """One-call integration for entire CASCADE system"""
    # Imported for their import-time side effects; none of these names
    # are referenced directly below.
    from ..system.observer import SystemObserver
    from ..core.provenance import ProvenanceTracker
    from data_unity import run_kleene_iteration

    # Register main components
    manager = get_log_manager()
    manager.register_component("SystemObserver", "System Observatory")
    manager.register_component("ProvenanceTracker", "Model Observatory")
    manager.register_component("DataUnity", "Data Unity")
    manager.register_component("KleeneEngine", "NEXUS")

    print("✅ CASCADE logging integrated across all components")
    return manager
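For orientation, a minimal sketch of how the decorators above retrofit an existing class. The `Resolver` class and its method are hypothetical:

from cascade.logging.integrate import log_component

@log_component("Resolver", system="Data Unity")
class Resolver:
    def resolve(self, records):
        # Every public method is wrapped by _log_method, so this call
        # emits a DEBUG/TRACE entry with its duration automatically.
        return len(records)

Resolver().resolve([1, 2, 3])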
cascade/logging/interpretive_logger.py
ADDED
@@ -0,0 +1,276 @@
"""
CASCADE Interpretive Logger
Human-readable causation flow logging for operators and stakeholders.

Translates mathematical events into stories humans can understand and act upon.
"""

import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
from datetime import datetime


class ImpactLevel(Enum):
    """Business impact levels"""
    CRITICAL = "🔴 CRITICAL"  # Service down, data loss
    HIGH = "🟠 HIGH"          # Degraded performance, user impact
    MEDIUM = "🟡 MEDIUM"      # Issues detected, monitoring needed
    LOW = "🟢 LOW"            # Informational, routine operations
    TRACE = "🔵 TRACE"        # Detailed flow, debugging


@dataclass
class InterpretiveEntry:
    """A human-readable system event"""
    timestamp: float = field(default_factory=time.time)
    impact: ImpactLevel = ImpactLevel.LOW
    system: str = ""       # High-level system name
    component: str = ""    # Specific component
    event: str = ""        # What happened
    context: str = ""      # Why it matters
    consequence: str = ""  # What happens next
    metrics: Dict[str, Any] = field(default_factory=dict)
    recommendation: Optional[str] = None

    def format_display(self) -> str:
        """Format for beautiful terminal output with colors"""
        time_str = datetime.fromtimestamp(self.timestamp).strftime("%H:%M:%S")

        # ANSI color codes, keyed by the enum *name* (the enum value
        # carries an emoji prefix, so it cannot be used as the lookup key)
        level_colors = {
            "CRITICAL": ("\033[91m", "🔴"),  # Bright red
            "HIGH": ("\033[31m", "🟠"),      # Red
            "MEDIUM": ("\033[33m", "🟡"),    # Yellow
            "LOW": ("\033[32m", "🟢"),       # Green
            "TRACE": ("\033[90m", "🔵"),     # Gray
        }
        reset = "\033[0m"
        bold = "\033[1m"
        dim = "\033[2m"
        cyan = "\033[36m"
        magenta = "\033[35m"

        color, icon = level_colors.get(self.impact.name, (reset, "⚪"))

        lines = [
            f"\n{color}{bold}{icon} {self.impact.name} [{time_str}] {self.system}{reset}",
            f"├─ {cyan}Component:{reset} {self.component}",
            f"├─ {magenta}Event:{reset} {self.event}",
            f"├─ {dim}Context:{reset} {self.context}",
            f"├─ {dim}Consequence:{reset} {self.consequence}",
        ]

        if self.metrics:
            lines.append(f"├─ {cyan}Metrics:{reset} {self._format_metrics()}")

        if self.recommendation:
            lines.append(f"└─ {bold}Recommendation:{reset} {self.recommendation}")
        else:
            lines.append(f"└─ {dim}Status: Monitoring{reset}")

        return "\n".join(lines)

    def _format_metrics(self) -> str:
        """Format metrics nicely"""
        return ", ".join(f"{k}={v}" for k, v in self.metrics.items())


class InterpretiveLogger:
    """Human-readable system storytelling"""

    def __init__(self, system_name: str):
        self.system = system_name
        self.entries: List[InterpretiveEntry] = []
        self.start_time = time.time()

    def log(self, impact: ImpactLevel, component: str, event: str,
            context: str, consequence: str,
            metrics: Optional[Dict] = None,
            recommendation: Optional[str] = None):
        """Record a system event"""

        entry = InterpretiveEntry(
            impact=impact,
            system=self.system,
            component=component,
            event=event,
            context=context,
            consequence=consequence,
            metrics=metrics or {},
            recommendation=recommendation
        )

        self.entries.append(entry)
        self._emit_to_container(entry)

    def _emit_to_container(self, entry: InterpretiveEntry):
        """Emit beautiful formatted log to container"""
        print(entry.format_display())

    # Convenience methods for common events
    def service_start(self, component: str, port: Optional[int] = None):
        """Service started successfully"""
        self.log(
            ImpactLevel.LOW,
            component,
            "Service started",
            "Component initialized and ready for requests",
            f"Accepting connections on port {port}" if port else "Ready for operations",
            metrics={"port": port} if port else {},
            recommendation="Monitor for healthy connections"
        )

    def service_error(self, component: str, error: str, impact: ImpactLevel = ImpactLevel.HIGH):
        """Service encountered error"""
        self.log(
            impact,
            component,
            "Service error",
            "Component failed to process request",
            "May affect system reliability",
            metrics={"error": error},
            recommendation="Check component logs and restart if needed"
        )

    def data_processing(self, dataset: str, records: int, operations: List[str]):
        """Data processing pipeline"""
        self.log(
            ImpactLevel.MEDIUM,
            "DataProcessor",
            f"Processing {dataset}",
            "Executing pipeline operations on dataset",
            f"Will process {records:,} records through {len(operations)} stages",
            metrics={
                "dataset": dataset,
                "records": records,
                "operations": len(operations)
            },
            recommendation="Monitor processing progress and error rates"
        )

    def model_loaded(self, model_id: str, size_gb: float, device: str):
        """AI model loaded into memory"""
        self.log(
            ImpactLevel.MEDIUM,
            "ModelLoader",
            f"Model {model_id} loaded",
            "Neural network loaded and ready for inference",
            f"Consuming {size_gb:.1f}GB VRAM on {device}",
            metrics={
                "model": model_id,
                "size_gb": size_gb,
                "device": device
            },
            recommendation="Monitor GPU memory usage during inference"
        )

    def security_event(self, component: str, event: str, details: str):
        """Security-related event"""
        self.log(
            ImpactLevel.CRITICAL,
            component,
            f"Security: {event}",
            "Security system detected potential threat",
            "Immediate investigation required",
            metrics={"details": details},
            recommendation="Review security logs and consider blocking source"
        )

    def performance_warning(self, component: str, metric: str, value: float, threshold: float):
        """Performance threshold exceeded"""
        self.log(
            ImpactLevel.HIGH,
            component,
            f"Performance warning: {metric}",
            "Component performance degraded",
            "May impact user experience if continues",
            metrics={metric: value, "threshold": threshold},
            recommendation=f"Optimize {metric} or scale resources"
        )

    def cascade_observation(self, model: str, layers: int, merkle_root: str):
        """CASCADE observed model execution"""
        self.log(
            ImpactLevel.LOW,  # ImpactLevel has no INFO member; LOW is the informational level
            "CASCADE",
            "Model observation complete",
            "Cryptographic proof generated for model execution",
            "Merkle root provides verifiable audit trail",
            metrics={
                "model": model,
                "layers": layers,
                "merkle": merkle_root[:16] + "..."
            },
            recommendation="Store attestation for permanent records"
        )

    def fixed_point_convergence(self, operation: str, iterations: int, entities: int):
        """Mathematical fixed point reached"""
        self.log(
            ImpactLevel.LOW,  # ImpactLevel has no INFO member; LOW is the informational level
            "KleeneEngine",
            "Fixed point convergence",
            f"{operation} completed after {iterations} iterations",
            f"Resolved relationships for {entities} entities",
            metrics={
                "operation": operation,
                "iterations": iterations,
                "entities": entities
            },
            recommendation="Review convergence quality metrics"
        )


# Global interpretive loggers
_interpretive_loggers: Dict[str, InterpretiveLogger] = {}


def get_interpretive_logger(system: str) -> InterpretiveLogger:
    """Get or create interpretive logger for system"""
    if system not in _interpretive_loggers:
        _interpretive_loggers[system] = InterpretiveLogger(system)
    return _interpretive_loggers[system]


# Bridge function to translate Kleene logs to interpretive
def translate_kleene_to_interpretive(kleene_entry, interpretive_logger):
    """Translate mathematical log to human story"""

    # Map Kleene levels to impact levels
    impact_map = {
        "CRITICAL": ImpactLevel.CRITICAL,
        "ERROR": ImpactLevel.HIGH,
        "WARNING": ImpactLevel.MEDIUM,
        "INFO": ImpactLevel.LOW,
        "DEBUG": ImpactLevel.TRACE,
        "TRACE": ImpactLevel.TRACE
    }

    # Create human-readable context
    if kleene_entry.fixed_point_reached:
        event = "Mathematical convergence achieved"
        context = f"Operation {kleene_entry.operation} reached stable state"
        consequence = "System can proceed with verified result"
    else:
        event = f"State transition in {kleene_entry.operation}"
        context = "Component processing through iterations"
        consequence = "Continuing toward fixed point"

    interpretive_logger.log(
        impact_map.get(kleene_entry.level.value, ImpactLevel.LOW),
        kleene_entry.component,
        event,
        context,
        consequence,
        metrics={
            "iterations": kleene_entry.iteration_count,
            "hash": kleene_entry.hash_value
        }
    )
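Roughly, format_display renders an entry as a small tree. The memory-exhaustion event from color_example.py above comes out like this (ANSI colors omitted, timestamp illustrative):

🟠 HIGH [14:02:31] System Monitor
├─ Component: MemoryManager
├─ Event: Out of memory
├─ Context: GPU memory exhausted during model inference
├─ Consequence: Inference failed, system degraded
├─ Metrics: used=15.8GB, total=16GB, available=200MB
└─ Recommendation: Enable gradient checkpointing or use smaller batch size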
cascade/logging/kleene_logger.py
ADDED
@@ -0,0 +1,219 @@
"""
CASCADE Kleene Fixed Point Logger
Industry-standard mathematical logging for debugging and verification.

Each log entry is a fixed point observation - hashable, verifiable, complete.
"""

import functools
import hashlib
import json
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
from contextlib import contextmanager


class LogLevel(Enum):
    """Mathematical significance levels"""
    CRITICAL = "CRITICAL"  # System-breaking fixed point failure
    ERROR = "ERROR"        # Fixed point not reached
    WARNING = "WARNING"    # Unexpected state transition
    INFO = "INFO"          # Fixed point achieved
    DEBUG = "DEBUG"        # State transition details
    TRACE = "TRACE"        # Every computation step


@dataclass
class KleeneLogEntry:
    """A single fixed point observation"""
    timestamp: float = field(default_factory=time.time)
    level: LogLevel = LogLevel.INFO
    component: str = ""
    operation: str = ""
    state_before: Optional[Dict] = None
    state_after: Optional[Dict] = None
    fixed_point_reached: bool = False
    iteration_count: int = 0
    hash_value: str = field(init=False)

    def __post_init__(self):
        # Create content hash for verifiability
        content = {
            "timestamp": self.timestamp,
            "component": self.component,
            "operation": self.operation,
            "state_before": self.state_before,
            "state_after": self.state_after,
            "iteration": self.iteration_count
        }
        self.hash_value = hashlib.sha256(
            json.dumps(content, sort_keys=True).encode()
        ).hexdigest()[:16]

    def to_dict(self) -> Dict[str, Any]:
        return {
            "ts": self.timestamp,
            "lvl": self.level.value,
            "comp": self.component,
            "op": self.operation,
            "before": self.state_before,
            "after": self.state_after,
            "fixed": self.fixed_point_reached,
            "iter": self.iteration_count,
            "hash": self.hash_value
        }


class KleeneLogger:
    """Mathematical logging for fixed point systems"""

    def __init__(self, component_name: str):
        self.component = component_name
        self.entries: List[KleeneLogEntry] = []
        self.session_start = time.time()
        self.operation_count = 0

    def log(self, level: LogLevel, operation: str,
            state_before: Optional[Dict] = None,
            state_after: Optional[Dict] = None,
            fixed_point: bool = False,
            iterations: int = 0):
        """Record a state transition"""

        entry = KleeneLogEntry(
            level=level,
            component=self.component,
            operation=operation,
            state_before=state_before,
            state_after=state_after,
            fixed_point_reached=fixed_point,
            iteration_count=iterations
        )

        self.entries.append(entry)
        self._emit_to_container(entry)

    def _emit_to_container(self, entry: KleeneLogEntry):
        """Emit structured log to container with colors"""
        # ANSI color codes
        colors = {
            "CRITICAL": "\033[91m",  # Bright red
            "ERROR": "\033[31m",     # Red
            "WARNING": "\033[33m",   # Yellow
            "INFO": "\033[32m",      # Green
            "DEBUG": "\033[36m",     # Cyan
            "TRACE": "\033[90m",     # Gray
            "RESET": "\033[0m",      # Reset
            "BOLD": "\033[1m",       # Bold
            "DIM": "\033[2m",        # Dim
        }

        color = colors.get(entry.level.value, colors["RESET"])
        reset = colors["RESET"]
        dim = colors["DIM"]

        # Format with colors
        print(f"{color}[KLEENE]{reset} {color}{entry.level.value:8}{reset} | "
              f"{dim}{entry.component:20}{reset} | "
              f"{entry.operation:30} | "
              f"Iter:{entry.iteration_count:3} | "
              f"Fixed:{'Y' if entry.fixed_point_reached else 'N':1} | "
              f"{dim}Hash:{entry.hash_value}{reset}")

    @contextmanager
    def observe_operation(self, operation: str, initial_state: Dict):
        """Context manager for observing operations"""
        self.operation_count += 1
        iterations = 0

        try:
            self.log(LogLevel.DEBUG, f"{operation}_start",
                     state_before=initial_state)

            # Yield control back to operation
            yield self

            # Operation completed successfully
            self.log(LogLevel.INFO, f"{operation}_complete",
                     fixed_point=True, iterations=iterations)

        except Exception as e:
            self.log(LogLevel.ERROR, f"{operation}_failed",
                     state_after={"error": str(e)})
            raise

    def fixed_point(self, operation: str, final_state: Dict, iterations: int):
        """Log successful fixed point convergence"""
        self.log(LogLevel.INFO, f"{operation}_fixed_point",
                 state_after=final_state,
                 fixed_point=True,
                 iterations=iterations)

    def divergence(self, operation: str, state: Dict):
        """Log when system diverges (no fixed point)"""
        self.log(LogLevel.WARNING, f"{operation}_divergence",
                 state_after=state,
                 fixed_point=False)

    def critical_failure(self, operation: str, error_state: Dict):
        """Log critical system failure"""
        self.log(LogLevel.CRITICAL, f"{operation}_critical",
                 state_after=error_state,
                 fixed_point=False)

    def get_session_hash(self) -> str:
        """Get hash of entire session for verification"""
        content = {
            "component": self.component,
            "start": self.session_start,
            "operations": self.operation_count,
            "entries": [e.hash_value for e in self.entries]
        }
        return hashlib.sha256(json.dumps(content).encode()).hexdigest()


# Global loggers for major components
_loggers: Dict[str, KleeneLogger] = {}


def get_kleene_logger(component: str) -> KleeneLogger:
    """Get or create logger for component"""
    if component not in _loggers:
        _loggers[component] = KleeneLogger(component)
    return _loggers[component]


# Convenience decorators
def log_fixed_point(operation: str):
    """Decorator to automatically log fixed point operations"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            logger = get_kleene_logger(func.__module__)
            start_state = {"args": str(args), "kwargs": str(kwargs)}

            try:
                result = func(*args, **kwargs)
                logger.fixed_point(operation, {"result": str(result)}, 1)
                return result
            except Exception as e:
                logger.critical_failure(operation, {"error": str(e)})
                raise
        return wrapper
    return decorator


def log_iterations(operation: str):
    """Decorator for operations that iterate to fixed points"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            logger = get_kleene_logger(func.__module__)

            # Iteration count is read from the result if it exposes one
            result = func(*args, **kwargs)
            iterations = getattr(result, 'iterations', 1)

            logger.fixed_point(operation, {"converged": True}, iterations)
            return result
        return wrapper
    return decorator
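A brief usage sketch of the context manager and session hash above; the component name and state are illustrative:

from cascade.logging.kleene_logger import get_kleene_logger

logger = get_kleene_logger("EntityResolver")

# Logs resolve_start on entry and resolve_complete (fixed_point=True) on
# exit; an exception inside the block is logged as resolve_failed and re-raised.
with logger.observe_operation("resolve", initial_state={"pending": 42}):
    pass  # run the iterative work here

# Every entry hash is folded into a single session digest for verification.
print(logger.get_session_hash())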
cascade/logging/log_manager.py
ADDED
@@ -0,0 +1,266 @@
"""
CASCADE Log Manager
Orchestrates the tsunami of data into ordered causation troops.

Manages log levels, routing, and the beautiful display of system truth.
"""

import os
import sys
import time
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from enum import Enum

from .kleene_logger import KleeneLogger, LogLevel
from .interpretive_logger import InterpretiveLogger, ImpactLevel


class LogMode(Enum):
    """The two modes of logging excellence"""
    KLEENE = "kleene"              # Mathematical precision
    INTERPRETIVE = "interpretive"  # Human stories
    DUAL = "dual"                  # Both simultaneously


@dataclass
class LogConfig:
    """Configuration for logging behavior"""
    mode: LogMode = LogMode.DUAL
    min_level_kleene: LogLevel = LogLevel.INFO
    min_level_interpretive: ImpactLevel = ImpactLevel.LOW
    show_metrics: bool = True
    show_timestamps: bool = True
    color_output: bool = True
    file_output: bool = False
    max_file_size_mb: int = 100


class CascadeLogManager:
    """The conductor of your causation orchestra"""

    def __init__(self, config: Optional[LogConfig] = None):
        self.config = config or LogConfig()
        self.kleene_loggers: Dict[str, KleeneLogger] = {}
        self.interpretive_loggers: Dict[str, InterpretiveLogger] = {}
        self.start_time = time.time()
        self.operation_count = 0

        # Initialize display
        self._setup_display()

    def _setup_display(self):
        """Setup beautiful terminal output"""
        if self.config.color_output:
            # Ensure UTF-8 output so emoji and ANSI sequences render cleanly
            sys.stdout.reconfigure(encoding='utf-8')

        # Print header
        self._print_header()

    def _print_header(self):
        """Print beautiful cascade header with colors"""
        # ANSI color codes
        wave = "\033[94m"    # Bright blue
        bridge = "\033[96m"  # Cyan
        bold = "\033[1m"
        dim = "\033[2m"
        reset = "\033[0m"
        green = "\033[32m"

        print(f"\n{bold}{'='*80}{reset}")
        print(f"{wave}🌊{reset} {bold}CASCADE // TRUTH INFRASTRUCTURE{reset} {bridge}🧠{reset}")
        print(f"{bold}{'='*80}{reset}")
        print(f"{bold}Mode:{reset} {green}{self.config.mode.value.upper()}{reset}")
        print(f"{bold}Started:{reset} {dim}{time.strftime('%Y-%m-%d %H:%M:%S')}{reset}")
        print(f"{bold}{'='*80}{reset}\n")

    def register_component(self, component: str, system: str = "CASCADE"):
        """Register a component for logging"""
        if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
            self.kleene_loggers[component] = KleeneLogger(component)

        if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
            self.interpretive_loggers[system] = InterpretiveLogger(system)

    def log_operation(self, component: str, operation: str,
                      level: LogLevel = LogLevel.INFO,
                      impact: ImpactLevel = ImpactLevel.LOW,
                      details: Optional[Dict] = None):
        """Log an operation across all active loggers"""
        self.operation_count += 1
        details = details or {}

        if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
            if component in self.kleene_loggers:
                self.kleene_loggers[component].log(
                    level, operation,
                    state_before=details.get("before"),
                    state_after=details.get("after"),
                    fixed_point=details.get("fixed_point", False),
                    iterations=details.get("iterations", 0)
                )

        if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
            # Find interpretive logger for component
            system = details.get("system", "CASCADE")
            if system in self.interpretive_loggers:
                self.interpretive_loggers[system].log(
                    impact, component, operation,
                    context=details.get("context", ""),
                    consequence=details.get("consequence", ""),
                    metrics=details.get("metrics", {}),
                    recommendation=details.get("recommendation")
                )

    def get_session_stats(self) -> Dict[str, Any]:
        """Get beautiful session statistics"""
        total_kleene = sum(len(logger.entries) for logger in self.kleene_loggers.values())
        total_interpretive = sum(len(logger.entries) for logger in self.interpretive_loggers.values())

        return {
            "uptime_seconds": time.time() - self.start_time,
            "operations": self.operation_count,
            "kleene_entries": total_kleene,
            "interpretive_entries": total_interpretive,
            "active_components": len(self.kleene_loggers),
            "active_systems": len(self.interpretive_loggers)
        }

    def print_summary(self):
        """Print beautiful session summary with colors"""
        stats = self.get_session_stats()

        # ANSI color codes
        bold = "\033[1m"
        dim = "\033[2m"
        reset = "\033[0m"
        cyan = "\033[36m"
        green = "\033[32m"
        yellow = "\033[33m"
        blue = "\033[34m"
        magenta = "\033[35m"

        print(f"\n{bold}{'='*80}{reset}")
        print(f"{cyan}📊 CASCADE SESSION SUMMARY{reset}")
        print(f"{bold}{'='*80}{reset}")
        print(f"{bold}Uptime:{reset} {stats['uptime_seconds']:.1f} seconds")
        print(f"{bold}Operations:{reset} {green}{stats['operations']:,}{reset}")
        print(f"{bold}Kleene Entries:{reset} {yellow}{stats['kleene_entries']:,}{reset}")
        print(f"{bold}Interpretive Entries:{reset} {blue}{stats['interpretive_entries']:,}{reset}")
        print(f"{bold}Active Components:{reset} {magenta}{stats['active_components']}{reset}")
        print(f"{bold}Active Systems:{reset} {magenta}{stats['active_systems']}{reset}")

        if stats['kleene_entries'] > 0:
            # Get session hash from first logger
            first_logger = next(iter(self.kleene_loggers.values()))
            print(f"{bold}Session Hash:{reset} {dim}{first_logger.get_session_hash()}{reset}")

        print(f"{bold}{'='*80}{reset}")

    def set_mode(self, mode: LogMode):
        """Switch logging mode dynamically"""
        old_mode = self.config.mode
        self.config.mode = mode

        print(f"\n🔄 Logging mode changed: {old_mode.value} → {mode.value}")

    def enable_file_logging(self, filepath: str):
        """Enable logging to file"""
        self.config.file_output = True
        # TODO: Implement file logging
        print(f"📁 File logging enabled: {filepath}")


# Global log manager instance
_log_manager: Optional[CascadeLogManager] = None


def init_logging(config: Optional[LogConfig] = None) -> CascadeLogManager:
    """Initialize the global CASCADE logging system"""
    global _log_manager
    _log_manager = CascadeLogManager(config)
    return _log_manager


def get_log_manager() -> CascadeLogManager:
    """Get the global log manager"""
    global _log_manager
    if _log_manager is None:
        _log_manager = CascadeLogManager()
    return _log_manager


def log(component: str, operation: str, context: str = "", consequence: str = "",
        metrics: Optional[Dict[str, Any]] = None,
        level: LogLevel = LogLevel.INFO,
        impact: ImpactLevel = ImpactLevel.LOW, **kwargs):
    """Quick log operation - convenience function"""
    # Accept either an ImpactLevel or its name (e.g. "LOW") for convenience
    if isinstance(impact, str):
        impact = ImpactLevel[impact]

    manager = get_log_manager()
    manager.log_operation(component, operation,
                          level=level,
                          impact=impact,
                          details={
                              "context": context,
                              "consequence": consequence,
                              "metrics": metrics or {},
                              **kwargs
                          })


def log_fixed_point(component: str, operation: str, iterations: int, **kwargs):
    """Log successful fixed point"""
    log(component, operation,
        level=LogLevel.INFO,
        impact=ImpactLevel.LOW,
        fixed_point=True,
        iterations=iterations,
        **kwargs)


def log_error(component: str, operation: str, error: str, **kwargs):
    """Log error condition"""
    log(component, f"{operation}_error",
        level=LogLevel.ERROR,
        impact=ImpactLevel.HIGH,
        context=f"Operation failed: {error}",
        consequence="System may be degraded",
        metrics={"error": error},
        **kwargs)


def log_performance(component: str, metric: str, value: float, threshold: float):
    """Log performance warning"""
    log(component, f"performance_{metric}",
        level=LogLevel.WARNING,
        impact=ImpactLevel.MEDIUM,
        context=f"Performance metric {metric} exceeded threshold",
        consequence="May impact system performance",
        metrics={metric: value, "threshold": threshold},
        recommendation=f"Optimize {metric} or scale resources")
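A minimal sketch of driving the manager directly, using the fixed convenience helpers above. The operation name "transitive_closure" is illustrative; the component/system names match those registered elsewhere in the package:

from cascade.logging.log_manager import (
    init_logging, LogConfig, LogMode, log_fixed_point
)

manager = init_logging(LogConfig(mode=LogMode.DUAL))
manager.register_component("KleeneEngine", "NEXUS")

# Emits a Kleene line for KleeneEngine; the interpretive side only fires
# if a "system" key in details routes to a registered system (default "CASCADE").
log_fixed_point("KleeneEngine", "transitive_closure", iterations=12)

manager.print_summary()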
cascade/observation.py
ADDED
@@ -0,0 +1,397 @@
"""
CASCADE Observation Manager

Connects the detective tabs (Observatory, Unity, System) to the lattice.

Flow:
1. User runs observation through any tab
2. Observation creates provenance chain
3. Chain links to model identity (for model obs) or genesis (for data/system)
4. Chain saved to lattice
5. Optionally pinned to IPFS

This is the integration layer between UI and lattice.
"""

import json
import time
from pathlib import Path
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, field

from cascade.core.provenance import ProvenanceChain
from cascade.identity import ModelRegistry, ModelIdentity, create_model_identity
from cascade.genesis import get_genesis_root, link_to_genesis


@dataclass
class Observation:
    """
    A single observation record in the lattice.

    Can be:
    - Model observation (inference through Observatory)
    - Data observation (entity resolution through Unity)
    - System observation (log analysis through System tab)
    """
    observation_id: str
    observation_type: str  # "model", "data", "system"

    # What was observed
    source_id: str    # Model ID, dataset ID, or log source
    source_root: str  # Merkle root of source identity

    # The observation data
    chain: ProvenanceChain
    merkle_root: str

    # Metadata
    user_hash: Optional[str] = None  # Anonymous user identifier
    created_at: float = field(default_factory=time.time)

    # IPFS
    cid: Optional[str] = None


class ObservationManager:
    """
    Manages observations across all CASCADE tabs.

    Responsibilities:
    - Link observations to model identities or genesis
    - Save observations to lattice
    - Track observation history
    - Provide stats for lattice gateway
    """

    def __init__(self, lattice_dir: Optional[Path] = None):
        self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
        self.observations_dir = self.lattice_dir / "observations"
        self.observations_dir.mkdir(parents=True, exist_ok=True)

        # Model registry for linking model observations
        self.model_registry = ModelRegistry(self.lattice_dir)

        # Genesis root
        self.genesis_root = get_genesis_root()

        # In-memory observation index
        self._observations: Dict[str, Observation] = {}
        self._load_index()

    def _load_index(self):
        """Load observation index from disk."""
        index_file = self.lattice_dir / "observation_index.json"
        if index_file.exists():
            try:
                index = json.loads(index_file.read_text())
                # Just load metadata, not full chains
                for obs_id, meta in index.items():
                    self._observations[obs_id] = meta
            except Exception:
                # Corrupt or unreadable index; start empty rather than crash
                pass

    def _save_index(self):
        """Save observation index to disk."""
        index_file = self.lattice_dir / "observation_index.json"
        # Save lightweight index
        index = {}
        for obs_id, obs in self._observations.items():
            if isinstance(obs, Observation):
                index[obs_id] = {
                    "observation_id": obs.observation_id,
                    "observation_type": obs.observation_type,
                    "source_id": obs.source_id,
                    "source_root": obs.source_root,
                    "merkle_root": obs.merkle_root,
                    "created_at": obs.created_at,
                    "cid": obs.cid,
                }
            else:
                index[obs_id] = obs
        index_file.write_text(json.dumps(index, indent=2))

    def observe_model(
        self,
        model_id: str,
        chain: ProvenanceChain,
        user_hash: Optional[str] = None,
        **model_kwargs,
    ) -> Observation:
        """
        Record a model observation.

        Args:
            model_id: HuggingFace model ID or local path
            chain: Provenance chain from Observatory
            user_hash: Anonymous user identifier
            **model_kwargs: Additional model info (parameters, etc.)

        Returns:
            Observation linked to model identity
        """
        # Get or create model identity
        identity = self.model_registry.get_or_create(model_id, **model_kwargs)

        # Link chain to model identity
        if not chain.external_roots:
            chain.external_roots = []
        if identity.merkle_root not in chain.external_roots:
            chain.external_roots.append(identity.merkle_root)

        # Finalize chain if not already
        if not chain.finalized:
            chain.finalize()

        # Create observation record
        obs_id = f"model_{chain.merkle_root}"
        observation = Observation(
            observation_id=obs_id,
            observation_type="model",
            source_id=model_id,
            source_root=identity.merkle_root,
            chain=chain,
            merkle_root=chain.merkle_root,
            user_hash=user_hash,
        )

        # Save chain to disk
        self._save_observation(observation)

        return observation

    def observe_data(
        self,
        dataset_a: str,
        dataset_b: str,
        chain: ProvenanceChain,
        user_hash: Optional[str] = None,
    ) -> Observation:
        """
        Record a data unity observation.

        Links directly to genesis (data doesn't have model identity).
        """
        # Link to genesis
        if not chain.external_roots:
            chain.external_roots = []
        if self.genesis_root not in chain.external_roots:
            chain.external_roots.append(self.genesis_root)

        if not chain.finalized:
            chain.finalize()

        # Create observation
        source_id = f"{dataset_a}::{dataset_b}"
        obs_id = f"data_{chain.merkle_root}"

        observation = Observation(
            observation_id=obs_id,
            observation_type="data",
            source_id=source_id,
            source_root=self.genesis_root,
            chain=chain,
            merkle_root=chain.merkle_root,
            user_hash=user_hash,
        )

        self._save_observation(observation)
        return observation

    def observe_system(
        self,
        source_name: str,
        chain: ProvenanceChain,
        user_hash: Optional[str] = None,
    ) -> Observation:
        """
        Record a system log observation.

        Links directly to genesis.
        """
        # Link to genesis
        if not chain.external_roots:
            chain.external_roots = []
        if self.genesis_root not in chain.external_roots:
            chain.external_roots.append(self.genesis_root)

        if not chain.finalized:
            chain.finalize()

        obs_id = f"system_{chain.merkle_root}"

        observation = Observation(
|
| 224 |
+
observation_id=obs_id,
|
| 225 |
+
observation_type="system",
|
| 226 |
+
source_id=source_name,
|
| 227 |
+
source_root=self.genesis_root,
|
| 228 |
+
chain=chain,
|
| 229 |
+
merkle_root=chain.merkle_root,
|
| 230 |
+
user_hash=user_hash,
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
self._save_observation(observation)
|
| 234 |
+
return observation
|
| 235 |
+
|
| 236 |
+
def _save_observation(self, observation: Observation):
|
| 237 |
+
"""Save observation to disk."""
|
| 238 |
+
# Save to index
|
| 239 |
+
self._observations[observation.observation_id] = observation
|
| 240 |
+
self._save_index()
|
| 241 |
+
|
| 242 |
+
# Save full chain
|
| 243 |
+
chain_file = self.observations_dir / f"{observation.merkle_root}.json"
|
| 244 |
+
chain_data = {
|
| 245 |
+
"observation_id": observation.observation_id,
|
| 246 |
+
"observation_type": observation.observation_type,
|
| 247 |
+
"source_id": observation.source_id,
|
| 248 |
+
"source_root": observation.source_root,
|
| 249 |
+
"user_hash": observation.user_hash,
|
| 250 |
+
"created_at": observation.created_at,
|
| 251 |
+
"cid": observation.cid,
|
| 252 |
+
"chain": observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else str(observation.chain),
|
| 253 |
+
}
|
| 254 |
+
chain_file.write_text(json.dumps(chain_data, indent=2, default=str))
|
| 255 |
+
|
| 256 |
+
def pin_observation(self, observation: Observation) -> Optional[str]:
|
| 257 |
+
"""
|
| 258 |
+
Pin observation to IPFS.
|
| 259 |
+
|
| 260 |
+
Returns CID if successful.
|
| 261 |
+
"""
|
| 262 |
+
try:
|
| 263 |
+
from cascade.ipld import chain_to_cid, encode_to_dag_cbor
|
| 264 |
+
from cascade.web3_pin import pin_file
|
| 265 |
+
|
| 266 |
+
# Convert to IPLD format
|
| 267 |
+
chain_data = observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else {}
|
| 268 |
+
cbor_data = encode_to_dag_cbor(chain_data)
|
| 269 |
+
|
| 270 |
+
# Save CBOR
|
| 271 |
+
cbor_file = self.observations_dir / f"{observation.merkle_root}.cbor"
|
| 272 |
+
cbor_file.write_bytes(cbor_data)
|
| 273 |
+
|
| 274 |
+
# Compute CID
|
| 275 |
+
cid = chain_to_cid(chain_data)
|
| 276 |
+
observation.cid = cid
|
| 277 |
+
|
| 278 |
+
# Update index
|
| 279 |
+
self._save_observation(observation)
|
| 280 |
+
|
| 281 |
+
return cid
|
| 282 |
+
except Exception as e:
|
| 283 |
+
print(f"Failed to pin observation: {e}")
|
| 284 |
+
return None
|
| 285 |
+
|
| 286 |
+
def get_observation(self, merkle_root: str) -> Optional[Observation]:
|
| 287 |
+
"""Get observation by merkle root."""
|
| 288 |
+
for obs in self._observations.values():
|
| 289 |
+
if isinstance(obs, Observation) and obs.merkle_root == merkle_root:
|
| 290 |
+
return obs
|
| 291 |
+
elif isinstance(obs, dict) and obs.get("merkle_root") == merkle_root:
|
| 292 |
+
return obs
|
| 293 |
+
return None
|
| 294 |
+
|
| 295 |
+
def list_observations(
|
| 296 |
+
self,
|
| 297 |
+
observation_type: Optional[str] = None,
|
| 298 |
+
source_id: Optional[str] = None,
|
| 299 |
+
limit: int = 100,
|
| 300 |
+
) -> List[Dict[str, Any]]:
|
| 301 |
+
"""List observations with optional filters."""
|
| 302 |
+
results = []
|
| 303 |
+
|
| 304 |
+
for obs in self._observations.values():
|
| 305 |
+
if isinstance(obs, Observation):
|
| 306 |
+
obs_dict = {
|
| 307 |
+
"observation_id": obs.observation_id,
|
| 308 |
+
"observation_type": obs.observation_type,
|
| 309 |
+
"source_id": obs.source_id,
|
| 310 |
+
"merkle_root": obs.merkle_root,
|
| 311 |
+
"created_at": obs.created_at,
|
| 312 |
+
"cid": obs.cid,
|
| 313 |
+
}
|
| 314 |
+
else:
|
| 315 |
+
obs_dict = obs
|
| 316 |
+
|
| 317 |
+
# Apply filters
|
| 318 |
+
if observation_type and obs_dict.get("observation_type") != observation_type:
|
| 319 |
+
continue
|
| 320 |
+
if source_id and source_id not in obs_dict.get("source_id", ""):
|
| 321 |
+
continue
|
| 322 |
+
|
| 323 |
+
results.append(obs_dict)
|
| 324 |
+
|
| 325 |
+
# Sort by time, newest first
|
| 326 |
+
results.sort(key=lambda x: x.get("created_at", 0), reverse=True)
|
| 327 |
+
|
| 328 |
+
return results[:limit]
|
| 329 |
+
|
| 330 |
+
def get_stats(self) -> Dict[str, Any]:
|
| 331 |
+
"""Get lattice statistics."""
|
| 332 |
+
obs_list = list(self._observations.values())
|
| 333 |
+
|
| 334 |
+
model_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "model") or (isinstance(o, dict) and o.get("observation_type") == "model")]
|
| 335 |
+
data_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "data") or (isinstance(o, dict) and o.get("observation_type") == "data")]
|
| 336 |
+
system_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "system") or (isinstance(o, dict) and o.get("observation_type") == "system")]
|
| 337 |
+
|
| 338 |
+
# Count unique models
|
| 339 |
+
model_ids = set()
|
| 340 |
+
for o in model_obs:
|
| 341 |
+
if isinstance(o, Observation):
|
| 342 |
+
model_ids.add(o.source_id)
|
| 343 |
+
elif isinstance(o, dict):
|
| 344 |
+
model_ids.add(o.get("source_id", ""))
|
| 345 |
+
|
| 346 |
+
return {
|
| 347 |
+
"total_observations": len(obs_list),
|
| 348 |
+
"model_observations": len(model_obs),
|
| 349 |
+
"data_observations": len(data_obs),
|
| 350 |
+
"system_observations": len(system_obs),
|
| 351 |
+
"unique_models": len(model_ids),
|
| 352 |
+
"registered_models": len(self.model_registry.list_all()),
|
| 353 |
+
"genesis_root": self.genesis_root,
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
def get_model_observations(self, model_id: str) -> List[Dict[str, Any]]:
|
| 357 |
+
"""Get all observations for a specific model."""
|
| 358 |
+
return self.list_observations(observation_type="model", source_id=model_id)
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
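Before the module-level singleton below, a minimal usage sketch of the class above. Everything in it is illustrative: the bare ProvenanceChain() constructor, the cascade.observation import path, and the "gpt2" model ID are assumptions, since chains normally arrive pre-built from the Observatory.

from cascade.core.provenance import ProvenanceChain
from cascade.observation import ObservationManager  # assumed module path

manager = ObservationManager()
chain = ProvenanceChain()  # assumption: normally produced by the Observatory

obs = manager.observe_model("gpt2", chain, user_hash="anon-1234")
print(obs.observation_id)                      # -> "model_<merkle_root>"
print(manager.get_model_observations("gpt2"))  # newest-first metadata dicts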
# =============================================================================
# SINGLETON INSTANCE
# =============================================================================

_manager: Optional[ObservationManager] = None


def get_observation_manager() -> ObservationManager:
    """Get the singleton observation manager."""
    global _manager
    if _manager is None:
        _manager = ObservationManager()
    return _manager


# =============================================================================
# CLI
# =============================================================================

if __name__ == "__main__":
    print("=== CASCADE Observation Manager ===\n")

    manager = get_observation_manager()

    # Show stats
    stats = manager.get_stats()
    print(f"Genesis: {stats['genesis_root']}")
    print(f"Registered Models: {stats['registered_models']}")
    print(f"Total Observations: {stats['total_observations']}")
    print(f"  - Model: {stats['model_observations']}")
    print(f"  - Data: {stats['data_observations']}")
    print(f"  - System: {stats['system_observations']}")
    print(f"Unique Models Observed: {stats['unique_models']}")

    # List recent observations
    print("\nRecent Observations:")
    for obs in manager.list_observations(limit=5):
        print(f"  [{obs['observation_type']}] {obs['source_id'][:40]}... → {obs['merkle_root']}")
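On disk, this leaves two artifacts: observation_index.json (lightweight metadata written by _save_index) and one <merkle_root>.json per chain under observations/. Continuing the hypothetical sketch above (manager and obs carry over from it), pinning an observation and inspecting the index looks roughly like:

import json

cid = manager.pin_observation(obs)  # None if cascade.ipld / cascade.web3_pin are unavailable
print(f"Pinned: {cid}")

index = json.loads((manager.lattice_dir / "observation_index.json").read_text())
entry = index[obs.observation_id]
print(entry["observation_type"], entry["merkle_root"], entry["cid"])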
cascade/observe.py
ADDED
@@ -0,0 +1,231 @@
"""
Cascade Observer CLI.

Wraps a target process and observes its output.

Usage:
    python -m cascade.observe --cmd "python path/to/train.py --args..."

This module:
1. Wraps the target process
2. Pipes stdout/stderr -> Cascade Adapter
3. Writes events to a tape file (JSONL) and a human log (Markdown)
4. Emits events to event_queue for external consumers

For visualization, point a consumer at the event_queue or load the tape file
into your preferred visualization tool.
"""

import sys
import subprocess
import argparse
import time
import json
import shlex
import shutil
from pathlib import Path
from queue import Queue

# Ensure the package root is on the path
sys.path.insert(0, str(Path(__file__).parent.parent))

from cascade import Monitor

# Shared event queue for external consumers (e.g., custom UIs)
event_queue: Queue = Queue()


def scoop_the_poop(log_dir: Path):
    """
    Baggies system - archive old logs on startup.
    Keeps the logs folder clean. Old sessions go to baggies/.
    """
    baggies_dir = log_dir / "baggies"
    baggies_dir.mkdir(parents=True, exist_ok=True)

    # Find all old log files (not the current session)
    tape_files = list(log_dir.glob("cascade_tape_*.jsonl"))
    log_files = list(log_dir.glob("cascade_log_*.md"))

    moved_count = 0
    for f in tape_files + log_files:
        if f.parent == log_dir:  # Only files in root logs/, not baggies/
            dest = baggies_dir / f.name
            try:
                shutil.move(str(f), str(dest))
                moved_count += 1
            except Exception as e:
                print(f"[CASCADE] Could not archive {f.name}: {e}")

    if moved_count > 0:
        print(f"[CASCADE] 🧹 Scooped {moved_count} old logs → baggies/")
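Because event_queue is module-level, a consumer in the same process (a dashboard thread, say) can drain it while main() runs. A minimal consumer sketch, illustrative rather than part of the module:

import threading
from queue import Empty

def consume_events():
    """Drain event_queue in the background; print a one-line summary per event."""
    while True:
        try:
            payload = event_queue.get(timeout=1.0)
        except Empty:
            continue  # no event yet; keep polling
        evt = payload["event"]
        print(f"{evt['event_type']} from {evt['component']} | triage: {payload['triage']}")

threading.Thread(target=consume_events, daemon=True).start()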
def main():
    parser = argparse.ArgumentParser(
        prog="cascade",
        description="🌊 Cascade - Real-Time Neural Network Observability",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  cascade --cmd "python train.py"
  cascade --cmd "python train.py --epochs=10"
  cascade --cmd "python train.py" --cwd /path/to/project

Events are written to tape files in the log directory.
"""
    )

    # Support both "cascade --cmd" and "cascade observe --cmd"
    subparsers = parser.add_subparsers(dest="command")
    observe_parser = subparsers.add_parser("observe", help="Observe a training process")

    # Add args to both the main parser and the observe subparser
    for p in [parser, observe_parser]:
        p.add_argument("--cmd", required=True, help="Command to run the target process")
        p.add_argument("--cwd", default=None, help="Working directory for the target (absolute path)")
        p.add_argument("--log-dir", default="./logs", help="Directory for session tapes")
        p.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")

    args = parser.parse_args()

    # Resolve the working directory to an absolute path
    if args.cwd:
        work_dir = Path(args.cwd).resolve()
    else:
        work_dir = Path.cwd()

    # 0. Set up the session tape (the product of each run)
    log_dir = Path(args.log_dir).resolve()
    log_dir.mkdir(parents=True, exist_ok=True)

    # 🧹 Scoop old logs before starting the new session
    scoop_the_poop(log_dir)

    session_id = int(time.time())

    # 1. Machine tape (JSONL)
    tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
    tape_file = open(tape_path, "a", encoding="utf-8")

    # 2. Human log (Markdown)
    human_path = log_dir / f"cascade_log_{session_id}.md"
    human_file = open(human_path, "a", encoding="utf-8")

    # Header for the human log
    human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
    human_file.write(f"**Target:** `{args.cmd}`\n")
    human_file.write(f"**Date:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    human_file.write("---\n\n")
    human_file.flush()

    print("=" * 60)
    print("CASCADE // OBSERVER")
    print(f"Target: {args.cmd}")
    print(f"Tape: {tape_path.absolute()}")
    print(f"Log: {human_path.absolute()}")
    print("=" * 60)

    # Init the Monitor
    monitor = Monitor("symbiont_alpha")

    def write_human_entry(evt):
        """Convert an event into an articulate log entry."""
        t_str = time.strftime('%H:%M:%S', time.localtime(evt.timestamp))

        # Narrative construction based on event type
        if evt.event_type == "error":
            icon = "🔴"
            narrative = f"CRITICAL FAILURE in **{evt.component}**."
        elif evt.event_type == "warning":
            icon = "⚠️"
            narrative = f"Warning signal detected from **{evt.component}**."
        elif evt.event_type == "state_change":
            icon = "🔄"
            narrative = f"State transition observed in **{evt.component}**."
        elif "loss" in str(evt.data):
            icon = "📉"
            narrative = f"Optimization step completed by **{evt.component}**."
        else:
            icon = "ℹ️"
            narrative = f"Standard event recorded from **{evt.component}**."

        # Write a readable block
        human_file.write(f"### {icon} {t_str} // {evt.event_type.upper()}\n")
        human_file.write(f"{narrative}\n")
        if evt.data:
            # Format the data as a clean YAML-style block
            human_file.write("```yaml\n")
            for k, v in evt.data.items():
                human_file.write(f"{k}: {v}\n")
            human_file.write("```\n")
        human_file.write("\n")
        human_file.flush()

    # Launch the target
    try:
        # Split the command string for subprocess
        cmd_parts = shlex.split(args.cmd)

        process = subprocess.Popen(
            cmd_parts,
            cwd=work_dir,  # use the resolved working directory
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )

        print("[CASCADE] Linked to target. Recording to tape & log...")

        for line in process.stdout:
            line = line.strip()
            if not line:
                continue

            # Feed the Adapter
            event = monitor.observe(line)

            # Build the payload with the full picture: metrics + triage + raw
            metrics_summary = monitor.metrics.summary()
            triage_status = monitor.metrics.triage()

            payload = {
                "event": {
                    "event_id": event.event_id,
                    "timestamp": event.timestamp,
                    "component": event.component,
                    "event_type": event.event_type,
                    "data": event.data,
                    "raw": line,  # Include the original line for drill-down
                },
                "metrics": metrics_summary,
                "triage": triage_status,
            }

            # Emit to the queue for external consumers
            event_queue.put(payload)

            # Write to the tape (machine)
            tape_file.write(json.dumps(payload) + "\n")
            tape_file.flush()

            # Write to the log (human)
            write_human_entry(event)

            # Echo to the console (unless quiet)
            if not args.quiet:
                print(f"[RAW] {line}")

    except KeyboardInterrupt:
        print("\n[CASCADE] Detaching...")
    except Exception as e:
        print(f"[CASCADE] Error: {e}")
    finally:
        tape_file.close()
        human_file.close()
        if 'process' in locals() and process.poll() is None:
            process.terminate()
        print(f"[CASCADE] Session complete. Tape: {tape_path}")


if __name__ == "__main__":
    main()
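Each tape line is a self-contained JSON payload (event + metrics + triage), so replaying a session afterwards takes only a few lines. A sketch, with a hypothetical tape filename:

import json
from pathlib import Path

tape = Path("./logs/cascade_tape_1700000000.jsonl")  # hypothetical session id
for raw_line in tape.read_text(encoding="utf-8").splitlines():
    payload = json.loads(raw_line)
    evt = payload["event"]
    # timestamp / event_type / raw are set by the payload built in main()
    print(f"{evt['timestamp']:.0f} [{evt['event_type']}] {evt['raw']}")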
cascade/patches/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""
CASCADE Patches - Auto-intercept LLM provider libraries.

Each patch module wraps a provider's API to automatically emit receipts.
"""

from .openai_patch import patch_openai
from .anthropic_patch import patch_anthropic
from .huggingface_patch import patch_huggingface
from .ollama_patch import patch_ollama
from .litellm_patch import patch_litellm

__all__ = [
    "patch_openai",
    "patch_anthropic",
    "patch_huggingface",
    "patch_ollama",
    "patch_litellm",
]
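The intended pattern, judging from these exports, is to apply a patch before using the provider's client. A hedged sketch; the zero-argument call is an assumption about the patch functions' signatures, so check the individual patch modules:

from cascade.patches import patch_openai

# Assumption: patch functions take no arguments and monkey-patch the
# provider library in place so later calls emit CASCADE receipts.
patch_openai()

# ...then create the OpenAI client and make calls as usual; per the
# package docstring above, they are now intercepted and recorded.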