tostido committed
Commit 77bcbf1 · 0 Parent(s)

Initial commit - cascade-lattice 0.5.4

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/workflows/publish.yml +31 -0
  2. .gitignore +35 -0
  3. LICENSE +21 -0
  4. README.md +70 -0
  5. cascade/__init__.py +290 -0
  6. cascade/analysis/__init__.py +37 -0
  7. cascade/analysis/metrics.py +1168 -0
  8. cascade/analysis/tracer.py +487 -0
  9. cascade/bridge.py +265 -0
  10. cascade/cli_main.py +851 -0
  11. cascade/core/__init__.py +13 -0
  12. cascade/core/adapter.py +470 -0
  13. cascade/core/event.py +177 -0
  14. cascade/core/graph.py +292 -0
  15. cascade/core/provenance.py +601 -0
  16. cascade/core/web3_bridge.py +846 -0
  17. cascade/data/__init__.py +112 -0
  18. cascade/data/croissant.py +289 -0
  19. cascade/data/entities.py +349 -0
  20. cascade/data/hub.py +533 -0
  21. cascade/data/license.py +635 -0
  22. cascade/data/live.py +844 -0
  23. cascade/data/observer.py +666 -0
  24. cascade/data/pii.py +748 -0
  25. cascade/data/provenance.py +503 -0
  26. cascade/data/schema.py +417 -0
  27. cascade/demo.py +174 -0
  28. cascade/demo_sdk.py +114 -0
  29. cascade/export/__init__.py +23 -0
  30. cascade/export/tableau_export.py +598 -0
  31. cascade/forensics/__init__.py +53 -0
  32. cascade/forensics/analyzer.py +464 -0
  33. cascade/forensics/artifacts.py +1063 -0
  34. cascade/forensics/fingerprints.py +328 -0
  35. cascade/genesis.py +200 -0
  36. cascade/hold/__init__.py +82 -0
  37. cascade/hold/primitives.py +673 -0
  38. cascade/hold/session.py +707 -0
  39. cascade/identity.py +715 -0
  40. cascade/ipld.py +379 -0
  41. cascade/listen.py +154 -0
  42. cascade/logging/__init__.py +86 -0
  43. cascade/logging/color_example.py +107 -0
  44. cascade/logging/integrate.py +275 -0
  45. cascade/logging/interpretive_logger.py +276 -0
  46. cascade/logging/kleene_logger.py +219 -0
  47. cascade/logging/log_manager.py +266 -0
  48. cascade/observation.py +397 -0
  49. cascade/observe.py +231 -0
  50. cascade/patches/__init__.py +19 -0
.github/workflows/publish.yml ADDED
@@ -0,0 +1,31 @@
+ name: Publish to PyPI
+
+ on:
+   push:
+     tags:
+       - 'v*'
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+     permissions:
+       id-token: write  # For trusted publishing (optional)
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.10'
+
+       - name: Install build tools
+         run: pip install build
+
+       - name: Build package
+         run: python -m build
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,35 @@
+ # Byte-compiled
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ dist/
+ build/
+ *.egg-info/
+ *.egg
+ *.whl
+
+ # Virtual environments
+ venv/
+ .venv/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Testing
+ .pytest_cache/
+ .coverage
+ htmlcov/
+
+ # Logs
+ *.log
+ logs/
+
+ # OS
+ .DS_Store
+ Thumbs.db
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024-2026 Jeff Towers
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,70 @@
+ # Cascade Lattice
+
+ **Universal AI provenance layer — cryptographic receipts for every call, with HOLD inference halt protocol**
+
+ [![PyPI version](https://badge.fury.io/py/cascade-lattice.svg)](https://pypi.org/project/cascade-lattice/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## Installation
+
+ ```bash
+ pip install cascade-lattice
+ ```
+
+ With optional dependencies:
+ ```bash
+ pip install cascade-lattice[torch]  # PyTorch integration
+ pip install cascade-lattice[all]    # All integrations
+ ```
+
+ ## Quick Start
+
+ ```python
+ from cascade import Monitor
+
+ # Create a monitor for your component
+ monitor = Monitor("training_loop")
+
+ # Observe events (parses logs, extracts metrics)
+ event = monitor.observe("Epoch 5: loss=0.0234, accuracy=0.9812")
+ print(event.data)  # {'loss': 0.0234, 'accuracy': 0.9812, ...}
+
+ # Get metrics summary
+ print(monitor.metrics.summary())
+ ```
+
+ ## Features
+
+ - **Universal Observation** — Monitor training, inference, system logs, API calls
+ - **Cryptographic Receipts** — Every observation gets a verifiable hash chain
+ - **HOLD Protocol** — Inference halt capability for safety-critical applications
+ - **Tape Storage** — JSONL event streams for replay and analysis
+ - **Provider Patches** — Drop-in monitoring for OpenAI, Anthropic, LiteLLM, Ollama
+
+ ## CLI Usage
+
+ ```bash
+ cascade --help              # Show all commands
+ cascade stats               # Lattice statistics
+ cascade list -n 20          # Recent observations
+ cascade watch               # Live observation feed
+ cascade fingerprint model/  # Fingerprint a model
+ cascade pii scan.log        # Scan for PII
+ ```
+
+ ## Tape Utilities
+
+ ```python
+ from cascade.viz import load_tape_file, find_latest_tape, list_tape_files
+
+ # Find and load tape files
+ latest = find_latest_tape("./logs")
+ events = load_tape_file(latest)
+
+ for event in events:
+     print(event['event']['event_type'], event['event']['data'])
+ ```
+
+ ## License
+
+ MIT
cascade/__init__.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
3
+ ║ ║
4
+ ║ ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗ ║
5
+ ║ ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝ ║
6
+ ║ ██║ ███████║███████╗██║ ███████║██║ ██║█████╗ ║
7
+ ║ ██║ ██╔══██║╚════██║██║ ██╔══██║██║ ██║██╔══╝ ║
8
+ ║ ╚██████╗██║ ██║███████║╚██████╗██║ ██║██████╔╝███████╗ ║
9
+ ║ ╚═════╝╚═╝ ╚═╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚══════╝ ║
10
+ ║ ║
11
+ ║ Symbiotic Causation Monitoring for Neural Networks ║
12
+ ║ ║
13
+ ║ "even still, i grow, and yet, I grow still" ║
14
+ ║ ║
15
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
16
+
17
+ Cascade is a self-interpreting causation monitor that symbiotically adapts to
18
+ any system architecture through Kleene fixed-point convergence.
19
+
20
+ Feed it ANY signal format. It learns your system's patterns. It traces cause
21
+ and effect bidirectionally through time. It predicts cascading failures before
22
+ they complete.
23
+
24
+ Quick Start:
25
+ >>> import cascade
26
+ >>> monitor = cascade.Monitor()
27
+ >>> monitor.observe({"loss": 0.5, "epoch": 10})
28
+ >>> monitor.observe("ERROR: gradient exploded at layer 5")
29
+ >>>
30
+ >>> # What caused this?
31
+ >>> monitor.trace_backwards("gradient_explosion")
32
+ >>>
33
+ >>> # What will this cause?
34
+ >>> monitor.trace_forwards("learning_rate_spike")
35
+ """
36
+
37
+ __version__ = "0.5.4"
38
+ __author__ = "Cascade Team"
39
+ __license__ = "MIT"
40
+
41
+ from cascade.core.event import Event, CausationLink
42
+ from cascade.core.graph import CausationGraph
43
+ from cascade.core.adapter import SymbioticAdapter
44
+ from cascade.analysis.tracer import Tracer
45
+ from cascade.analysis.metrics import MetricsEngine
46
+
47
+ # Primary API
48
+ class Monitor:
49
+ """
50
+ The main entry point for Cascade monitoring.
51
+
52
+ A symbiotic observer that acclimates to any system architecture.
53
+ Feed it signals in any format — it adapts and builds a causation graph.
54
+
55
+ Example:
56
+ >>> monitor = cascade.Monitor()
57
+ >>>
58
+ >>> # Feed it anything - dicts, strings, tensors, whatever
59
+ >>> monitor.observe({"loss": 0.5, "epoch": 10})
60
+ >>> monitor.observe("2024-01-01 12:00:00 INFO training started")
61
+ >>> monitor.observe(torch.tensor([0.1, 0.2, 0.3]))
62
+ >>>
63
+ >>> # Trace causation backwards (what caused this?)
64
+ >>> causes = monitor.trace_backwards(event_id)
65
+ >>>
66
+ >>> # Trace causation forwards (what will this cause?)
67
+ >>> effects = monitor.trace_forwards(event_id)
68
+ >>>
69
+ >>> # Get the full causation graph
70
+ >>> graph = monitor.graph
71
+ """
72
+
73
+ def __init__(self, name: str = "default"):
74
+ """
75
+ Initialize a new Cascade monitor.
76
+
77
+ Args:
78
+ name: Optional name for this monitor instance
79
+ """
80
+ self.name = name
81
+ self.adapter = SymbioticAdapter()
82
+ self.graph = CausationGraph()
83
+ self.tracer = Tracer(self.graph)
84
+ self.metrics = MetricsEngine(self.graph)
85
+ self._event_count = 0
86
+
87
+ def observe(self, signal) -> Event:
88
+ """
89
+ Observe a signal from the host system.
90
+
91
+ The signal can be in ANY format:
92
+ - dict: {"loss": 0.5, "epoch": 10}
93
+ - str: "ERROR: gradient exploded"
94
+ - tensor: torch.tensor([...])
95
+ - protobuf, JSON, log line, etc.
96
+
97
+ Cascade will automatically adapt to your signal format.
98
+
99
+ Args:
100
+ signal: Any signal from the host system
101
+
102
+ Returns:
103
+ Event: The interpreted event added to the causation graph
104
+ """
105
+ event = self.adapter.interpret(signal)
106
+ self.graph.add_event(event)
107
+ self.metrics.ingest(event)
108
+ self._event_count += 1
109
+ return event
110
+
111
+ def trace_backwards(self, event_id: str, max_depth: int = 10):
112
+ """
113
+ Trace causation backwards: what caused this event?
114
+
115
+ Args:
116
+ event_id: ID of the event to trace from
117
+ max_depth: Maximum depth to trace (default: 10)
118
+
119
+ Returns:
120
+ List of CausationChain objects showing the causal history
121
+ """
122
+ return self.tracer.trace_backwards(event_id, max_depth)
123
+
124
+ def trace_forwards(self, event_id: str, max_depth: int = 10):
125
+ """
126
+ Trace causation forwards: what did this event cause?
127
+
128
+ Args:
129
+ event_id: ID of the event to trace from
130
+ max_depth: Maximum depth to trace (default: 10)
131
+
132
+ Returns:
133
+ List of CausationChain objects showing the effects
134
+ """
135
+ return self.tracer.trace_forwards(event_id, max_depth)
136
+
137
+ def find_root_causes(self, event_id: str):
138
+ """
139
+ Find the ultimate root causes of an event.
140
+
141
+ Goes all the way back to find the origin points.
142
+
143
+ Args:
144
+ event_id: ID of the event to analyze
145
+
146
+ Returns:
147
+ List of root cause events with their causal chains
148
+ """
149
+ return self.tracer.find_root_causes(event_id)
150
+
151
+ def analyze_impact(self, event_id: str, max_depth: int = 20):
152
+ """
153
+ Analyze the downstream impact of an event.
154
+
155
+ Traces forward to find everything this event set in motion.
156
+
157
+ Args:
158
+ event_id: ID of the event to analyze
159
+ max_depth: Maximum depth to search
160
+
161
+ Returns:
162
+ ImpactAnalysis with effects and severity score
163
+ """
164
+ return self.tracer.analyze_impact(event_id, max_depth)
165
+
166
+ def predict_cascade(self, event_id: str):
167
+ """
168
+ Predict the likely future cascade from this event.
169
+
170
+ Uses learned patterns to forecast effects before they happen.
171
+
172
+ Args:
173
+ event_id: ID of the event to predict from
174
+
175
+ Returns:
176
+ CascadePrediction with risk scores and intervention points
177
+ """
178
+ return self.tracer.predict_cascade(event_id)
179
+
180
+ def __repr__(self):
181
+ return f"<Cascade Monitor '{self.name}' | {self._event_count} events>"
182
+
183
+
184
+ # Convenience function for quick setup
185
+ def observe() -> Monitor:
186
+ """
187
+ Create a new Cascade monitor ready for observation.
188
+
189
+ This is the simplest way to get started:
190
+
191
+ >>> import cascade
192
+ >>> monitor = cascade.observe()
193
+ >>> monitor.observe({"loss": 0.5})
194
+
195
+ Returns:
196
+ Monitor: A new monitor instance
197
+ """
198
+ return Monitor()
199
+
200
+
201
+ # Tape utilities for event storage
202
+ from cascade.viz.tape import (
203
+ load_tape_file,
204
+ find_latest_tape,
205
+ list_tape_files,
206
+ PlaybackBuffer,
207
+ )
208
+
209
+ # SDK - Universal AI Observation Layer
210
+ from cascade.sdk import init, observe as sdk_observe, shutdown
211
+
212
+ # Store - Simple observe/query with HuggingFace sync
213
+ from cascade.store import (
214
+ observe as store_observe,
215
+ query as store_query,
216
+ get as store_get,
217
+ stats as store_stats,
218
+ sync_all,
219
+ pull_from_hf,
220
+ Receipt,
221
+ # Discovery - find other users' lattices
222
+ discover_models,
223
+ discover_datasets,
224
+ discover_live,
225
+ dataset_info,
226
+ )
227
+
228
+ # Convenience aliases
229
+ auto_observe = init # cascade.auto_observe() is clearer for some users
230
+
231
+ # HOLD - Inference-Level Halt Protocol
232
+ from cascade import hold as hold_module
233
+ from cascade.hold import (
234
+ Hold,
235
+ HoldPoint,
236
+ HoldResolution,
237
+ HoldState,
238
+ HoldAwareMixin,
239
+ CausationHold,
240
+ InferenceStep,
241
+ HoldSession,
242
+ ArcadeFeedback,
243
+ )
244
+
245
+
246
+ __all__ = [
247
+ # SDK - Primary Interface
248
+ "init",
249
+ "auto_observe",
250
+ "shutdown",
251
+ # Store - HuggingFace-backed storage
252
+ "store_observe",
253
+ "store_query",
254
+ "store_get",
255
+ "store_stats",
256
+ "sync_all",
257
+ "pull_from_hf",
258
+ "Receipt",
259
+ # Discovery
260
+ "discover_models",
261
+ "discover_datasets",
262
+ "discover_live",
263
+ "dataset_info",
264
+ # Monitor (causation tracking)
265
+ "Monitor",
266
+ "observe",
267
+ "Event",
268
+ "CausationLink",
269
+ "CausationGraph",
270
+ "SymbioticAdapter",
271
+ "Tracer",
272
+ "MetricsEngine",
273
+ # Tape playback
274
+ "load_tape_file",
275
+ "find_latest_tape",
276
+ "list_tape_files",
277
+ "PlaybackBuffer",
278
+ # HOLD - Inference Halt Protocol
279
+ "Hold",
280
+ "HoldPoint",
281
+ "HoldResolution",
282
+ "HoldState",
283
+ "HoldAwareMixin",
284
+ "CausationHold",
285
+ "InferenceStep",
286
+ "HoldSession",
287
+ "ArcadeFeedback",
288
+ "hold_module",
289
+ "__version__",
290
+ ]
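For orientation, here is a minimal usage sketch of the `Monitor` API defined above (an editorial example, not part of the committed file). It assumes `observe()` returns an `Event` exposing an `event_id` attribute, as the docstrings and the `cascade/analysis/metrics.py` listing further down suggest:

```python
import cascade

monitor = cascade.Monitor("training_loop")
first = monitor.observe({"loss": 0.9, "epoch": 1})
last = monitor.observe("ERROR: gradient exploded at layer 5")

causes = monitor.trace_backwards(last.event_id)    # what led to this event?
impact = monitor.analyze_impact(first.event_id)    # what did the first event set in motion?

print(monitor)                    # <Cascade Monitor 'training_loop' | 2 events>
print(monitor.metrics.summary())  # quantified view from the attached MetricsEngine
```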
cascade/analysis/__init__.py ADDED
@@ -0,0 +1,37 @@
+ """Cascade Analysis module - tracing, prediction, and intervention."""
+
+ from cascade.analysis.tracer import (
+     Tracer,
+     RootCauseAnalysis,
+     ImpactAnalysis,
+     CascadePrediction,
+ )
+ from cascade.analysis.metrics import (
+     MetricsEngine,
+     MetricSeries,
+     MetricCategory,
+     MetricHealthSpec,
+     Anomaly,
+     Correlation,
+     ThresholdCrossing,
+     classify_metric,
+     METRIC_TAXONOMY,
+     HEALTH_SPECS,
+ )
+
+ __all__ = [
+     "Tracer",
+     "RootCauseAnalysis",
+     "ImpactAnalysis",
+     "CascadePrediction",
+     "MetricsEngine",
+     "MetricSeries",
+     "MetricCategory",
+     "MetricHealthSpec",
+     "Anomaly",
+     "Correlation",
+     "ThresholdCrossing",
+     "classify_metric",
+     "METRIC_TAXONOMY",
+     "HEALTH_SPECS",
+ ]
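The `MetricsEngine` re-exported here is most easily reached through `Monitor.metrics` (see `cascade/__init__.py` above). A hedged sketch of querying it, using only attributes defined in the `metrics.py` listing below; exact output depends on adapter and tracer code not shown in this view:

```python
from cascade import Monitor

monitor = Monitor("demo")
for step, loss in enumerate([0.90, 0.72, 0.60, 0.55, 0.53]):
    monitor.observe({"step": step, "loss": loss, "grad_norm": 1.2})

engine = monitor.metrics                 # the MetricsEngine fed by observe()
loss_series = engine.get_metric("loss")  # a MetricSeries, or None if never seen
if loss_series is not None:
    print(loss_series.current, loss_series.delta, loss_series.trend())
    print(loss_series.health_status())   # 'healthy' / 'warning' / 'critical' / 'unknown'

for anomaly in engine.anomalies:         # empty until enough history accumulates
    print(anomaly.metric_name, anomaly.value, anomaly.severity)

for corr in engine.get_correlations():   # needs ~10 samples per metric to report anything
    print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")

print(engine.summary()["health_status"]["overall"])
```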
cascade/analysis/metrics.py ADDED
@@ -0,0 +1,1168 @@
1
+ """
2
+ Cascade Analysis - Metrics Engine.
3
+
4
+ The quantification layer. Extracts, tracks, and correlates numeric data
5
+ from the event stream. Provides the WHAT with enough depth that the WHY
6
+ becomes self-evident to the observer.
7
+
8
+ This module does NOT interpret or explain. It quantifies.
9
+
10
+ Industry-Standard Neural Network Observability Taxonomy:
11
+ =========================================================
12
+
13
+ CATEGORY 1: TRAINING_DYNAMICS
14
+ Core training loop metrics - loss, accuracy, learning rate, throughput
15
+
16
+ CATEGORY 2: GRADIENT_HEALTH
17
+ Gradient flow diagnostics - norms, clipping, vanishing/exploding
18
+
19
+ CATEGORY 3: WEIGHT_DYNAMICS
20
+ Parameter evolution - norms, update ratios, dead neurons
21
+
22
+ CATEGORY 4: ACTIVATION_FLOW
23
+ Forward pass health - magnitudes, saturation, dead ReLUs
24
+
25
+ CATEGORY 5: ATTENTION_MECHANICS
26
+ Transformer-specific - entropy, sparsity, head importance
27
+
28
+ CATEGORY 6: MEMORY_COMPUTE
29
+ Resource utilization - GPU/CPU memory, MFU, throughput
30
+
31
+ CATEGORY 7: OPTIMIZATION_STATE
32
+ Optimizer internals - Adam moments, momentum, weight decay
33
+
34
+ CATEGORY 8: CONVERGENCE_SIGNALS
35
+ Training health indicators - plateau, overfitting, noise scale
36
+
37
+ CATEGORY 9: DATA_PIPELINE
38
+ Data loading metrics - batch time, queue depth, prefetch
39
+
40
+ CATEGORY 10: REGULARIZATION
41
+ Regularization effects - dropout, batch norm, layer norm stats
42
+ """
43
+
44
+ from typing import Dict, List, Any, Optional, Tuple, Set
45
+ from dataclasses import dataclass, field
46
+ from collections import defaultdict
47
+ from enum import Enum, auto
48
+ import math
49
+ import re
50
+
51
+ from cascade.core.event import Event
52
+ from cascade.core.graph import CausationGraph
53
+
54
+
55
+ # =============================================================================
56
+ # METRIC CATEGORY TAXONOMY
57
+ # =============================================================================
58
+
59
+ class MetricCategory(Enum):
60
+ """Industry-standard neural network metric categories."""
61
+ TRAINING_DYNAMICS = auto() # Loss, accuracy, LR, throughput
62
+ GRADIENT_HEALTH = auto() # Grad norms, clipping, flow
63
+ WEIGHT_DYNAMICS = auto() # Weight norms, updates, dead neurons
64
+ ACTIVATION_FLOW = auto() # Activation stats, saturation
65
+ ATTENTION_MECHANICS = auto() # Attention entropy, sparsity, heads
66
+ MEMORY_COMPUTE = auto() # GPU/CPU mem, MFU, FLOPS
67
+ OPTIMIZATION_STATE = auto() # Adam moments, momentum, decay
68
+ CONVERGENCE_SIGNALS = auto() # Plateau, overfit, noise scale
69
+ DATA_PIPELINE = auto() # Batch time, queue, prefetch
70
+ REGULARIZATION = auto() # Dropout, norm layer stats
71
+ SYSTEM = auto() # Iteration, epoch, timestamps
72
+ UNKNOWN = auto() # Uncategorized metrics
73
+
74
+
75
+ # Comprehensive metric-to-category mapping
76
+ # This is the "knowledge base" of neural network metric taxonomy
77
+ METRIC_TAXONOMY: Dict[str, MetricCategory] = {
78
+ # TRAINING_DYNAMICS
79
+ "loss": MetricCategory.TRAINING_DYNAMICS,
80
+ "train_loss": MetricCategory.TRAINING_DYNAMICS,
81
+ "val_loss": MetricCategory.TRAINING_DYNAMICS,
82
+ "test_loss": MetricCategory.TRAINING_DYNAMICS,
83
+ "eval_loss": MetricCategory.TRAINING_DYNAMICS,
84
+ "nll_loss": MetricCategory.TRAINING_DYNAMICS,
85
+ "ce_loss": MetricCategory.TRAINING_DYNAMICS,
86
+ "cross_entropy": MetricCategory.TRAINING_DYNAMICS,
87
+ "mse_loss": MetricCategory.TRAINING_DYNAMICS,
88
+ "mae_loss": MetricCategory.TRAINING_DYNAMICS,
89
+ "perplexity": MetricCategory.TRAINING_DYNAMICS,
90
+ "ppl": MetricCategory.TRAINING_DYNAMICS,
91
+ "accuracy": MetricCategory.TRAINING_DYNAMICS,
92
+ "acc": MetricCategory.TRAINING_DYNAMICS,
93
+ "top1_acc": MetricCategory.TRAINING_DYNAMICS,
94
+ "top5_acc": MetricCategory.TRAINING_DYNAMICS,
95
+ "precision": MetricCategory.TRAINING_DYNAMICS,
96
+ "recall": MetricCategory.TRAINING_DYNAMICS,
97
+ "f1": MetricCategory.TRAINING_DYNAMICS,
98
+ "f1_score": MetricCategory.TRAINING_DYNAMICS,
99
+ "auc": MetricCategory.TRAINING_DYNAMICS,
100
+ "auroc": MetricCategory.TRAINING_DYNAMICS,
101
+ "bleu": MetricCategory.TRAINING_DYNAMICS,
102
+ "rouge": MetricCategory.TRAINING_DYNAMICS,
103
+ "lr": MetricCategory.TRAINING_DYNAMICS,
104
+ "learning_rate": MetricCategory.TRAINING_DYNAMICS,
105
+ "samples_per_sec": MetricCategory.TRAINING_DYNAMICS,
106
+ "tokens_per_sec": MetricCategory.TRAINING_DYNAMICS,
107
+ "throughput": MetricCategory.TRAINING_DYNAMICS,
108
+ "steps_per_sec": MetricCategory.TRAINING_DYNAMICS,
109
+
110
+ # GRADIENT_HEALTH
111
+ "grad_norm": MetricCategory.GRADIENT_HEALTH,
112
+ "gradient_norm": MetricCategory.GRADIENT_HEALTH,
113
+ "global_grad_norm": MetricCategory.GRADIENT_HEALTH,
114
+ "grad_norm_clipped": MetricCategory.GRADIENT_HEALTH,
115
+ "grad_clip_rate": MetricCategory.GRADIENT_HEALTH,
116
+ "grad_scale": MetricCategory.GRADIENT_HEALTH,
117
+ "grad_mean": MetricCategory.GRADIENT_HEALTH,
118
+ "grad_std": MetricCategory.GRADIENT_HEALTH,
119
+ "grad_max": MetricCategory.GRADIENT_HEALTH,
120
+ "grad_min": MetricCategory.GRADIENT_HEALTH,
121
+ "grad_sparsity": MetricCategory.GRADIENT_HEALTH,
122
+ "vanishing_grad": MetricCategory.GRADIENT_HEALTH,
123
+ "exploding_grad": MetricCategory.GRADIENT_HEALTH,
124
+
125
+ # WEIGHT_DYNAMICS
126
+ "weight_norm": MetricCategory.WEIGHT_DYNAMICS,
127
+ "param_norm": MetricCategory.WEIGHT_DYNAMICS,
128
+ "weight_mean": MetricCategory.WEIGHT_DYNAMICS,
129
+ "weight_std": MetricCategory.WEIGHT_DYNAMICS,
130
+ "update_ratio": MetricCategory.WEIGHT_DYNAMICS,
131
+ "weight_update": MetricCategory.WEIGHT_DYNAMICS,
132
+ "dead_neurons": MetricCategory.WEIGHT_DYNAMICS,
133
+ "dead_neuron_pct": MetricCategory.WEIGHT_DYNAMICS,
134
+ "param_count": MetricCategory.WEIGHT_DYNAMICS,
135
+ "num_params": MetricCategory.WEIGHT_DYNAMICS,
136
+ "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
137
+
138
+ # ACTIVATION_FLOW
139
+ "activation_mean": MetricCategory.ACTIVATION_FLOW,
140
+ "activation_std": MetricCategory.ACTIVATION_FLOW,
141
+ "activation_norm": MetricCategory.ACTIVATION_FLOW,
142
+ "activation_max": MetricCategory.ACTIVATION_FLOW,
143
+ "saturation": MetricCategory.ACTIVATION_FLOW,
144
+ "saturation_pct": MetricCategory.ACTIVATION_FLOW,
145
+ "dead_relu": MetricCategory.ACTIVATION_FLOW,
146
+ "dead_relu_pct": MetricCategory.ACTIVATION_FLOW,
147
+ "activation_sparsity": MetricCategory.ACTIVATION_FLOW,
148
+ # Generic activation stats from layer hooks
149
+ "mean": MetricCategory.ACTIVATION_FLOW,
150
+ "std": MetricCategory.ACTIVATION_FLOW,
151
+ "min": MetricCategory.ACTIVATION_FLOW,
152
+ "max": MetricCategory.ACTIVATION_FLOW,
153
+ "sparsity": MetricCategory.ACTIVATION_FLOW,
154
+ "layer_idx": MetricCategory.SYSTEM,
155
+
156
+ # ATTENTION_MECHANICS
157
+ "attention_entropy": MetricCategory.ATTENTION_MECHANICS,
158
+ "attn_entropy": MetricCategory.ATTENTION_MECHANICS,
159
+ "attention_sparsity": MetricCategory.ATTENTION_MECHANICS,
160
+ "head_importance": MetricCategory.ATTENTION_MECHANICS,
161
+ "attention_weight_norm": MetricCategory.ATTENTION_MECHANICS,
162
+ "position_bias": MetricCategory.ATTENTION_MECHANICS,
163
+ "attention_score_mean": MetricCategory.ATTENTION_MECHANICS,
164
+ "attention_score_std": MetricCategory.ATTENTION_MECHANICS,
165
+
166
+ # MEMORY_COMPUTE
167
+ "gpu_memory": MetricCategory.MEMORY_COMPUTE,
168
+ "gpu_mem": MetricCategory.MEMORY_COMPUTE,
169
+ "gpu_memory_allocated": MetricCategory.MEMORY_COMPUTE,
170
+ "gpu_memory_cached": MetricCategory.MEMORY_COMPUTE,
171
+ "gpu_memory_peak": MetricCategory.MEMORY_COMPUTE,
172
+ "cpu_memory": MetricCategory.MEMORY_COMPUTE,
173
+ "memory_usage": MetricCategory.MEMORY_COMPUTE,
174
+ "mfu": MetricCategory.MEMORY_COMPUTE,
175
+ "model_flops_utilization": MetricCategory.MEMORY_COMPUTE,
176
+ "flops": MetricCategory.MEMORY_COMPUTE,
177
+ "tflops": MetricCategory.MEMORY_COMPUTE,
178
+ "gpu_utilization": MetricCategory.MEMORY_COMPUTE,
179
+ "gpu_util": MetricCategory.MEMORY_COMPUTE,
180
+
181
+ # OPTIMIZATION_STATE
182
+ "adam_m_norm": MetricCategory.OPTIMIZATION_STATE,
183
+ "adam_v_norm": MetricCategory.OPTIMIZATION_STATE,
184
+ "momentum": MetricCategory.OPTIMIZATION_STATE,
185
+ "beta1": MetricCategory.OPTIMIZATION_STATE,
186
+ "beta2": MetricCategory.OPTIMIZATION_STATE,
187
+ "weight_decay": MetricCategory.OPTIMIZATION_STATE,
188
+ "effective_weight_decay": MetricCategory.OPTIMIZATION_STATE,
189
+ "warmup_progress": MetricCategory.OPTIMIZATION_STATE,
190
+ "lr_schedule_progress": MetricCategory.OPTIMIZATION_STATE,
191
+
192
+ # CONVERGENCE_SIGNALS
193
+ "train_val_gap": MetricCategory.CONVERGENCE_SIGNALS,
194
+ "overfit_ratio": MetricCategory.CONVERGENCE_SIGNALS,
195
+ "loss_plateau": MetricCategory.CONVERGENCE_SIGNALS,
196
+ "gradient_noise_scale": MetricCategory.CONVERGENCE_SIGNALS,
197
+ "critical_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
198
+ "effective_batch_size": MetricCategory.CONVERGENCE_SIGNALS,
199
+ "early_stop_score": MetricCategory.CONVERGENCE_SIGNALS,
200
+ "best_val_loss": MetricCategory.CONVERGENCE_SIGNALS,
201
+ "improvement_rate": MetricCategory.CONVERGENCE_SIGNALS,
202
+
203
+ # DATA_PIPELINE
204
+ "data_time": MetricCategory.DATA_PIPELINE,
205
+ "batch_time": MetricCategory.DATA_PIPELINE,
206
+ "load_time": MetricCategory.DATA_PIPELINE,
207
+ "preprocessing_time": MetricCategory.DATA_PIPELINE,
208
+ "augmentation_time": MetricCategory.DATA_PIPELINE,
209
+ "queue_depth": MetricCategory.DATA_PIPELINE,
210
+ "prefetch_factor": MetricCategory.DATA_PIPELINE,
211
+ "num_workers": MetricCategory.DATA_PIPELINE,
212
+
213
+ # REGULARIZATION
214
+ "dropout_rate": MetricCategory.REGULARIZATION,
215
+ "dropout": MetricCategory.REGULARIZATION,
216
+ "bn_mean": MetricCategory.REGULARIZATION,
217
+ "bn_var": MetricCategory.REGULARIZATION,
218
+ "bn_running_mean": MetricCategory.REGULARIZATION,
219
+ "bn_running_var": MetricCategory.REGULARIZATION,
220
+ "ln_mean": MetricCategory.REGULARIZATION,
221
+ "ln_var": MetricCategory.REGULARIZATION,
222
+ "l1_penalty": MetricCategory.REGULARIZATION,
223
+ "l2_penalty": MetricCategory.REGULARIZATION,
224
+
225
+ # SYSTEM
226
+ "iter": MetricCategory.SYSTEM,
227
+ "iteration": MetricCategory.SYSTEM,
228
+ "step": MetricCategory.SYSTEM,
229
+ "total": MetricCategory.SYSTEM,
230
+ "epoch": MetricCategory.SYSTEM,
231
+ "batch": MetricCategory.SYSTEM,
232
+ "batch_idx": MetricCategory.SYSTEM,
233
+ "global_step": MetricCategory.SYSTEM,
234
+ "time": MetricCategory.SYSTEM,
235
+ "dt": MetricCategory.SYSTEM,
236
+ "elapsed": MetricCategory.SYSTEM,
237
+ "wall_time": MetricCategory.SYSTEM,
238
+ "timestamp": MetricCategory.SYSTEM,
239
+ "hooked_layers": MetricCategory.SYSTEM,
240
+ "input_tokens": MetricCategory.SYSTEM,
241
+ "predicted_class": MetricCategory.TRAINING_DYNAMICS,
242
+
243
+ # MODEL INFO
244
+ "params": MetricCategory.WEIGHT_DYNAMICS,
245
+ "num_params": MetricCategory.WEIGHT_DYNAMICS,
246
+ "total_params": MetricCategory.WEIGHT_DYNAMICS,
247
+ "trainable_params": MetricCategory.WEIGHT_DYNAMICS,
248
+ "parameters": MetricCategory.WEIGHT_DYNAMICS,
249
+ "model_size": MetricCategory.WEIGHT_DYNAMICS,
250
+
251
+ # INFERENCE METRICS
252
+ "confidence": MetricCategory.TRAINING_DYNAMICS,
253
+ "similarity": MetricCategory.TRAINING_DYNAMICS,
254
+ "score": MetricCategory.TRAINING_DYNAMICS,
255
+ "prob": MetricCategory.TRAINING_DYNAMICS,
256
+ "probability": MetricCategory.TRAINING_DYNAMICS,
257
+ "entropy": MetricCategory.ATTENTION_MECHANICS,
258
+ "latency": MetricCategory.MEMORY_COMPUTE,
259
+ "inference_time": MetricCategory.MEMORY_COMPUTE,
260
+ "input_len": MetricCategory.DATA_PIPELINE,
261
+ "output_len": MetricCategory.DATA_PIPELINE,
262
+
263
+ # OBSERVATION SYSTEM METRICS
264
+ "hooked_modules": MetricCategory.SYSTEM,
265
+ "total_layers": MetricCategory.SYSTEM,
266
+ "sample_rate": MetricCategory.SYSTEM,
267
+ "layer_num": MetricCategory.SYSTEM,
268
+ "max_depth": MetricCategory.SYSTEM,
269
+ "return_code": MetricCategory.SYSTEM,
270
+ "pid": MetricCategory.SYSTEM,
271
+ "max_iterations": MetricCategory.SYSTEM,
272
+ "total_iterations": MetricCategory.SYSTEM,
273
+ "iterations": MetricCategory.SYSTEM,
274
+
275
+ # GPU/VRAM
276
+ "vram_gb": MetricCategory.MEMORY_COMPUTE,
277
+ "gpu_count": MetricCategory.MEMORY_COMPUTE,
278
+ "gpu_memory_gb": MetricCategory.MEMORY_COMPUTE,
279
+ }
280
+
281
+ # Patterns for dynamic metric name matching
282
+ METRIC_PATTERNS: List[Tuple[str, MetricCategory]] = [
283
+ (r".*loss.*", MetricCategory.TRAINING_DYNAMICS),
284
+ (r".*acc.*", MetricCategory.TRAINING_DYNAMICS),
285
+ (r".*accuracy.*", MetricCategory.TRAINING_DYNAMICS),
286
+ (r".*perplexity.*", MetricCategory.TRAINING_DYNAMICS),
287
+ (r".*lr.*", MetricCategory.TRAINING_DYNAMICS),
288
+ (r".*learning_rate.*", MetricCategory.TRAINING_DYNAMICS),
289
+ (r".*grad.*norm.*", MetricCategory.GRADIENT_HEALTH),
290
+ (r".*gradient.*", MetricCategory.GRADIENT_HEALTH),
291
+ (r".*weight.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
292
+ (r".*param.*norm.*", MetricCategory.WEIGHT_DYNAMICS),
293
+ (r".*activation.*", MetricCategory.ACTIVATION_FLOW),
294
+ (r".*attention.*", MetricCategory.ATTENTION_MECHANICS),
295
+ (r".*attn.*", MetricCategory.ATTENTION_MECHANICS),
296
+ (r".*memory.*", MetricCategory.MEMORY_COMPUTE),
297
+ (r".*gpu.*", MetricCategory.MEMORY_COMPUTE),
298
+ (r".*mfu.*", MetricCategory.MEMORY_COMPUTE),
299
+ (r".*adam.*", MetricCategory.OPTIMIZATION_STATE),
300
+ (r".*momentum.*", MetricCategory.OPTIMIZATION_STATE),
301
+ (r".*overfit.*", MetricCategory.CONVERGENCE_SIGNALS),
302
+ (r".*plateau.*", MetricCategory.CONVERGENCE_SIGNALS),
303
+ (r".*data.*time.*", MetricCategory.DATA_PIPELINE),
304
+ (r".*batch.*time.*", MetricCategory.DATA_PIPELINE),
305
+ (r".*dropout.*", MetricCategory.REGULARIZATION),
306
+ (r".*bn_.*", MetricCategory.REGULARIZATION),
307
+ (r".*ln_.*", MetricCategory.REGULARIZATION),
308
+ (r".*iter.*", MetricCategory.SYSTEM),
309
+ (r".*epoch.*", MetricCategory.SYSTEM),
310
+ (r".*step.*", MetricCategory.SYSTEM),
311
+ (r".*time.*", MetricCategory.SYSTEM),
312
+ (r".*_ms$", MetricCategory.SYSTEM),
313
+ (r".*duration.*", MetricCategory.SYSTEM),
314
+ ]
315
+
316
+
317
+ def classify_metric(name: str) -> MetricCategory:
318
+ """Classify a metric name into its category."""
319
+ name_lower = name.lower()
320
+
321
+ # Direct lookup
322
+ if name_lower in METRIC_TAXONOMY:
323
+ return METRIC_TAXONOMY[name_lower]
324
+
325
+ # Pattern matching
326
+ for pattern, category in METRIC_PATTERNS:
327
+ if re.match(pattern, name_lower):
328
+ return category
329
+
330
+ return MetricCategory.UNKNOWN
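A brief illustration of the direct-lookup-then-pattern fallback above (editorial sketch, not part of the committed file):

```python
from cascade.analysis.metrics import MetricCategory, classify_metric

# Exact taxonomy keys resolve first; unmatched names fall through to the regex
# patterns, and anything still unmatched is UNKNOWN.
assert classify_metric("loss") is MetricCategory.TRAINING_DYNAMICS               # direct key
assert classify_metric("gpu_memory_allocated") is MetricCategory.MEMORY_COMPUTE  # direct key
assert classify_metric("my_policy_loss") is MetricCategory.TRAINING_DYNAMICS     # pattern ".*loss.*"
assert classify_metric("data_load_time") is MetricCategory.DATA_PIPELINE         # pattern ".*data.*time.*"
assert classify_metric("foo") is MetricCategory.UNKNOWN                          # no match anywhere
```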
331
+
332
+
333
+ # =============================================================================
334
+ # METRIC HEALTH THRESHOLDS (Industry Standards)
335
+ # =============================================================================
336
+
337
+ @dataclass
338
+ class MetricHealthSpec:
339
+ """Specification for healthy metric ranges."""
340
+ name: str
341
+ category: MetricCategory
342
+ healthy_min: Optional[float] = None
343
+ healthy_max: Optional[float] = None
344
+ critical_min: Optional[float] = None
345
+ critical_max: Optional[float] = None
346
+ expected_trend: Optional[str] = None # 'falling', 'rising', 'stable'
347
+
348
+ def is_healthy(self, value: float) -> bool:
349
+ if self.healthy_min is not None and value < self.healthy_min:
350
+ return False
351
+ if self.healthy_max is not None and value > self.healthy_max:
352
+ return False
353
+ return True
354
+
355
+ def is_critical(self, value: float) -> bool:
356
+ if self.critical_min is not None and value < self.critical_min:
357
+ return True
358
+ if self.critical_max is not None and value > self.critical_max:
359
+ return True
360
+ return False
361
+
362
+
363
+ # Industry-standard health thresholds
364
+ HEALTH_SPECS: Dict[str, MetricHealthSpec] = {
365
+ "loss": MetricHealthSpec(
366
+ name="loss",
367
+ category=MetricCategory.TRAINING_DYNAMICS,
368
+ healthy_max=10.0,
369
+ critical_max=100.0,
370
+ expected_trend="falling",
371
+ ),
372
+ "grad_norm": MetricHealthSpec(
373
+ name="grad_norm",
374
+ category=MetricCategory.GRADIENT_HEALTH,
375
+ healthy_min=1e-7,
376
+ healthy_max=10.0,
377
+ critical_min=1e-10, # Vanishing
378
+ critical_max=1000.0, # Exploding
379
+ ),
380
+ "lr": MetricHealthSpec(
381
+ name="lr",
382
+ category=MetricCategory.TRAINING_DYNAMICS,
383
+ healthy_min=1e-8,
384
+ healthy_max=1.0,
385
+ critical_max=10.0,
386
+ ),
387
+ "mfu": MetricHealthSpec(
388
+ name="mfu",
389
+ category=MetricCategory.MEMORY_COMPUTE,
390
+ healthy_min=0.1, # 10% utilization minimum
391
+ healthy_max=1.0,
392
+ ),
393
+ "dead_relu_pct": MetricHealthSpec(
394
+ name="dead_relu_pct",
395
+ category=MetricCategory.ACTIVATION_FLOW,
396
+ healthy_max=0.3, # 30% dead is concerning
397
+ critical_max=0.7, # 70% dead is critical
398
+ ),
399
+ "train_val_gap": MetricHealthSpec(
400
+ name="train_val_gap",
401
+ category=MetricCategory.CONVERGENCE_SIGNALS,
402
+ healthy_max=0.5, # Gap shouldn't exceed 50% of train loss
403
+ critical_max=2.0, # Severe overfitting
404
+ ),
405
+ }
406
+
407
+
408
+ @dataclass
409
+ class MetricSeries:
410
+ """A time series of a single metric with category awareness."""
411
+ name: str
412
+ category: MetricCategory = field(default=MetricCategory.UNKNOWN)
413
+ values: List[float] = field(default_factory=list)
414
+ timestamps: List[float] = field(default_factory=list)
415
+ event_ids: List[str] = field(default_factory=list)
416
+
417
+ def __post_init__(self):
418
+ if self.category == MetricCategory.UNKNOWN:
419
+ self.category = classify_metric(self.name)
420
+
421
+ @property
422
+ def count(self) -> int:
423
+ return len(self.values)
424
+
425
+ @property
426
+ def current(self) -> Optional[float]:
427
+ return self.values[-1] if self.values else None
428
+
429
+ @property
430
+ def previous(self) -> Optional[float]:
431
+ return self.values[-2] if len(self.values) >= 2 else None
432
+
433
+ @property
434
+ def delta(self) -> Optional[float]:
435
+ """Change from previous to current."""
436
+ if len(self.values) >= 2:
437
+ return self.values[-1] - self.values[-2]
438
+ return None
439
+
440
+ @property
441
+ def delta_pct(self) -> Optional[float]:
442
+ """Percentage change from previous to current."""
443
+ if len(self.values) >= 2 and self.values[-2] != 0:
444
+ return (self.values[-1] - self.values[-2]) / abs(self.values[-2])
445
+ return None
446
+
447
+ @property
448
+ def mean(self) -> Optional[float]:
449
+ return sum(self.values) / len(self.values) if self.values else None
450
+
451
+ @property
452
+ def std(self) -> Optional[float]:
453
+ if len(self.values) < 2:
454
+ return None
455
+ mean = self.mean
456
+ variance = sum((x - mean) ** 2 for x in self.values) / len(self.values)
457
+ return math.sqrt(variance)
458
+
459
+ @property
460
+ def min(self) -> Optional[float]:
461
+ return min(self.values) if self.values else None
462
+
463
+ @property
464
+ def max(self) -> Optional[float]:
465
+ return max(self.values) if self.values else None
466
+
467
+ @property
468
+ def range(self) -> Optional[float]:
469
+ if self.values:
470
+ return self.max - self.min
471
+ return None
472
+
473
+ def moving_average(self, window: int = 5) -> Optional[float]:
474
+ """Compute moving average over last N values."""
475
+ if len(self.values) < window:
476
+ return self.mean
477
+ return sum(self.values[-window:]) / window
478
+
479
+ def rate_of_change(self, window: int = 5) -> Optional[float]:
480
+ """Average rate of change over last N values."""
481
+ if len(self.values) < 2:
482
+ return None
483
+ window = min(window, len(self.values))
484
+ recent = self.values[-window:]
485
+ deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
486
+ return sum(deltas) / len(deltas) if deltas else None
487
+
488
+ def is_anomaly(self, threshold_std: float = 2.0) -> bool:
489
+ """Is current value anomalous (outside N standard deviations)?"""
490
+ if len(self.values) < 5 or self.std is None or self.std == 0:
491
+ return False
492
+ return abs(self.values[-1] - self.mean) > threshold_std * self.std
493
+
494
+ def trend(self, window: int = 10) -> str:
495
+ """Determine trend: 'rising', 'falling', 'stable', 'volatile'."""
496
+ if len(self.values) < 3:
497
+ return "unknown"
498
+
499
+ window = min(window, len(self.values))
500
+ recent = self.values[-window:]
501
+ deltas = [recent[i] - recent[i-1] for i in range(1, len(recent))]
502
+
503
+ positive = sum(1 for d in deltas if d > 0)
504
+ negative = sum(1 for d in deltas if d < 0)
505
+
506
+ if positive > 0.7 * len(deltas):
507
+ return "rising"
508
+ elif negative > 0.7 * len(deltas):
509
+ return "falling"
510
+ elif self.std and self.mean and self.std > 0.1 * abs(self.mean):
511
+ return "volatile"
512
+ else:
513
+ return "stable"
514
+
515
+ def health_status(self) -> str:
516
+ """Check health against industry standards. Returns 'healthy', 'warning', 'critical', 'unknown'."""
517
+ if self.current is None:
518
+ return "unknown"
519
+
520
+ name_lower = self.name.lower()
521
+ if name_lower in HEALTH_SPECS:
522
+ spec = HEALTH_SPECS[name_lower]
523
+ if spec.is_critical(self.current):
524
+ return "critical"
525
+ if not spec.is_healthy(self.current):
526
+ return "warning"
527
+ return "healthy"
528
+
529
+ # Default heuristics for unknown metrics
530
+ if self.is_anomaly(threshold_std=3.0):
531
+ return "critical"
532
+ if self.is_anomaly(threshold_std=2.0):
533
+ return "warning"
534
+ return "healthy"
535
+
536
+ def to_dict(self) -> Dict[str, Any]:
537
+ return {
538
+ "name": self.name,
539
+ "category": self.category.name,
540
+ "count": self.count,
541
+ "current": self.current,
542
+ "delta": self.delta,
543
+ "delta_pct": self.delta_pct,
544
+ "mean": self.mean,
545
+ "std": self.std,
546
+ "min": self.min,
547
+ "max": self.max,
548
+ "trend": self.trend(),
549
+ "health": self.health_status(),
550
+ "is_anomaly": self.is_anomaly(),
551
+ "rate_of_change": self.rate_of_change(),
552
+ }
553
+
554
+
555
+ @dataclass
556
+ class Anomaly:
557
+ """A detected anomaly in the metric stream."""
558
+ metric_name: str
559
+ category: MetricCategory
560
+ event_id: str
561
+ timestamp: float
562
+ value: float
563
+ expected_range: Tuple[float, float] # (low, high)
564
+ deviation_std: float
565
+ severity: str # 'minor', 'major', 'critical'
566
+
567
+
568
+ @dataclass
569
+ class Correlation:
570
+ """A detected correlation between two metrics."""
571
+ metric_a: str
572
+ metric_b: str
573
+ category_a: MetricCategory
574
+ category_b: MetricCategory
575
+ coefficient: float # -1 to 1
576
+ strength: str # 'weak', 'moderate', 'strong'
577
+ direction: str # 'positive', 'negative'
578
+
579
+
580
+ @dataclass
581
+ class ThresholdCrossing:
582
+ """A metric crossing a significant threshold."""
583
+ metric_name: str
584
+ category: MetricCategory
585
+ event_id: str
586
+ timestamp: float
587
+ old_value: float
588
+ new_value: float
589
+ threshold: float
590
+ direction: str # 'above', 'below'
591
+
592
+
593
+ class MetricsEngine:
594
+ """
595
+ Quantification engine for the event stream.
596
+
597
+ Extracts numeric metrics from events, tracks them over time,
598
+ detects anomalies, correlations, and threshold crossings.
599
+
600
+ Does NOT interpret or explain. Provides raw quantified data
601
+ for human or AI observers to divine meaning from.
602
+
603
+ Example:
604
+ >>> engine = MetricsEngine(graph)
605
+ >>> engine.ingest(event)
606
+ >>>
607
+ >>> # Get metric statistics
608
+ >>> loss = engine.get_metric("loss")
609
+ >>> print(f"Loss: {loss.current} (delta: {loss.delta}, trend: {loss.trend()})")
610
+ >>>
611
+ >>> # Get anomalies
612
+ >>> for anomaly in engine.anomalies:
613
+ ... print(f"ANOMALY: {anomaly.metric_name} = {anomaly.value}")
614
+ >>>
615
+ >>> # Get correlations
616
+ >>> for corr in engine.get_correlations():
617
+ ... print(f"{corr.metric_a} ~ {corr.metric_b}: {corr.coefficient:.2f}")
618
+ """
619
+
620
+ def __init__(self, graph: Optional[CausationGraph] = None):
621
+ self.graph = graph
622
+ self._metrics: Dict[str, MetricSeries] = {}
623
+ self._anomalies: List[Anomaly] = []
624
+ self._threshold_crossings: List[ThresholdCrossing] = []
625
+ self._event_count = 0
626
+
627
+ # Configurable thresholds
628
+ self.anomaly_std_threshold = 2.5
629
+ self.correlation_min_samples = 10
630
+
631
+ # Known significant thresholds for ML metrics
632
+ self._known_thresholds = {
633
+ "loss": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
634
+ "accuracy": [0.5, 0.8, 0.9, 0.95, 0.99],
635
+ "lr": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
636
+ "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 0.1],
637
+ "grad_norm": [0.1, 1.0, 10.0, 100.0],
638
+ "gradient_norm": [0.1, 1.0, 10.0, 100.0],
639
+ }
640
+
641
+ def ingest(self, event: Event) -> Dict[str, MetricSeries]:
642
+ """
643
+ Ingest an event and extract/track all numeric metrics.
644
+
645
+ Returns dict of updated metric series.
646
+ """
647
+ self._event_count += 1
648
+ updated = {}
649
+
650
+ for key, value in event.data.items():
651
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
652
+ category = classify_metric(key)
653
+
654
+ if math.isnan(value) or math.isinf(value):
655
+ # Track NaN/Inf as anomalies but don't add to series
656
+ self._anomalies.append(Anomaly(
657
+ metric_name=key,
658
+ category=category,
659
+ event_id=event.event_id,
660
+ timestamp=event.timestamp,
661
+ value=value,
662
+ expected_range=(0, 0),
663
+ deviation_std=float('inf'),
664
+ severity='critical',
665
+ ))
666
+ continue
667
+
668
+ # Get or create metric series with proper category
669
+ if key not in self._metrics:
670
+ self._metrics[key] = MetricSeries(name=key, category=category)
671
+
672
+ series = self._metrics[key]
673
+ old_value = series.current
674
+
675
+ # Add new value
676
+ series.values.append(float(value))
677
+ series.timestamps.append(event.timestamp)
678
+ series.event_ids.append(event.event_id)
679
+
680
+ # Check for anomaly
681
+ if series.is_anomaly(self.anomaly_std_threshold):
682
+ deviation = abs(value - series.mean) / series.std if series.std else 0
683
+ severity = 'critical' if deviation > 4 else 'major' if deviation > 3 else 'minor'
684
+ self._anomalies.append(Anomaly(
685
+ metric_name=key,
686
+ category=category,
687
+ event_id=event.event_id,
688
+ timestamp=event.timestamp,
689
+ value=value,
690
+ expected_range=(
691
+ series.mean - 2*series.std,
692
+ series.mean + 2*series.std
693
+ ),
694
+ deviation_std=deviation,
695
+ severity=severity,
696
+ ))
697
+
698
+ # Check for threshold crossing
699
+ if old_value is not None:
700
+ self._check_threshold_crossing(
701
+ key, event.event_id, event.timestamp, old_value, value
702
+ )
703
+
704
+ updated[key] = series
705
+
706
+ return updated
707
+
708
+ def _check_threshold_crossing(
709
+ self,
710
+ metric: str,
711
+ event_id: str,
712
+ timestamp: float,
713
+ old_value: float,
714
+ new_value: float
715
+ ):
716
+ """Check if a metric crossed a known threshold."""
717
+ thresholds = self._known_thresholds.get(metric, [])
718
+ category = classify_metric(metric)
719
+
720
+ for threshold in thresholds:
721
+ # Crossed upward
722
+ if old_value < threshold <= new_value:
723
+ self._threshold_crossings.append(ThresholdCrossing(
724
+ metric_name=metric,
725
+ category=category,
726
+ event_id=event_id,
727
+ timestamp=timestamp,
728
+ old_value=old_value,
729
+ new_value=new_value,
730
+ threshold=threshold,
731
+ direction='above',
732
+ ))
733
+ # Crossed downward
734
+ elif old_value > threshold >= new_value:
735
+ self._threshold_crossings.append(ThresholdCrossing(
736
+ metric_name=metric,
737
+ category=category,
738
+ event_id=event_id,
739
+ timestamp=timestamp,
740
+ old_value=old_value,
741
+ new_value=new_value,
742
+ threshold=threshold,
743
+ direction='below',
744
+ ))
745
+
746
+ def get_metric(self, name: str) -> Optional[MetricSeries]:
747
+ """Get a metric series by name."""
748
+ return self._metrics.get(name)
749
+
750
+ @property
751
+ def metrics(self) -> Dict[str, MetricSeries]:
752
+ """All tracked metrics."""
753
+ return self._metrics
754
+
755
+ @property
756
+ def metric_names(self) -> List[str]:
757
+ """Names of all tracked metrics."""
758
+ return list(self._metrics.keys())
759
+
760
+ @property
761
+ def anomalies(self) -> List[Anomaly]:
762
+ """All detected anomalies."""
763
+ return self._anomalies
764
+
765
+ @property
766
+ def recent_anomalies(self) -> List[Anomaly]:
767
+ """Anomalies from last 10 events."""
768
+ if not self._anomalies:
769
+ return []
770
+ recent_ids = set()
771
+ for series in self._metrics.values():
772
+ recent_ids.update(series.event_ids[-10:])
773
+ return [a for a in self._anomalies if a.event_id in recent_ids]
774
+
775
+ @property
776
+ def threshold_crossings(self) -> List[ThresholdCrossing]:
777
+ """All threshold crossings."""
778
+ return self._threshold_crossings
779
+
780
+ def get_correlations(self, min_coefficient: float = 0.5) -> List[Correlation]:
781
+ """
782
+ Compute correlations between all metric pairs.
783
+
784
+ Returns correlations with |coefficient| >= min_coefficient.
785
+ """
786
+ correlations = []
787
+ metric_names = list(self._metrics.keys())
788
+
789
+ for i, name_a in enumerate(metric_names):
790
+ series_a = self._metrics[name_a]
791
+ for name_b in metric_names[i+1:]:
792
+ series_b = self._metrics[name_b]
793
+ coef = self._pearson_correlation(name_a, name_b)
794
+ if coef is not None and abs(coef) >= min_coefficient:
795
+ strength = 'strong' if abs(coef) > 0.8 else 'moderate' if abs(coef) > 0.5 else 'weak'
796
+ direction = 'positive' if coef > 0 else 'negative'
797
+ correlations.append(Correlation(
798
+ metric_a=name_a,
799
+ metric_b=name_b,
800
+ category_a=series_a.category,
801
+ category_b=series_b.category,
802
+ coefficient=coef,
803
+ strength=strength,
804
+ direction=direction,
805
+ ))
806
+
807
+ return sorted(correlations, key=lambda c: abs(c.coefficient), reverse=True)
808
+
809
+ def _pearson_correlation(self, name_a: str, name_b: str) -> Optional[float]:
810
+ """Compute Pearson correlation between two metrics."""
811
+ series_a = self._metrics.get(name_a)
812
+ series_b = self._metrics.get(name_b)
813
+
814
+ if not series_a or not series_b:
815
+ return None
816
+
817
+ # Need enough samples
818
+ if series_a.count < self.correlation_min_samples or series_b.count < self.correlation_min_samples:
819
+ return None
820
+
821
+ # Align by taking min length
822
+ n = min(series_a.count, series_b.count)
823
+ a = series_a.values[-n:]
824
+ b = series_b.values[-n:]
825
+
826
+ # Compute correlation
827
+ mean_a = sum(a) / n
828
+ mean_b = sum(b) / n
829
+
830
+ numerator = sum((a[i] - mean_a) * (b[i] - mean_b) for i in range(n))
831
+
832
+ var_a = sum((x - mean_a) ** 2 for x in a)
833
+ var_b = sum((x - mean_b) ** 2 for x in b)
834
+
835
+ denominator = math.sqrt(var_a * var_b)
836
+
837
+ if denominator == 0:
838
+ return None
839
+
840
+ return numerator / denominator
841
+
842
+ def summary(self) -> Dict[str, Any]:
843
+ """Get a summary of all metrics and detections."""
844
+ return {
845
+ "event_count": self._event_count,
846
+ "metric_count": len(self._metrics),
847
+ "metrics": {name: series.to_dict() for name, series in self._metrics.items()},
848
+ "metrics_by_category": self.metrics_by_category_summary(),
849
+ "anomaly_count": len(self._anomalies),
850
+ "recent_anomalies": [
851
+ {"metric": a.metric_name, "category": a.category.name, "value": a.value, "severity": a.severity}
852
+ for a in self.recent_anomalies
853
+ ],
854
+ "threshold_crossings": len(self._threshold_crossings),
855
+ "correlations": [
856
+ {"a": c.metric_a, "b": c.metric_b, "r": c.coefficient,
857
+ "cat_a": c.category_a.name, "cat_b": c.category_b.name}
858
+ for c in self.get_correlations()[:5] # Top 5
859
+ ],
860
+ "health_status": self.health_summary(),
861
+ }
862
+
863
+ # =========================================================================
864
+ # CATEGORY-AWARE QUERIES
865
+ # =========================================================================
866
+
867
+ def get_metrics_by_category(self, category: MetricCategory) -> Dict[str, MetricSeries]:
868
+ """Get all metrics in a specific category."""
869
+ return {
870
+ name: series for name, series in self._metrics.items()
871
+ if series.category == category
872
+ }
873
+
874
+ def metrics_by_category_summary(self) -> Dict[str, Dict[str, Any]]:
875
+ """Get metric count and names grouped by category."""
876
+ by_cat: Dict[str, Dict[str, Any]] = {}
877
+ for name, series in self._metrics.items():
878
+ cat_name = series.category.name
879
+ if cat_name not in by_cat:
880
+ by_cat[cat_name] = {"count": 0, "metrics": [], "health": []}
881
+ by_cat[cat_name]["count"] += 1
882
+ by_cat[cat_name]["metrics"].append(name)
883
+ by_cat[cat_name]["health"].append(series.health_status())
884
+ return by_cat
885
+
886
+ def get_training_metrics(self) -> Dict[str, MetricSeries]:
887
+ """Convenience: get all TRAINING_DYNAMICS metrics."""
888
+ return self.get_metrics_by_category(MetricCategory.TRAINING_DYNAMICS)
889
+
890
+ def get_gradient_metrics(self) -> Dict[str, MetricSeries]:
891
+ """Convenience: get all GRADIENT_HEALTH metrics."""
892
+ return self.get_metrics_by_category(MetricCategory.GRADIENT_HEALTH)
893
+
894
+ def get_memory_metrics(self) -> Dict[str, MetricSeries]:
895
+ """Convenience: get all MEMORY_COMPUTE metrics."""
896
+ return self.get_metrics_by_category(MetricCategory.MEMORY_COMPUTE)
897
+
898
+ def get_convergence_metrics(self) -> Dict[str, MetricSeries]:
899
+ """Convenience: get all CONVERGENCE_SIGNALS metrics."""
900
+ return self.get_metrics_by_category(MetricCategory.CONVERGENCE_SIGNALS)
901
+
902
+ def health_summary(self) -> Dict[str, Any]:
903
+ """Get overall health status of all metrics."""
904
+ statuses = {"healthy": 0, "warning": 0, "critical": 0, "unknown": 0}
905
+ issues = []
906
+
907
+ for name, series in self._metrics.items():
908
+ status = series.health_status()
909
+ statuses[status] += 1
910
+ if status in ("warning", "critical"):
911
+ issues.append({
912
+ "metric": name,
913
+ "category": series.category.name,
914
+ "status": status,
915
+ "value": series.current,
916
+ "trend": series.trend(),
917
+ })
918
+
919
+ overall = "critical" if statuses["critical"] > 0 else \
920
+ "warning" if statuses["warning"] > 0 else "healthy"
921
+
922
+ return {
923
+ "overall": overall,
924
+ "counts": statuses,
925
+ "issues": issues,
926
+ }
927
+
928
+ def get_cross_category_correlations(self) -> List[Correlation]:
929
+ """Get correlations between metrics in different categories."""
930
+ all_corr = self.get_correlations(min_coefficient=0.3)
931
+ return [c for c in all_corr if c.category_a != c.category_b]
932
+
933
+ def get_category_coverage(self) -> Dict[str, bool]:
934
+ """Check which metric categories are being tracked."""
935
+ tracked = {series.category for series in self._metrics.values()}
936
+ return {cat.name: cat in tracked for cat in MetricCategory}
937
+
938
+ # =========================================================================
939
+ # TRIAGE SYSTEM - Common Sense Diagnostics (Occam's Razor)
940
+ # =========================================================================
941
+ #
942
+ # Five questions that matter:
943
+ # 1. Is training working? (loss trend)
944
+ # 2. Is it about to explode? (gradient health)
945
+ # 3. Am I wasting compute? (efficiency)
946
+ # 4. Am I overfitting? (generalization gap)
947
+ # 5. What broke and why? (anomaly + correlation)
948
+ #
949
+
950
+ def triage(self) -> Dict[str, Any]:
951
+ """
952
+ Quick diagnostic: Is training healthy? What's wrong?
953
+
954
+ Returns a simple, actionable assessment.
955
+ Occam's Razor: simplest useful answer.
956
+ """
957
+ diagnosis = {
958
+ "status": "LISTENING", # Not UNKNOWN - we're actively waiting
959
+ "confidence": 0.0,
960
+ "checks": {},
961
+ "action": "Collecting initial metrics...",
962
+ "details": [],
963
+ }
964
+
965
+ checks_passed = 0
966
+ checks_total = 0
967
+
968
+ # CHECK 1: Is loss going down?
969
+ loss_check = self._check_loss_progress()
970
+ diagnosis["checks"]["loss_progress"] = loss_check
971
+ checks_total += 1
972
+ if loss_check["ok"]:
973
+ checks_passed += 1
974
+
975
+ # CHECK 2: Are gradients healthy?
976
+ grad_check = self._check_gradient_health()
977
+ diagnosis["checks"]["gradient_health"] = grad_check
978
+ checks_total += 1
979
+ if grad_check["ok"]:
980
+ checks_passed += 1
981
+
982
+ # CHECK 3: Am I using compute efficiently?
983
+ efficiency_check = self._check_efficiency()
984
+ diagnosis["checks"]["efficiency"] = efficiency_check
985
+ checks_total += 1
986
+ if efficiency_check["ok"]:
987
+ checks_passed += 1
988
+
989
+ # CHECK 4: Am I overfitting?
990
+ overfit_check = self._check_overfitting()
991
+ diagnosis["checks"]["overfitting"] = overfit_check
992
+ checks_total += 1
993
+ if overfit_check["ok"]:
994
+ checks_passed += 1
995
+
996
+ # CHECK 5: Any anomalies pointing to root cause?
997
+ anomaly_check = self._check_anomalies()
998
+ diagnosis["checks"]["anomalies"] = anomaly_check
999
+ checks_total += 1
1000
+ if anomaly_check["ok"]:
1001
+ checks_passed += 1
1002
+
1003
+ # Overall status
1004
+ diagnosis["confidence"] = checks_passed / checks_total if checks_total > 0 else 0
1005
+
1006
+ if checks_passed == checks_total:
1007
+ diagnosis["status"] = "HEALTHY"
1008
+ diagnosis["action"] = "Training looks good. Continue monitoring."
1009
+ elif checks_passed >= checks_total * 0.6:
1010
+ diagnosis["status"] = "WARNING"
1011
+ # Find what's wrong
1012
+ issues = [k for k, v in diagnosis["checks"].items() if not v["ok"]]
1013
+ diagnosis["action"] = f"Review: {', '.join(issues)}"
1014
+ else:
1015
+ diagnosis["status"] = "CRITICAL"
1016
+ diagnosis["action"] = "Stop and investigate. Multiple issues detected."
1017
+
1018
+ # Collect all details
1019
+ for check_name, check_result in diagnosis["checks"].items():
1020
+ if check_result.get("detail"):
1021
+ diagnosis["details"].append(f"{check_name}: {check_result['detail']}")
1022
+
1023
+ return diagnosis
1024
+
1025
+ def _check_loss_progress(self) -> Dict[str, Any]:
1026
+ """Is loss decreasing as expected?"""
1027
+ # Find loss metric (try common names)
1028
+ loss_series = None
1029
+ for name in ["loss", "train_loss", "nll_loss", "ce_loss"]:
1030
+ if name in self._metrics:
1031
+ loss_series = self._metrics[name]
1032
+ break
1033
+
1034
+ if loss_series is None or loss_series.count < 3:
1035
+ return {"ok": True, "detail": "Waiting for loss metrics (need 3+)", "status": "waiting"}
1036
+
1037
+ trend = loss_series.trend()
1038
+ roc = loss_series.rate_of_change()
1039
+
1040
+ if trend == "falling":
1041
+ return {"ok": True, "detail": f"Loss falling (Δ={roc:.4f}/step)", "status": "good"}
1042
+ elif trend == "stable" and loss_series.current < 1.0:
1043
+ return {"ok": True, "detail": f"Loss stable at {loss_series.current:.4f}", "status": "converged"}
1044
+ elif trend == "rising":
1045
+ return {"ok": False, "detail": f"Loss RISING! Current: {loss_series.current:.4f}", "status": "diverging"}
1046
+ elif trend == "volatile":
1047
+ return {"ok": False, "detail": f"Loss unstable (std={loss_series.std:.4f})", "status": "unstable"}
1048
+ else:
1049
+ return {"ok": True, "detail": f"Loss: {loss_series.current:.4f} (trend unclear)", "status": "stable"}
1050
+
1051
+ def _check_gradient_health(self) -> Dict[str, Any]:
1052
+ """Are gradients in a healthy range?"""
1053
+ grad_series = None
1054
+ for name in ["grad_norm", "gradient_norm", "global_grad_norm"]:
1055
+ if name in self._metrics:
1056
+ grad_series = self._metrics[name]
1057
+ break
1058
+
1059
+ if grad_series is None or grad_series.count < 2:
1060
+ return {"ok": True, "detail": "Waiting for grad_norm metrics", "status": "waiting"}
1061
+
1062
+ current = grad_series.current
1063
+
1064
+ # Vanishing gradients
1065
+ if current < 1e-7:
1066
+ return {"ok": False, "detail": f"VANISHING gradients: {current:.2e}", "status": "vanishing"}
1067
+
1068
+ # Exploding gradients
1069
+ if current > 100:
1070
+ return {"ok": False, "detail": f"EXPLODING gradients: {current:.2f}", "status": "exploding"}
1071
+
1072
+ # Healthy range
1073
+ if 1e-5 < current < 10:
1074
+ return {"ok": True, "detail": f"Gradients healthy: {current:.4f}", "status": "healthy"}
1075
+
1076
+ # Warning zone
1077
+ return {"ok": True, "detail": f"Gradients marginal: {current:.4f}", "status": "marginal"}
1078
+
1079
+ def _check_efficiency(self) -> Dict[str, Any]:
1080
+ """Am I using compute efficiently?"""
1081
+ # Check MFU (Model FLOP Utilization)
1082
+ mfu_series = self._metrics.get("mfu")
1083
+ if mfu_series and mfu_series.count > 0:
1084
+ mfu = mfu_series.current
1085
+ if mfu < 0.1:
1086
+ return {"ok": False, "detail": f"Low GPU utilization: {mfu*100:.1f}%", "status": "inefficient"}
1087
+ elif mfu < 0.3:
1088
+ return {"ok": True, "detail": f"Moderate efficiency: {mfu*100:.1f}%", "status": "moderate"}
1089
+ else:
1090
+ return {"ok": True, "detail": f"Good efficiency: {mfu*100:.1f}%", "status": "efficient"}
1091
+
1092
+ # Fallback: check timing
1093
+ time_series = self._metrics.get("dt") or self._metrics.get("time") or self._metrics.get("batch_time")
1094
+ if time_series and time_series.count > 2:
1095
+ trend = time_series.trend()
1096
+ if trend == "rising":
1097
+ return {"ok": False, "detail": "Step time increasing (slowdown)", "status": "degrading"}
1098
+ return {"ok": True, "detail": f"Step time: {time_series.current:.3f}s", "status": "stable"}
1099
+
1100
+ return {"ok": True, "detail": "Need mfu or dt/time metrics", "status": "waiting"}
1101
+
1102
+ def _check_overfitting(self) -> Dict[str, Any]:
1103
+ """Is model overfitting?"""
1104
+ train_loss = None
1105
+ val_loss = None
1106
+
1107
+ # Find train and val loss
1108
+ for name in ["loss", "train_loss"]:
1109
+ if name in self._metrics:
1110
+ train_loss = self._metrics[name]
1111
+ break
1112
+
1113
+ for name in ["val_loss", "eval_loss", "test_loss"]:
1114
+ if name in self._metrics:
1115
+ val_loss = self._metrics[name]
1116
+ break
1117
+
1118
+ if train_loss is None or val_loss is None:
1119
+ return {"ok": True, "detail": "Need train_loss + val_loss to check", "status": "waiting"}
1120
+
1121
+ if train_loss.count < 3 or val_loss.count < 3:
1122
+ return {"ok": True, "detail": f"Collecting ({train_loss.count}/3 train, {val_loss.count}/3 val)", "status": "waiting"}
1123
+
1124
+ gap = val_loss.current - train_loss.current
1125
+ gap_pct = gap / train_loss.current if train_loss.current > 0 else 0
1126
+
1127
+ # Check if gap is widening
1128
+ train_trend = train_loss.trend()
1129
+ val_trend = val_loss.trend()
1130
+
1131
+ if train_trend == "falling" and val_trend == "rising":
1132
+ return {"ok": False, "detail": f"OVERFITTING: train↓ val↑ (gap={gap:.4f})", "status": "overfitting"}
1133
+
1134
+ if gap_pct > 0.5: # Val loss 50% higher than train
1135
+ return {"ok": False, "detail": f"Large generalization gap: {gap_pct*100:.1f}%", "status": "high_gap"}
1136
+
1137
+ if gap_pct > 0.2:
1138
+ return {"ok": True, "detail": f"Moderate gap: {gap_pct*100:.1f}%", "status": "moderate_gap"}
1139
+
1140
+ return {"ok": True, "detail": f"Good generalization (gap={gap:.4f})", "status": "healthy"}
1141
+
1142
+ def _check_anomalies(self) -> Dict[str, Any]:
1143
+ """Any recent anomalies that need attention?"""
1144
+ recent = self.recent_anomalies
1145
+
1146
+ if not recent:
1147
+ return {"ok": True, "detail": "No anomalies", "status": "clean"}
1148
+
1149
+ critical = [a for a in recent if a.severity == "critical"]
1150
+ major = [a for a in recent if a.severity == "major"]
1151
+
1152
+ if critical:
1153
+ names = list(set(a.metric_name for a in critical))
1154
+ return {"ok": False, "detail": f"CRITICAL anomalies in: {', '.join(names)}", "status": "critical"}
1155
+
1156
+ if major:
1157
+ names = list(set(a.metric_name for a in major))
1158
+ return {"ok": False, "detail": f"Major anomalies in: {', '.join(names)}", "status": "major"}
1159
+
1160
+ return {"ok": True, "detail": f"{len(recent)} minor anomalies", "status": "minor"}
1161
+
1162
+ def quick_status(self) -> str:
1163
+ """One-line status for dashboards."""
1164
+ t = self.triage()
1165
+ return f"[{t['status']}] {t['action']} (confidence: {t['confidence']*100:.0f}%)"
1166
+
1167
+ def __repr__(self) -> str:
1168
+ return f"<MetricsEngine | {len(self._metrics)} metrics, {len(self._anomalies)} anomalies>"
cascade/analysis/tracer.py ADDED
@@ -0,0 +1,487 @@
1
+ """
2
+ Cascade Analysis - Bidirectional Causation Tracer.
3
+
4
+ Trace cause-effect chains forwards and backwards through time.
5
+ Find root causes. Predict cascading effects.
6
+ """
7
+
8
+ from typing import List, Dict, Any, Optional, Set
9
+ from collections import deque
10
+ from dataclasses import dataclass, field
11
+
12
+ from cascade.core.event import Event, CausationLink, CausationChain
13
+ from cascade.core.graph import CausationGraph
14
+
15
+
16
+ @dataclass
17
+ class RootCauseAnalysis:
18
+ """Results of a root cause analysis."""
19
+ target_event: Event
20
+ root_causes: List[Event]
21
+ chains: List[CausationChain]
22
+ deepest_depth: int = 0
23
+ narrative: str = ""
24
+
25
+
26
+ @dataclass
27
+ class ImpactAnalysis:
28
+ """Results of an impact/forward analysis."""
29
+ source_event: Event
30
+ effects: List[Event]
31
+ chains: List[CausationChain]
32
+ total_impact_count: int = 0
33
+ severity_score: float = 0.0
34
+ narrative: str = ""
35
+
36
+
37
+ @dataclass
38
+ class CascadePrediction:
39
+ """Prediction of likely cascade from an event."""
40
+ source_event: Event
41
+ predicted_effects: List[Dict[str, Any]] # [{event_type, probability, time_estimate}, ...]
42
+ risk_score: float = 0.0
43
+ intervention_points: List[str] = field(default_factory=list)
44
+ narrative: str = ""
45
+
46
+
47
+ class Tracer:
48
+ """
49
+ Bidirectional causation tracer.
50
+
51
+ Traces cause-effect chains through the causation graph:
52
+ - Backwards: "What caused this?" → find root causes
53
+ - Forwards: "What will this cause?" → predict cascades
54
+
55
+ Example:
56
+ >>> tracer = Tracer(graph)
57
+ >>>
58
+ >>> # What caused this gradient explosion?
59
+ >>> causes = tracer.trace_backwards("evt_123")
60
+ >>>
61
+ >>> # What will this learning rate change cause?
62
+ >>> effects = tracer.trace_forwards("evt_456")
63
+ >>>
64
+ >>> # Deep root cause analysis
65
+ >>> roots = tracer.find_root_causes("evt_789")
66
+ """
67
+
68
+ def __init__(self, graph: CausationGraph):
69
+ """
70
+ Initialize tracer with a causation graph.
71
+
72
+ Args:
73
+ graph: The causation graph to trace through
74
+ """
75
+ self.graph = graph
76
+ self._prediction_model = None # Future: ML model for predictions
77
+
78
+ def trace_backwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
79
+ """
80
+ Trace causation backwards: what caused this event?
81
+
82
+ Args:
83
+ event_id: ID of the event to trace from
84
+ max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)
85
+
86
+ Returns:
87
+ List of CausationChain objects, one per causal path found
88
+ """
89
+ target = self.graph.get_event(event_id)
90
+ if not target:
91
+ return []
92
+
93
+ chains = []
94
+ self._trace_backwards_recursive(event_id, [], [], max_depth, chains)
95
+
96
+ # Sort by depth (longest chain first for root cause analysis)
97
+ chains.sort(key=lambda c: c.depth, reverse=True)
98
+ return chains
99
+
100
+ def _trace_backwards_recursive(
101
+ self,
102
+ current_id: str,
103
+ current_events: List[Event],
104
+ current_links: List[CausationLink],
105
+ depth_remaining: int,
106
+ results: List[CausationChain],
107
+ visited: Optional[Set[str]] = None
108
+ ) -> None:
109
+ """Recursive helper for backwards tracing."""
110
+ if visited is None:
111
+ visited = set()
112
+
113
+ if current_id in visited:
114
+ return # Avoid cycles
115
+ visited.add(current_id)
116
+
117
+ current_event = self.graph.get_event(current_id)
118
+ if not current_event:
119
+ return
120
+
121
+ current_events = [current_event] + current_events
122
+
123
+ if depth_remaining <= 0:
124
+ # Max depth reached, record this chain
125
+ if len(current_events) > 1:
126
+ results.append(self._build_chain(current_events, current_links))
127
+ return
128
+
129
+ causes = self.graph.get_causes(current_id)
130
+
131
+ if not causes:
132
+ # This is a root - record the chain
133
+ if len(current_events) >= 1:
134
+ results.append(self._build_chain(current_events, current_links))
135
+ return
136
+
137
+ for cause in causes:
138
+ link = self.graph.get_link(cause.event_id, current_id)
139
+ new_links = [link] + current_links if link else current_links
140
+
141
+ self._trace_backwards_recursive(
142
+ cause.event_id,
143
+ current_events,
144
+ new_links,
145
+ depth_remaining - 1,
146
+ results,
147
+ visited.copy()
148
+ )
149
+
150
+ def trace_forwards(self, event_id: str, max_depth: int = 1000) -> List[CausationChain]:
151
+ """
152
+ Trace causation forwards: what will this event cause?
153
+
154
+ Args:
155
+ event_id: ID of the event to trace from
156
+ max_depth: Maximum depth to trace (default: 1000 - effectively unlimited)
157
+
158
+ Returns:
159
+ List of CausationChain objects, one per effect path found
160
+ """
161
+ source = self.graph.get_event(event_id)
162
+ if not source:
163
+ return []
164
+
165
+ chains = []
166
+ self._trace_forwards_recursive(event_id, [], [], max_depth, chains)
167
+
168
+ # Sort by depth
169
+ chains.sort(key=lambda c: c.depth, reverse=True)
170
+ return chains
171
+
172
+ def _trace_forwards_recursive(
173
+ self,
174
+ current_id: str,
175
+ current_events: List[Event],
176
+ current_links: List[CausationLink],
177
+ depth_remaining: int,
178
+ results: List[CausationChain],
179
+ visited: Optional[Set[str]] = None
180
+ ) -> None:
181
+ """Recursive helper for forwards tracing."""
182
+ if visited is None:
183
+ visited = set()
184
+
185
+ if current_id in visited:
186
+ return
187
+ visited.add(current_id)
188
+
189
+ current_event = self.graph.get_event(current_id)
190
+ if not current_event:
191
+ return
192
+
193
+ current_events = current_events + [current_event]
194
+
195
+ if depth_remaining <= 0:
196
+ if len(current_events) > 1:
197
+ results.append(self._build_chain(current_events, current_links))
198
+ return
199
+
200
+ effects = self.graph.get_effects(current_id)
201
+
202
+ if not effects:
203
+ # This is a leaf - record the chain
204
+ if len(current_events) >= 1:
205
+ results.append(self._build_chain(current_events, current_links))
206
+ return
207
+
208
+ for effect in effects:
209
+ link = self.graph.get_link(current_id, effect.event_id)
210
+ new_links = current_links + [link] if link else current_links
211
+
212
+ self._trace_forwards_recursive(
213
+ effect.event_id,
214
+ current_events,
215
+ new_links,
216
+ depth_remaining - 1,
217
+ results,
218
+ visited.copy()
219
+ )
220
+
221
+ def find_root_causes(self, event_id: str, max_depth: int = 1000) -> RootCauseAnalysis:
222
+ """
223
+ Deep root cause analysis: find the ultimate origins.
224
+
225
+ Traces all the way back to find events with no causes.
226
+
227
+ Args:
228
+ event_id: ID of the event to analyze
229
+ max_depth: Maximum depth to search (default: 1000 - effectively unlimited)
230
+
231
+ Returns:
232
+ RootCauseAnalysis with root causes and narrative
233
+ """
234
+ target = self.graph.get_event(event_id)
235
+ if not target:
236
+ return RootCauseAnalysis(
237
+ target_event=None,
238
+ root_causes=[],
239
+ chains=[],
240
+ )
241
+
242
+ chains = self.trace_backwards(event_id, max_depth)
243
+
244
+ # Extract root causes (events at the start of chains)
245
+ root_causes = []
246
+ seen = set()
247
+ for chain in chains:
248
+ if chain.events:
249
+ root = chain.events[0]
250
+ if root.event_id not in seen:
251
+ root_causes.append(root)
252
+ seen.add(root.event_id)
253
+
254
+ # Build narrative
255
+ narrative = self._build_root_cause_narrative(target, root_causes, chains)
256
+
257
+ return RootCauseAnalysis(
258
+ target_event=target,
259
+ root_causes=root_causes,
260
+ chains=chains,
261
+ deepest_depth=max(c.depth for c in chains) if chains else 0,
262
+ narrative=narrative,
263
+ )
264
+
265
+ def analyze_impact(self, event_id: str, max_depth: int = 1000) -> ImpactAnalysis:
266
+ """
267
+ Impact analysis: what were ALL downstream effects?
268
+
269
+ Traces forward to find everything this event set in motion.
270
+
271
+ Args:
272
+ event_id: ID of the event to analyze
273
+ max_depth: Maximum depth to search (default: 1000 - effectively unlimited)
274
+
275
+ Returns:
276
+ ImpactAnalysis with effects and severity score
277
+ """
278
+ source = self.graph.get_event(event_id)
279
+ if not source:
280
+ return ImpactAnalysis(
281
+ source_event=None,
282
+ effects=[],
283
+ chains=[],
284
+ )
285
+
286
+ chains = self.trace_forwards(event_id, max_depth)
287
+
288
+ # Extract all effects
289
+ effects = []
290
+ seen = set()
291
+ for chain in chains:
292
+ for event in chain.events[1:]: # Skip source
293
+ if event.event_id not in seen:
294
+ effects.append(event)
295
+ seen.add(event.event_id)
296
+
297
+ # Calculate severity
298
+ severity = self._calculate_impact_severity(source, effects)
299
+
300
+ # Build narrative
301
+ narrative = self._build_impact_narrative(source, effects, chains)
302
+
303
+ return ImpactAnalysis(
304
+ source_event=source,
305
+ effects=effects,
306
+ chains=chains,
307
+ total_impact_count=len(effects),
308
+ severity_score=severity,
309
+ narrative=narrative,
310
+ )
311
+
312
+ def predict_cascade(self, event_id: str) -> CascadePrediction:
313
+ """
314
+ Predict likely cascade from this event.
315
+
316
+ Uses learned patterns to forecast effects BEFORE they happen.
317
+ This is the "Minority Report" capability.
318
+
319
+ Args:
320
+ event_id: ID of the event to predict from
321
+
322
+ Returns:
323
+ CascadePrediction with risk scores and intervention points
324
+ """
325
+ source = self.graph.get_event(event_id)
326
+ if not source:
327
+ return CascadePrediction(
328
+ source_event=None,
329
+ predicted_effects=[],
330
+ )
331
+
332
+ # Get historical patterns for this event type
333
+ similar_events = self.graph.get_events_by_type(source.event_type)
334
+
335
+ # Count what typically follows - use all available history for better predictions
336
+ # No artificial cap - system learns from full history
337
+ effect_counts: Dict[str, int] = {}
338
+ analysis_window = similar_events # Full history, no slice
339
+ for similar in analysis_window:
340
+ effects = self.graph.get_effects(similar.event_id)
341
+ for effect in effects:
342
+ key = effect.event_type
343
+ effect_counts[key] = effect_counts.get(key, 0) + 1
344
+
345
+ # Convert to predictions
346
+ total = len(analysis_window)
347
+ predictions = []
348
+ for event_type, count in sorted(effect_counts.items(), key=lambda x: -x[1]):
349
+ predictions.append({
350
+ "event_type": event_type,
351
+ "probability": count / total if total > 0 else 0,
352
+ "historical_count": count,
353
+ })
354
+
355
+ # Calculate risk score
356
+ risk_score = self._calculate_risk_score(source, predictions)
357
+
358
+ # Identify intervention points
359
+ intervention_points = self._find_intervention_points(source, predictions)
360
+
361
+ return CascadePrediction(
362
+ source_event=source,
363
+ predicted_effects=predictions[:10], # Top 10
364
+ risk_score=risk_score,
365
+ intervention_points=intervention_points,
366
+ narrative=f"Based on {total} similar events, predicting {len(predictions)} likely effects.",
367
+ )
368
+
369
+ def _build_chain(self, events: List[Event], links: List[CausationLink]) -> CausationChain:
370
+ """Build a CausationChain from events and links."""
371
+ total_strength = 1.0
372
+ for link in links:
373
+ total_strength *= link.strength
374
+
375
+ return CausationChain(
376
+ events=events,
377
+ links=links,
378
+ total_strength=total_strength,
379
+ depth=len(links),
380
+ )
381
+
382
+ def _build_root_cause_narrative(
383
+ self,
384
+ target: Event,
385
+ roots: List[Event],
386
+ chains: List[CausationChain]
387
+ ) -> str:
388
+ """Build human-readable narrative for root cause analysis."""
389
+ if not roots:
390
+ return f"No root causes found for {target.event_type}"
391
+
392
+ lines = [f"Root cause analysis for {target.event_type}:"]
393
+ lines.append(f"Found {len(roots)} root cause(s) across {len(chains)} causal chain(s).")
394
+ lines.append("")
395
+
396
+ for i, root in enumerate(roots[:5], 1): # Top 5
397
+ lines.append(f"{i}. {root.component}/{root.event_type}")
398
+ if root.data:
399
+ key_data = list(root.data.items())[:3]
400
+ lines.append(f" Data: {dict(key_data)}")
401
+
402
+ return "\n".join(lines)
403
+
404
+ def _build_impact_narrative(
405
+ self,
406
+ source: Event,
407
+ effects: List[Event],
408
+ chains: List[CausationChain]
409
+ ) -> str:
410
+ """Build human-readable narrative for impact analysis."""
411
+ if not effects:
412
+ return f"No downstream effects found for {source.event_type}"
413
+
414
+ lines = [f"Impact analysis for {source.event_type}:"]
415
+ lines.append(f"Found {len(effects)} downstream effect(s).")
416
+ lines.append("")
417
+
418
+ # Group by event type
419
+ by_type: Dict[str, int] = {}
420
+ for effect in effects:
421
+ by_type[effect.event_type] = by_type.get(effect.event_type, 0) + 1
422
+
423
+ for event_type, count in sorted(by_type.items(), key=lambda x: -x[1]):
424
+ lines.append(f" • {event_type}: {count} occurrence(s)")
425
+
426
+ return "\n".join(lines)
427
+
428
+ def _calculate_impact_severity(self, source: Event, effects: List[Event]) -> float:
429
+ """Calculate severity score for an impact (0.0 to 1.0)."""
430
+ if not effects:
431
+ return 0.0
432
+
433
+ # Factors: number of effects, types of effects
434
+ count_score = min(1.0, len(effects) / 20) # 20+ effects = max
435
+
436
+ # High-severity event types
437
+ severe_types = {'error', 'anomaly', 'crash', 'failure', 'explosion'}
438
+ severe_count = sum(1 for e in effects if e.event_type in severe_types)
439
+ severity_score = min(1.0, severe_count / 5)
440
+
441
+ return (count_score + severity_score) / 2
442
+
443
+ def _calculate_risk_score(
444
+ self,
445
+ source: Event,
446
+ predictions: List[Dict[str, Any]]
447
+ ) -> float:
448
+ """Calculate risk score for a cascade prediction."""
449
+ if not predictions:
450
+ return 0.0
451
+
452
+ # High-risk event types
453
+ risky_types = {'error', 'anomaly', 'crash', 'failure', 'explosion', 'nan', 'overflow'}
454
+
455
+ risk = 0.0
456
+ for pred in predictions:
457
+ if pred["event_type"] in risky_types:
458
+ risk += pred["probability"] * 2 # Double weight for risky
459
+ else:
460
+ risk += pred["probability"] * 0.5
461
+
462
+ return min(1.0, risk)
463
+
464
+ def _find_intervention_points(
465
+ self,
466
+ source: Event,
467
+ predictions: List[Dict[str, Any]]
468
+ ) -> List[str]:
469
+ """Identify points where intervention could prevent bad cascades."""
470
+ points = []
471
+
472
+ # Look at source event data for intervention hints
473
+ if 'learning_rate' in source.data:
474
+ points.append("Reduce learning rate")
475
+ if 'gradient' in source.event_type.lower():
476
+ points.append("Apply gradient clipping")
477
+ if source.data.get('loss', 0) > 10:
478
+ points.append("Check loss function / data")
479
+
480
+ # Check predictions for severe outcomes
481
+ for pred in predictions:
482
+ if pred["event_type"] == "nan" and pred["probability"] > 0.3:
483
+ points.append("Enable NaN detection early stopping")
484
+ if pred["event_type"] == "overflow" and pred["probability"] > 0.3:
485
+ points.append("Apply gradient scaling")
486
+
487
+ return points
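Before moving on to the next module, here is a short sketch of how the three analysis entry points above fit together. It assumes `graph` is a populated `CausationGraph` and that `"evt_789"` is an event id present in it; both are placeholders, not values from this repository.

```python
# Sketch: root-cause, impact, and cascade-prediction analyses from above.
# `graph` and "evt_789" are placeholder assumptions.
from cascade.analysis.tracer import Tracer

tracer = Tracer(graph)

roots = tracer.find_root_causes("evt_789")
print(roots.narrative)              # human-readable root-cause summary
print(roots.deepest_depth)          # depth of the longest causal chain found

impact = tracer.analyze_impact("evt_789")
print(impact.total_impact_count, impact.severity_score)

forecast = tracer.predict_cascade("evt_789")
for pred in forecast.predicted_effects:      # at most the top 10
    print(pred["event_type"], pred["probability"], pred["historical_count"])
print(forecast.intervention_points)          # e.g. ["Apply gradient clipping"]
```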
cascade/bridge.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ HuggingFace → IPFS Bridge
3
+
4
+ Makes every CASCADE instance a node in the IPFS network.
5
+ Serves lattice content to DHT without running a full daemon.
6
+
7
+ Uses js-ipfs HTTP API compatible endpoints via ipfs-http-client.
8
+ For HF Spaces, we use Helia (browser/Node IPFS) style serving.
9
+ """
10
+
11
+ import json
12
+ import hashlib
13
+ from pathlib import Path
14
+ from typing import Optional, Dict, Any
15
+ import threading
16
+ import time
17
+
18
+ # Optional: for full IPFS integration
19
+ try:
20
+ import ipfshttpclient
21
+ HAS_IPFS_CLIENT = True
22
+ except ImportError:
23
+ HAS_IPFS_CLIENT = False
24
+
25
+ from cascade.ipld import chain_to_ipld, chain_to_cid, encode_to_dag_cbor
26
+
27
+
28
+ class LatticeServer:
29
+ """
30
+ Serves lattice content over IPFS-compatible protocols.
31
+
32
+ Can run in multiple modes:
33
+ 1. Gateway mode: HTTP endpoints that mirror IPFS gateway API
34
+ 2. DHT mode: Announce content to IPFS DHT (needs daemon)
35
+ 3. Hybrid: Both
36
+ """
37
+
38
+ def __init__(self, lattice_dir: Path = None):
39
+ if lattice_dir is None:
40
+ # Try relative to this file first, then cwd
41
+ candidate = Path(__file__).resolve().parent.parent / "lattice"
42
+ if not candidate.exists():
43
+ candidate = Path.cwd() / "lattice"
44
+ self.lattice_dir = candidate
45
+ else:
46
+ self.lattice_dir = lattice_dir
47
+ self.ipld_dir = self.lattice_dir / "ipld"
48
+ self._index: Dict[str, Path] = {} # CID -> file path
49
+ self._build_index()
50
+
51
+ def _build_index(self):
52
+ """Index all known CIDs to their local files."""
53
+ # Index CBOR files
54
+ if self.ipld_dir.exists():
55
+ for cbor_file in self.ipld_dir.glob("*.cbor"):
56
+ ipld_json = cbor_file.with_suffix(".ipld.json")
57
+ if ipld_json.exists():
58
+ meta = json.loads(ipld_json.read_text())
59
+ # Try both 'cid' and '_cid' keys
60
+ cid = meta.get("cid") or meta.get("_cid")
61
+ if cid:
62
+ self._index[cid] = cbor_file
63
+
64
+ # Index JSON chain files (compute CID on the fly)
65
+ for json_file in self.lattice_dir.glob("*.json"):
66
+ if json_file.name == "README.md":
67
+ continue
68
+ try:
69
+ chain_data = json.loads(json_file.read_text())
70
+ cid = chain_to_cid(chain_data)
71
+ self._index[cid] = json_file
72
+ except:
73
+ pass
74
+
75
+ print(f"Indexed {len(self._index)} CIDs")
76
+
77
+ def resolve(self, cid: str) -> Optional[bytes]:
78
+ """Resolve a CID to its content."""
79
+ if cid in self._index:
80
+ filepath = self._index[cid]
81
+ if filepath.suffix == ".cbor":
82
+ return filepath.read_bytes()
83
+ else:
84
+ # JSON file - return as CBOR for consistency
85
+ chain_data = json.loads(filepath.read_text())
86
+ ipld_data = chain_to_ipld(chain_data)
87
+ return encode_to_dag_cbor(ipld_data)
88
+ return None
89
+
90
+ def list_cids(self) -> list:
91
+ """List all available CIDs."""
92
+ return list(self._index.keys())
93
+
94
+ def get_gateway_response(self, cid: str) -> tuple:
95
+ """
96
+ Return (content, content_type, status_code) for gateway-style serving.
97
+ """
98
+ content = self.resolve(cid)
99
+ if content:
100
+ return (content, "application/cbor", 200)
101
+ return (b"CID not found", "text/plain", 404)
102
+
103
+ def announce_to_dht(self, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001"):
104
+ """
105
+ Announce all CIDs to IPFS DHT.
106
+ Requires running IPFS daemon.
107
+ """
108
+ if not HAS_IPFS_CLIENT:
109
+ print("ipfshttpclient not installed. Run: pip install ipfshttpclient")
110
+ return
111
+
112
+ try:
113
+ client = ipfshttpclient.connect(ipfs_api)
114
+ except Exception as e:
115
+ print(f"Could not connect to IPFS daemon: {e}")
116
+ print("Start daemon with: ipfs daemon")
117
+ return
118
+
119
+ for cid, filepath in self._index.items():
120
+ try:
121
+ # Add file to local IPFS node
122
+ if filepath.suffix == ".cbor":
123
+ result = client.add(str(filepath))
124
+ print(f"Announced {filepath.name}: {result['Hash']}")
125
+ except Exception as e:
126
+ print(f"Failed to announce {cid}: {e}")
127
+
128
+ def start_gateway(self, host: str = "0.0.0.0", port: int = 8080):
129
+ """
130
+ Start a simple HTTP gateway for serving lattice content.
131
+
132
+ Compatible with IPFS gateway URL format:
133
+ GET /ipfs/{cid}
134
+ """
135
+ from http.server import HTTPServer, BaseHTTPRequestHandler
136
+
137
+ server = self
138
+
139
+ class GatewayHandler(BaseHTTPRequestHandler):
140
+ def do_GET(self):
141
+ # Parse /ipfs/{cid} or just /{cid}
142
+ path = self.path.strip("/")
143
+ if path.startswith("ipfs/"):
144
+ cid = path[5:]
145
+ else:
146
+ cid = path
147
+
148
+ content, content_type, status = server.get_gateway_response(cid)
149
+
150
+ self.send_response(status)
151
+ self.send_header("Content-Type", content_type)
152
+ self.send_header("Content-Length", len(content))
153
+ self.send_header("Access-Control-Allow-Origin", "*")
154
+ self.end_headers()
155
+ self.wfile.write(content)
156
+
157
+ def do_HEAD(self):
158
+ path = self.path.strip("/")
159
+ if path.startswith("ipfs/"):
160
+ cid = path[5:]
161
+ else:
162
+ cid = path
163
+
164
+ _, content_type, status = server.get_gateway_response(cid)
165
+
166
+ self.send_response(status)
167
+ self.send_header("Content-Type", content_type)
168
+ self.send_header("Access-Control-Allow-Origin", "*")
169
+ self.end_headers()
170
+
171
+ def log_message(self, format, *args):
172
+ print(f"[Gateway] {args[0]}")
173
+
174
+ httpd = HTTPServer((host, port), GatewayHandler)
175
+ print(f"Lattice gateway running at http://{host}:{port}")
176
+ print(f"Serving {len(self._index)} CIDs")
177
+ print(f"\nTry: http://localhost:{port}/ipfs/bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei")
178
+ httpd.serve_forever()
179
+
180
+
181
+ def create_gradio_gateway():
182
+ """
183
+ Create a Gradio interface that serves as IPFS gateway.
184
+ Suitable for HuggingFace Spaces deployment.
185
+ """
186
+ try:
187
+ import gradio as gr
188
+ except ImportError:
189
+ print("Gradio not installed. Run: pip install gradio")
190
+ return None
191
+
192
+ server = LatticeServer()
193
+
194
+ def resolve_cid(cid: str) -> str:
195
+ """Resolve CID and return content as hex + JSON decode attempt."""
196
+ content = server.resolve(cid.strip())
197
+ if content is None:
198
+ return f"❌ CID not found: {cid}\n\nAvailable CIDs:\n" + "\n".join(server.list_cids())
199
+
200
+ # Try to decode as CBOR → JSON for display
201
+ try:
202
+ import dag_cbor
203
+ decoded = dag_cbor.decode(content)
204
+ return f"✓ Found! ({len(content)} bytes)\n\n{json.dumps(decoded, indent=2, default=str)}"
205
+ except:
206
+ return f"✓ Found! ({len(content)} bytes)\n\nRaw hex: {content.hex()[:200]}..."
207
+
208
+ def list_all() -> str:
209
+ """List all available CIDs."""
210
+ cids = server.list_cids()
211
+ lines = [f"=== Lattice Index ({len(cids)} chains) ===\n"]
212
+ for cid in cids:
213
+ filepath = server._index[cid]
214
+ lines.append(f"• {filepath.stem}")
215
+ lines.append(f" {cid}\n")
216
+ return "\n".join(lines)
217
+
218
+ with gr.Blocks(title="CASCADE Lattice Gateway") as app:
219
+ gr.Markdown("# 🌐 CASCADE Lattice Gateway")
220
+ gr.Markdown("*The neural internetwork, content-addressed.*")
221
+
222
+ with gr.Tab("Resolve CID"):
223
+ cid_input = gr.Textbox(
224
+ label="CID",
225
+ placeholder="bafyrei...",
226
+ value="bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"
227
+ )
228
+ resolve_btn = gr.Button("Resolve")
229
+ output = gr.Textbox(label="Content", lines=20)
230
+ resolve_btn.click(resolve_cid, inputs=cid_input, outputs=output)
231
+
232
+ with gr.Tab("Browse Lattice"):
233
+ list_btn = gr.Button("List All CIDs")
234
+ list_output = gr.Textbox(label="Available Chains", lines=20)
235
+ list_btn.click(list_all, outputs=list_output)
236
+
237
+ gr.Markdown("""
238
+ ---
239
+ **What is this?**
240
+
241
+ This gateway serves the CASCADE lattice — a cryptographic provenance network for AI agents.
242
+
243
+ Every chain has a CID (Content IDentifier). Same content = same CID. Forever.
244
+
245
+ - **Genesis**: `bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei`
246
+ - Protocol: [IPLD](https://ipld.io/) (InterPlanetary Linked Data)
247
+ """)
248
+
249
+ return app
250
+
251
+
252
+ if __name__ == "__main__":
253
+ import sys
254
+
255
+ if "--gradio" in sys.argv:
256
+ app = create_gradio_gateway()
257
+ if app:
258
+ app.launch()
259
+ elif "--announce" in sys.argv:
260
+ server = LatticeServer()
261
+ server.announce_to_dht()
262
+ else:
263
+ # Default: run HTTP gateway
264
+ server = LatticeServer()
265
+ server.start_gateway(port=8080)
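The gateway above serves `GET /ipfs/{cid}` with DAG-CBOR bytes. A small client-side sketch, assuming the gateway is already running locally (for example via `python -m cascade.bridge`, which defaults to port 8080) and using the genesis CID that the module itself prints:

```python
# Sketch: fetch one chain from a locally running LatticeServer gateway.
# Assumes the gateway was started separately and is listening on :8080.
import urllib.request

cid = "bafyreidixjlzdat7ex72foi6vm3vnskhzguovxj6ondbazrqks7v6ahmei"  # genesis
with urllib.request.urlopen(f"http://localhost:8080/ipfs/{cid}") as resp:
    content_type = resp.headers.get("Content-Type")  # "application/cbor"
    payload = resp.read()

print(content_type, len(payload), "bytes")

# Decode if the optional dag_cbor package is installed, mirroring the
# Gradio resolver above; otherwise just show a hex preview.
try:
    import dag_cbor
    print(dag_cbor.decode(payload))
except ImportError:
    print(payload.hex()[:80], "...")
```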
cascade/cli_main.py ADDED
@@ -0,0 +1,851 @@
1
+ """
2
+ CASCADE CLI - Full-featured Rich TUI for cascade-ai.
3
+
4
+ Exposes all CASCADE capabilities:
5
+ - Lattice: stats, list, inspect, chains, pin, export, watch
6
+ - Model: observe, fingerprint
7
+ - Data: entities, provenance, pii scan
8
+ - System: logs, analyze, ingest
9
+ - Proxy: start intercepting proxy
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ import json
15
+ from pathlib import Path
16
+ from datetime import datetime
17
+
18
+ # Rich imports with fallback
19
+ try:
20
+ from rich.console import Console
21
+ from rich.table import Table
22
+ from rich.panel import Panel
23
+ from rich.tree import Tree
24
+ from rich.progress import Progress, SpinnerColumn, TextColumn
25
+ from rich.text import Text
26
+ from rich.markdown import Markdown
27
+ from rich.syntax import Syntax
28
+ from rich import box
29
+ HAS_RICH = True
30
+ except ImportError:
31
+ HAS_RICH = False
32
+
33
+ console = Console() if HAS_RICH else None
34
+
35
+
36
+ # ═══════════════════════════════════════════════════════════════════════════════
37
+ # LATTICE COMMANDS
38
+ # ═══════════════════════════════════════════════════════════════════════════════
39
+
40
+ def cmd_stats(args):
41
+ """Show lattice statistics with Rich panels."""
42
+ from cascade.observation import ObservationManager
43
+
44
+ manager = ObservationManager()
45
+ stats = manager.get_stats()
46
+
47
+ if HAS_RICH:
48
+ stats_table = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
49
+ stats_table.add_column("Key", style="cyan")
50
+ stats_table.add_column("Value", style="green")
51
+
52
+ stats_table.add_row("Genesis Root", f"[bold magenta]{stats['genesis_root']}[/]")
53
+ stats_table.add_row("", "")
54
+ stats_table.add_row("Total Observations", str(stats['total_observations']))
55
+ stats_table.add_row(" └─ Model", str(stats['model_observations']))
56
+ stats_table.add_row(" └─ Data", str(stats['data_observations']))
57
+ stats_table.add_row(" └─ System", str(stats['system_observations']))
58
+ stats_table.add_row("", "")
59
+ stats_table.add_row("Registered Models", str(stats['registered_models']))
60
+ stats_table.add_row("Unique Models Observed", str(stats['unique_models']))
61
+
62
+ panel = Panel(
63
+ stats_table,
64
+ title="[bold cyan]CASCADE LATTICE[/]",
65
+ subtitle="[dim]The Neural Internetwork[/]",
66
+ border_style="cyan",
67
+ )
68
+ console.print(panel)
69
+ else:
70
+ print(f"""
71
+ CASCADE LATTICE STATS
72
+ ═════════════════════
73
+ Genesis Root: {stats['genesis_root']}
74
+
75
+ Observations:
76
+ Total: {stats['total_observations']}
77
+ Model: {stats['model_observations']}
78
+ Data: {stats['data_observations']}
79
+ System: {stats['system_observations']}
80
+
81
+ Models:
82
+ Registered: {stats['registered_models']}
83
+ Observed: {stats['unique_models']}
84
+ """)
85
+
86
+
87
+ def cmd_list(args):
88
+ """List recent observations."""
89
+ from cascade.observation import ObservationManager
90
+
91
+ manager = ObservationManager()
92
+ observations = manager.list_observations(limit=args.limit)
93
+
94
+ if not observations:
95
+ if HAS_RICH:
96
+ console.print("[yellow]No observations yet.[/]")
97
+ else:
98
+ print("No observations yet.")
99
+ return
100
+
101
+ if HAS_RICH:
102
+ table = Table(title=f"Recent Observations", box=box.ROUNDED)
103
+ table.add_column("Type", style="cyan", width=8)
104
+ table.add_column("Source", style="white", max_width=40)
105
+ table.add_column("Merkle Root", style="magenta")
106
+ table.add_column("Time", style="dim")
107
+
108
+ for obs in observations:
109
+ obs_type = obs.get('observation_type', '?')[:7]
110
+ source = obs.get('source_id', 'unknown')[:39]
111
+ merkle = obs.get('merkle_root', '?')[:16]
112
+ timestamp = obs.get('timestamp', '')
113
+ if timestamp:
114
+ try:
115
+ if isinstance(timestamp, (int, float)):
116
+ timestamp = datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')
117
+ else:
118
+ timestamp = str(timestamp)[:8]
119
+ except Exception:
120
+ timestamp = '?'
121
+
122
+ table.add_row(obs_type, source, merkle, timestamp)
123
+
124
+ console.print(table)
125
+ console.print(f"[dim]Showing {len(observations)} of {manager.get_stats()['total_observations']}[/]")
126
+ else:
127
+ print(f"\n{'TYPE':<8} {'SOURCE':<40} {'MERKLE ROOT':<20}")
128
+ print("─" * 70)
129
+ for obs in observations:
130
+ print(f"{obs.get('observation_type', '?')[:7]:<8} {obs.get('source_id', '?')[:39]:<40} {obs.get('merkle_root', '?')[:19]:<20}")
131
+
132
+
133
+ def cmd_inspect(args):
134
+ """Inspect a specific observation by merkle root."""
135
+ from cascade.observation import ObservationManager
136
+
137
+ manager = ObservationManager()
138
+ obs = manager.get_observation(args.root)
139
+
140
+ if not obs:
141
+ if HAS_RICH:
142
+ console.print(f"[red]Observation not found:[/] {args.root}")
143
+ else:
144
+ print(f"Observation not found: {args.root}")
145
+ return
146
+
147
+ if HAS_RICH:
148
+ tree = Tree(f"[bold magenta]{args.root}[/]")
149
+
150
+ for key, value in obs.items():
151
+ if isinstance(value, dict):
152
+ branch = tree.add(f"[cyan]{key}[/]")
153
+ for k, v in value.items():
154
+ branch.add(f"[dim]{k}:[/] {v}")
155
+ elif isinstance(value, list):
156
+ branch = tree.add(f"[cyan]{key}[/] ({len(value)} items)")
157
+ for item in value[:5]:
158
+ branch.add(str(item)[:60])
159
+ if len(value) > 5:
160
+ branch.add(f"[dim]... and {len(value) - 5} more[/]")
161
+ else:
162
+ tree.add(f"[cyan]{key}:[/] {value}")
163
+
164
+ console.print(Panel(tree, title="Observation Details", border_style="magenta"))
165
+ else:
166
+ print(json.dumps(obs, indent=2, default=str))
167
+
168
+
169
+ def cmd_chains(args):
170
+ """List all chains in the lattice."""
171
+ from cascade.viz.lattice_gateway import load_lattice_data
172
+
173
+ data = load_lattice_data()
174
+ chains = data.get('chains', [])
175
+
176
+ if HAS_RICH:
177
+ table = Table(title="Lattice Chains", box=box.ROUNDED)
178
+ table.add_column("Name", style="cyan")
179
+ table.add_column("Merkle Root", style="magenta")
180
+ table.add_column("Records", justify="right")
181
+ table.add_column("CID", style="dim")
182
+
183
+ for chain in chains:
184
+ name = chain.get('name', '?')
185
+ root = chain.get('merkle_root', '?')[:16]
186
+ records = len(chain.get('records', {}))
187
+ cid = chain.get('cid', 'Not pinned')
188
+ if cid and cid != 'Not pinned':
189
+ cid = cid[:20] + '...'
190
+
191
+ style = "bold green" if name == 'genesis' else None
192
+ table.add_row(name, root, str(records), cid, style=style)
193
+
194
+ console.print(table)
195
+ console.print(f"\n[dim]Genesis: {data.get('genesis_root', 'N/A')}[/]")
196
+ else:
197
+ print(f"Chains in lattice: {len(chains)}")
198
+ for chain in chains:
199
+ print(f" {chain.get('name')}: {chain.get('merkle_root', '?')[:16]} ({len(chain.get('records', {}))} records)")
200
+
201
+
202
+ def cmd_pin(args):
203
+ """Pin observation to IPFS."""
204
+ from cascade.observation import ObservationManager
205
+
206
+ manager = ObservationManager()
207
+ obs = manager.get_observation(args.root)
208
+
209
+ if not obs:
210
+ if HAS_RICH:
211
+ console.print(f"[red]Observation not found:[/] {args.root}")
212
+ else:
213
+ print(f"Observation not found: {args.root}")
214
+ return
215
+
216
+ if HAS_RICH:
217
+ with console.status("[cyan]Pinning to IPFS...[/]"):
218
+ cid = manager.pin_to_ipfs(obs)
219
+
220
+ if cid:
221
+ console.print(f"[green]✓ Pinned to IPFS[/]")
222
+ console.print(f" CID: [magenta]{cid}[/]")
223
+ console.print(f" URL: https://storacha.link/ipfs/{cid}")
224
+ else:
225
+ console.print("[red]✗ Failed to pin[/]")
226
+ else:
227
+ print(f"Pinning {args.root}...")
228
+ cid = manager.pin_to_ipfs(obs)
229
+ if cid:
230
+ print(f"✓ Pinned: {cid}")
231
+ else:
232
+ print("✗ Failed")
233
+
234
+
235
+ def cmd_export(args):
236
+ """Export lattice or chain to file."""
237
+ from cascade.viz.lattice_gateway import load_lattice_data
238
+
239
+ data = load_lattice_data()
240
+
241
+ if args.chain:
242
+ chains = [c for c in data.get('chains', []) if c['name'] == args.chain]
243
+ if not chains:
244
+ msg = f"Chain not found: {args.chain}"
245
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
246
+ return
247
+ export_data = chains[0]
248
+ else:
249
+ export_data = data
250
+
251
+ output = Path(args.output)
252
+ output.write_text(json.dumps(export_data, indent=2, default=str))
253
+
254
+ msg = f"✓ Exported to {output}"
255
+ console.print(f"[green]{msg}[/]") if HAS_RICH else print(msg)
256
+
257
+
258
+ def cmd_watch(args):
259
+ """Watch live observations in real-time."""
260
+ from cascade.observation import ObservationManager
261
+ import time
262
+
263
+ manager = ObservationManager()
264
+ last_count = 0
265
+
266
+ if HAS_RICH:
267
+ console.print("[cyan]Watching for observations... (Ctrl+C to stop)[/]\n")
268
+ else:
269
+ print("Watching... (Ctrl+C to stop)")
270
+
271
+ try:
272
+ while True:
273
+ stats = manager.get_stats()
274
+ current = stats['total_observations']
275
+
276
+ if current > last_count:
277
+ new_obs = manager.list_observations(limit=current - last_count)
278
+ for obs in reversed(new_obs):
279
+ if HAS_RICH:
280
+ console.print(
281
+ f"[green]●[/] [{datetime.now().strftime('%H:%M:%S')}] "
282
+ f"[cyan]{obs.get('observation_type', '?')}[/] "
283
+ f"[white]{obs.get('source_id', '?')[:40]}[/] "
284
+ f"[magenta]{obs.get('merkle_root', '?')[:16]}[/]"
285
+ )
286
+ else:
287
+ print(f"● {obs.get('observation_type', '?')} {obs.get('merkle_root', '?')[:16]}")
288
+ last_count = current
289
+
290
+ time.sleep(1)
291
+ except KeyboardInterrupt:
292
+ msg = "\nStopped watching."
293
+ console.print(f"[yellow]{msg}[/]") if HAS_RICH else print(msg)
294
+
295
+
296
+ # ═══════════════════════════════════════════════════════════════════════════════
297
+ # MODEL COMMANDS
298
+ # ═══════════════════════════════════════════════════════════════════════════════
299
+
300
+ def cmd_observe(args):
301
+ """Manually observe a model interaction."""
302
+ from cascade import observe
303
+
304
+ result = observe(
305
+ model_id=args.model,
306
+ input_data=args.input,
307
+ output_data=args.output,
308
+ observation_type='model',
309
+ )
310
+
311
+ if HAS_RICH:
312
+ console.print(f"[green]✓ Observed[/]")
313
+ console.print(f" Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
314
+ else:
315
+ print(f"Observed: {result.get('merkle_root', 'N/A')}")
316
+
317
+
318
+ def cmd_fingerprint(args):
319
+ """Generate model fingerprint."""
320
+ try:
321
+ from cascade.forensics.fingerprints import ModelFingerprinter
322
+
323
+ if HAS_RICH:
324
+ with console.status(f"[cyan]Fingerprinting {args.model}...[/]"):
325
+ fp = ModelFingerprinter()
326
+ result = fp.fingerprint(args.model)
327
+
328
+ if result:
329
+ table = Table(title=f"Fingerprint: {args.model}", box=box.ROUNDED)
330
+ table.add_column("Property", style="cyan")
331
+ table.add_column("Value", style="white")
332
+
333
+ for key, value in result.items():
334
+ if isinstance(value, dict):
335
+ value = json.dumps(value)[:50] + '...'
336
+ table.add_row(str(key), str(value)[:60])
337
+
338
+ console.print(table)
339
+ else:
340
+ console.print("[yellow]Could not fingerprint model[/]")
341
+ else:
342
+ fp = ModelFingerprinter()
343
+ result = fp.fingerprint(args.model)
344
+ print(json.dumps(result, indent=2, default=str))
345
+ except Exception as e:
346
+ msg = f"Error: {e}"
347
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
348
+
349
+
350
+ # ═══════════════════════════════════════════════════════════════════════════════
351
+ # DATA COMMANDS
352
+ # ═══════════════════════════════════════════════════════════════════════════════
353
+
354
+ def cmd_entities(args):
355
+ """Run entity resolution on a file."""
356
+ try:
357
+ from cascade.data.entities import EntityResolver
358
+
359
+ if HAS_RICH:
360
+ with console.status(f"[cyan]Resolving entities in {args.file}...[/]"):
361
+ resolver = EntityResolver()
362
+ result = resolver.resolve_file(args.file)
363
+
364
+ if result:
365
+ console.print(f"[green]✓ Found {len(result)} entities[/]")
366
+
367
+ table = Table(box=box.SIMPLE)
368
+ table.add_column("Entity", style="cyan")
369
+ table.add_column("Type", style="magenta")
370
+ table.add_column("Count", justify="right")
371
+
372
+ for entity in result[:20]:
373
+ table.add_row(
374
+ str(entity.get('name', '?'))[:30],
375
+ entity.get('type', '?'),
376
+ str(entity.get('count', 1))
377
+ )
378
+
379
+ console.print(table)
380
+ if len(result) > 20:
381
+ console.print(f"[dim]... and {len(result) - 20} more[/]")
382
+ else:
383
+ resolver = EntityResolver()
384
+ result = resolver.resolve_file(args.file)
385
+ print(f"Found {len(result)} entities")
386
+ except Exception as e:
387
+ msg = f"Error: {e}"
388
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
389
+
390
+
391
+ def cmd_pii(args):
392
+ """Scan for PII in a file."""
393
+ try:
394
+ from cascade.data.pii import PIIScanner
395
+
396
+ if HAS_RICH:
397
+ with console.status(f"[cyan]Scanning {args.file} for PII...[/]"):
398
+ scanner = PIIScanner()
399
+ results = scanner.scan_file(args.file)
400
+
401
+ if results:
402
+ console.print(f"[yellow]⚠ Found {len(results)} potential PII instances[/]")
403
+
404
+ table = Table(box=box.ROUNDED)
405
+ table.add_column("Type", style="red")
406
+ table.add_column("Value", style="yellow")
407
+ table.add_column("Location", style="dim")
408
+
409
+ for pii in results[:20]:
410
+ val = pii.get('value', '?')
411
+ table.add_row(
412
+ pii.get('type', '?'),
413
+ val[:30] + '...' if len(val) > 30 else val,
414
+ str(pii.get('location', '?'))
415
+ )
416
+
417
+ console.print(table)
418
+ else:
419
+ console.print("[green]✓ No PII detected[/]")
420
+ else:
421
+ scanner = PIIScanner()
422
+ results = scanner.scan_file(args.file)
423
+ print(f"Found {len(results)} PII instances")
424
+ except Exception as e:
425
+ msg = f"Error: {e}"
426
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
427
+
428
+
429
+ def cmd_provenance(args):
430
+ """Show data provenance for a file/dataset."""
431
+ try:
432
+ from cascade.data.provenance import DataProvenance
433
+
434
+ if HAS_RICH:
435
+ with console.status(f"[cyan]Analyzing provenance...[/]"):
436
+ prov = DataProvenance()
437
+ result = prov.analyze(args.path)
438
+
439
+ if result:
440
+ tree = Tree(f"[bold cyan]{args.path}[/]")
441
+
442
+ if 'hash' in result:
443
+ tree.add(f"[magenta]Hash:[/] {result['hash']}")
444
+ if 'sources' in result:
445
+ sources = tree.add("[cyan]Sources[/]")
446
+ for src in result['sources']:
447
+ sources.add(str(src))
448
+ if 'transformations' in result:
449
+ transforms = tree.add("[cyan]Transformations[/]")
450
+ for t in result['transformations']:
451
+ transforms.add(str(t))
452
+
453
+ console.print(Panel(tree, title="Data Provenance", border_style="cyan"))
454
+ else:
455
+ prov = DataProvenance()
456
+ result = prov.analyze(args.path)
457
+ print(json.dumps(result, indent=2, default=str))
458
+ except Exception as e:
459
+ msg = f"Error: {e}"
460
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
461
+
462
+
463
+ # ═══════════════════════════════════════════════════════════════════════════════
464
+ # SYSTEM COMMANDS
465
+ # ═══════════════════════════════════════════════════════════════════════════════
466
+
467
+ def cmd_ingest(args):
468
+ """Ingest logs/files into the lattice."""
469
+ try:
470
+ from cascade.system.repo_ingester import RepoIngester
471
+
472
+ if HAS_RICH:
473
+ with console.status(f"[cyan]Ingesting {args.path}...[/]"):
474
+ ingester = RepoIngester()
475
+ result = ingester.ingest(args.path)
476
+
477
+ console.print(f"[green]✓ Ingested[/]")
478
+ console.print(f" Files: {result.get('files', 0)}")
479
+ console.print(f" Observations: {result.get('observations', 0)}")
480
+ console.print(f" Merkle Root: [magenta]{result.get('merkle_root', 'N/A')}[/]")
481
+ else:
482
+ ingester = RepoIngester()
483
+ result = ingester.ingest(args.path)
484
+ print(f"Ingested: {result}")
485
+ except Exception as e:
486
+ msg = f"Error: {e}"
487
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
488
+
489
+
490
+ def cmd_analyze(args):
491
+ """Analyze a log file or folder."""
492
+ try:
493
+ from cascade.system.omnidirectional_analyzer import OmnidirectionalAnalyzer
494
+
495
+ if HAS_RICH:
496
+ with console.status(f"[cyan]Analyzing {args.path}...[/]"):
497
+ analyzer = OmnidirectionalAnalyzer()
498
+ result = analyzer.analyze(args.path)
499
+
500
+ if result:
501
+ console.print(Panel(
502
+ Syntax(json.dumps(result, indent=2, default=str), "json"),
503
+ title="Analysis Result",
504
+ border_style="cyan"
505
+ ))
506
+ else:
507
+ analyzer = OmnidirectionalAnalyzer()
508
+ result = analyzer.analyze(args.path)
509
+ print(json.dumps(result, indent=2, default=str))
510
+ except Exception as e:
511
+ msg = f"Error: {e}"
512
+ console.print(f"[red]{msg}[/]") if HAS_RICH else print(msg)
513
+
514
+
515
+ # ═══════════════════════════════════════════════════════════════════════════════
516
+ # PROXY & INIT
517
+ # ═══════════════════════════════════════════════════════════════════════════════
518
+
519
+ def cmd_proxy(args):
520
+ """Start the CASCADE proxy server."""
521
+ if HAS_RICH:
522
+ console.print(Panel(
523
+ f"""[cyan]CASCADE Proxy Server[/]
524
+
525
+ Listening on [bold]{args.host}:{args.port}[/]
526
+
527
+ Set these environment variables in your app:
528
+ [green]
529
+ OPENAI_BASE_URL=http://localhost:{args.port}/v1
530
+ ANTHROPIC_BASE_URL=http://localhost:{args.port}/anthropic
531
+ [/]
532
+ Press Ctrl+C to stop.""",
533
+ title="🌐 Proxy Mode",
534
+ border_style="cyan",
535
+ ))
536
+ else:
537
+ print(f"CASCADE Proxy on {args.host}:{args.port}")
538
+
539
+ from cascade.proxy import run_proxy
540
+ run_proxy(host=args.host, port=args.port, verbose=not args.quiet)
541
+
542
+
543
+ def cmd_init(args):
544
+ """Show initialization instructions."""
545
+ if HAS_RICH:
546
+ md = """
547
+ # CASCADE Setup
548
+
549
+ ## Option 1: Auto-Patch (Python)
550
+ ```python
551
+ import cascade
552
+ cascade.init()
553
+
554
+ # Now every call emits a receipt
555
+ from openai import OpenAI
556
+ client = OpenAI()
557
+ client.chat.completions.create(...) # ← automatically observed
558
+ ```
559
+
560
+ ## Option 2: Proxy Mode (Any Language)
561
+ ```bash
562
+ cascade proxy --port 7777
563
+ ```
564
+ Then set environment variables:
565
+ ```bash
566
+ export OPENAI_BASE_URL=http://localhost:7777/v1
567
+ export ANTHROPIC_BASE_URL=http://localhost:7777/anthropic
568
+ ```
569
+
570
+ ## Option 3: Manual Observation
571
+ ```python
572
+ from cascade import observe
573
+ observe(model_id="my-model", input_data="prompt", output_data="response")
574
+ ```
575
+
576
+ ---
577
+ **Genesis Root:** `89f940c1a4b7aa65`
578
+ """
579
+ console.print(Panel(Markdown(md), title="[bold cyan]CASCADE[/]", border_style="cyan"))
580
+ else:
581
+ print("""
582
+ CASCADE - Universal AI Provenance Layer
583
+
584
+ OPTION 1: Auto-Patch (Python)
585
+ import cascade
586
+ cascade.init()
587
+
588
+ OPTION 2: Proxy Mode (Any Language)
589
+ cascade proxy
590
+ export OPENAI_BASE_URL=http://localhost:7777/v1
591
+
592
+ OPTION 3: Manual
593
+ from cascade import observe
594
+ observe(model_id="...", input_data="...", output_data="...")
595
+ """)
596
+
597
+
598
+ def cmd_version(args):
599
+ """Show version."""
600
+ try:
601
+ from cascade import __version__
602
+ version = __version__
603
+ except Exception:
604
+ version = "0.1.1"
605
+
606
+ if HAS_RICH:
607
+ console.print(f"[cyan]cascade-ai[/] [bold]{version}[/]")
608
+ console.print(f"[dim]Genesis: 89f940c1a4b7aa65[/]")
609
+ else:
610
+ print(f"cascade-ai {version}")
611
+
612
+
613
+ # ═══════════════════════════════════════════════════════════════════════════════
614
+ # HOLD COMMANDS - Inference-Level Halt Protocol
615
+ # ═══════════════════════════════════════════════════════════════════════════════
616
+
617
+ def cmd_hold_status(args):
618
+ """Show HOLD system status."""
619
+ try:
620
+ from cascade.hold import Hold
621
+ hold = Hold.get()
622
+
623
+ if HAS_RICH:
624
+ from rich.table import Table
625
+
626
+ table = Table(title="🛑 HOLD System Status", box=box.SIMPLE)
627
+ table.add_column("Property", style="cyan")
628
+ table.add_column("Value", style="green")
629
+
630
+ table.add_row("Hold Count", str(hold._hold_count))
631
+ table.add_row("Override Count", str(hold._override_count))
632
+ table.add_row("Timeout", f"{hold.timeout}s")
633
+ table.add_row("Auto Accept", str(hold.auto_accept))
634
+ table.add_row("Listeners", str(len(hold._listeners)))
635
+ table.add_row("Last Merkle", hold._last_merkle or "None")
636
+ table.add_row("Current Hold", "Active" if hold._current_hold else "None")
637
+
638
+ console.print(table)
639
+ else:
640
+ print(f"HOLD Count: {hold._hold_count}")
641
+ print(f"Override Count: {hold._override_count}")
642
+ print(f"Timeout: {hold.timeout}s")
643
+ print(f"Listeners: {len(hold._listeners)}")
644
+ except Exception as e:
645
+ if HAS_RICH:
646
+ console.print(f"[red]Error: {e}[/]")
647
+ else:
648
+ print(f"Error: {e}")
649
+
650
+
651
+ def cmd_hold_info(args):
652
+ """Show HOLD usage information."""
653
+ info = """
654
+ 🛑 HOLD - Inference-Level Halt Protocol
655
+
656
+ HOLD pauses AI inference so humans can observe and intervene.
657
+
658
+ USAGE IN YOUR CODE:
659
+ from cascade.hold import Hold
660
+
661
+ hold = Hold.get()
662
+
663
+ # In your inference loop:
664
+ probs = model.predict(observation)
665
+
666
+ resolution = hold.yield_point(
667
+ action_probs=probs,
668
+ value=value_estimate,
669
+ observation=obs,
670
+ brain_id="my_model",
671
+ # Optional informational wealth:
672
+ action_labels=["up", "down", "left", "right"],
673
+ latent=model.latent,
674
+ attention=model.attention,
675
+ features=model.features,
676
+ imagination=model.imagine(),
677
+ )
678
+
679
+ action = resolution.action # Final action (AI or override)
680
+ was_override = resolution.was_override # True if human intervened
681
+
682
+ REGISTERING LISTENERS:
683
+ def my_handler(hold_point):
684
+ print(f"HOLD: {hold_point.action_probs}")
685
+ # Send to UI, game engine, logger, etc.
686
+
687
+ hold.register_listener(my_handler)
688
+
689
+ RESOLVING HOLDS:
690
+ hold.resolve(action=3, source="human") # Override with action 3
691
+ hold.accept() # Accept AI's choice
692
+ """
693
+ if HAS_RICH:
694
+ console.print(Panel(info, title="[bold red]HOLD[/]", border_style="red"))
695
+ else:
696
+ print(info)
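# Illustrative sketch only (not part of this commit): a minimal agent loop wired
# into HOLD using just the API surface documented above. The fake model outputs,
# the brain_id, and the assumption that plain Python lists are accepted for
# action_probs/observation are hypothetical; depending on configuration,
# yield_point() may block until a listener resolves the hold, auto-accept kicks
# in, or the timeout elapses. See cascade/hold/ for the authoritative signatures.
from cascade.hold import Hold

hold = Hold.get()

def log_hold(hold_point):
    # Called whenever inference pauses; forward to a UI, game engine, or logger.
    print(f"HOLD raised, action_probs={hold_point.action_probs}")

hold.register_listener(log_hold)

for obs in ([0.1, 0.9], [0.7, 0.3]):               # stand-in observations
    probs = [0.25, 0.25, 0.25, 0.25]               # stand-in policy output
    resolution = hold.yield_point(
        action_probs=probs,
        value=0.0,
        observation=obs,
        brain_id="sketch_agent",
    )
    action = resolution.action                     # AI's choice, or the override
    if resolution.was_override:
        print(f"human overrode with action {action}")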
697
+
698
+
699
+ # ═══════════════════════════════════════════════════════════════════════════════
700
+ # MAIN
701
+ # ═══════════════════════════════════════════════════════════════════════════════
702
+
703
+ def main():
704
+ """Main CLI entry point."""
705
+ parser = argparse.ArgumentParser(
706
+ prog="cascade",
707
+ description="CASCADE - Universal AI Provenance Layer",
708
+ formatter_class=argparse.RawDescriptionHelpFormatter,
709
+ epilog="""
710
+ Examples:
711
+ cascade stats Show lattice statistics
712
+ cascade list -n 20 List recent observations
713
+ cascade chains List all chains
714
+ cascade inspect <root> Inspect an observation
715
+ cascade watch Live observation feed
716
+ cascade proxy Start proxy server
717
+ cascade fingerprint <model> Fingerprint a model
718
+ cascade pii <file> Scan file for PII
719
+ cascade ingest <path> Ingest logs/files
720
+ """
721
+ )
722
+ parser.add_argument("--version", "-v", action="store_true", help="Show version")
723
+
724
+ subparsers = parser.add_subparsers(dest="command", help="Commands")
725
+
726
+ # ─── Lattice commands ───
727
+ subparsers.add_parser("stats", help="Show lattice statistics").set_defaults(func=cmd_stats)
728
+ subparsers.add_parser("chains", help="List all chains").set_defaults(func=cmd_chains)
729
+ subparsers.add_parser("init", help="Show setup instructions").set_defaults(func=cmd_init)
730
+ subparsers.add_parser("watch", help="Watch live observations").set_defaults(func=cmd_watch)
731
+
732
+ list_p = subparsers.add_parser("list", help="List recent observations")
733
+ list_p.add_argument("--limit", "-n", type=int, default=10, help="Number to show")
734
+ list_p.set_defaults(func=cmd_list)
735
+
736
+ inspect_p = subparsers.add_parser("inspect", help="Inspect an observation")
737
+ inspect_p.add_argument("root", help="Merkle root to inspect")
738
+ inspect_p.set_defaults(func=cmd_inspect)
739
+
740
+ pin_p = subparsers.add_parser("pin", help="Pin observation to IPFS")
741
+ pin_p.add_argument("root", help="Merkle root to pin")
742
+ pin_p.set_defaults(func=cmd_pin)
743
+
744
+ export_p = subparsers.add_parser("export", help="Export lattice/chain to JSON")
745
+ export_p.add_argument("--chain", "-c", help="Export specific chain")
746
+ export_p.add_argument("--output", "-o", default="cascade_export.json", help="Output file")
747
+ export_p.set_defaults(func=cmd_export)
748
+
749
+ # ─── Model commands ───
750
+ observe_p = subparsers.add_parser("observe", help="Manual observation")
751
+ observe_p.add_argument("--model", "-m", required=True, help="Model ID")
752
+ observe_p.add_argument("--input", "-i", required=True, help="Input data")
753
+ observe_p.add_argument("--output", "-o", required=True, help="Output data")
754
+ observe_p.set_defaults(func=cmd_observe)
755
+
756
+ fp_p = subparsers.add_parser("fingerprint", help="Fingerprint a model")
757
+ fp_p.add_argument("model", help="Model name/path")
758
+ fp_p.set_defaults(func=cmd_fingerprint)
759
+
760
+ # ─── Data commands ───
761
+ entities_p = subparsers.add_parser("entities", help="Entity resolution")
762
+ entities_p.add_argument("file", help="File to analyze")
763
+ entities_p.set_defaults(func=cmd_entities)
764
+
765
+ pii_p = subparsers.add_parser("pii", help="Scan for PII")
766
+ pii_p.add_argument("file", help="File to scan")
767
+ pii_p.set_defaults(func=cmd_pii)
768
+
769
+ prov_p = subparsers.add_parser("provenance", help="Data provenance")
770
+ prov_p.add_argument("path", help="File or dataset path")
771
+ prov_p.set_defaults(func=cmd_provenance)
772
+
773
+ # ─── System commands ───
774
+ ingest_p = subparsers.add_parser("ingest", help="Ingest logs/files")
775
+ ingest_p.add_argument("path", help="Path to ingest")
776
+ ingest_p.set_defaults(func=cmd_ingest)
777
+
778
+ analyze_p = subparsers.add_parser("analyze", help="Analyze logs/files")
779
+ analyze_p.add_argument("path", help="Path to analyze")
780
+ analyze_p.set_defaults(func=cmd_analyze)
781
+
782
+ # ─── Proxy ───
783
+ proxy_p = subparsers.add_parser("proxy", help="Start proxy server")
784
+ proxy_p.add_argument("--host", default="0.0.0.0", help="Host to bind")
785
+ proxy_p.add_argument("--port", "-p", type=int, default=7777, help="Port")
786
+ proxy_p.add_argument("--quiet", "-q", action="store_true", help="Quiet mode")
787
+ proxy_p.set_defaults(func=cmd_proxy)
788
+
789
+ # ─── HOLD - Inference-Level Halt Protocol ───
790
+ hold_p = subparsers.add_parser("hold", help="Show HOLD usage and API info")
791
+ hold_p.set_defaults(func=cmd_hold_info)
792
+
793
+ hold_status_p = subparsers.add_parser("hold-status", help="Show HOLD system status")
794
+ hold_status_p.set_defaults(func=cmd_hold_status)
795
+
796
+ # Parse
797
+ args = parser.parse_args()
798
+
799
+ if args.version:
800
+ cmd_version(args)
801
+ return
802
+
803
+ if not args.command:
804
+ if HAS_RICH:
805
+ console.print(Panel(
806
+ """[cyan]CASCADE[/] - Universal AI Provenance Layer
807
+
808
+ [bold]Lattice Commands:[/]
809
+ [green]stats[/] Show lattice statistics
810
+ [green]chains[/] List all chains
811
+ [green]list[/] List recent observations
812
+ [green]inspect[/] Inspect an observation
813
+ [green]watch[/] Live observation feed
814
+ [green]pin[/] Pin to IPFS
815
+ [green]export[/] Export to JSON
816
+
817
+ [bold]Model Commands:[/]
818
+ [green]observe[/] Manual observation
819
+ [green]fingerprint[/] Fingerprint a model
820
+
821
+ [bold]Data Commands:[/]
822
+ [green]entities[/] Entity resolution
823
+ [green]pii[/] PII scanner
824
+ [green]provenance[/] Data provenance
825
+
826
+ [bold]System Commands:[/]
827
+ [green]ingest[/] Ingest files/logs
828
+ [green]analyze[/] Analyze files
829
+
830
+ [bold]HOLD (Inference Halt):[/]
831
+ [green]hold[/] Show HOLD usage and API info
832
+ [green]hold-status[/] Show HOLD system status
833
+
834
+ [bold]Other:[/]
835
+ [green]proxy[/] Start proxy server
836
+ [green]init[/] Setup instructions
837
+
838
+ Use [cyan]cascade <command> --help[/] for details.""",
839
+ title="[bold magenta]🌀 CASCADE[/]",
840
+ subtitle="[dim]pip install cascade-ai[/]",
841
+ border_style="magenta",
842
+ ))
843
+ else:
844
+ parser.print_help()
845
+ return
846
+
847
+ args.func(args)
848
+
849
+
850
+ if __name__ == "__main__":
851
+ main()
cascade/core/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Cascade Core module - fundamental data structures and algorithms."""
2
+
3
+ from cascade.core.event import Event, CausationLink, CausationChain
4
+ from cascade.core.graph import CausationGraph
5
+ from cascade.core.adapter import SymbioticAdapter
6
+
7
+ __all__ = [
8
+ "Event",
9
+ "CausationLink",
10
+ "CausationChain",
11
+ "CausationGraph",
12
+ "SymbioticAdapter",
13
+ ]
cascade/core/adapter.py ADDED
@@ -0,0 +1,470 @@
1
+ """
2
+ Cascade Core - Symbiotic Adapter.
3
+
4
+ The heart of Cascade's system-agnostic design. The adapter uses Kleene fixed-point
5
+ convergence to interpret ANY signal format and convert it to Events.
6
+
7
+ "It doesn't hook into your system — it becomes part of it."
8
+ """
9
+
10
+ import time
11
+ import json
12
+ import re
13
+ from typing import Any, Dict, List, Optional, Callable, Type
14
+ from dataclasses import dataclass
15
+
16
+ from cascade.core.event import Event
17
+
18
+
19
+ @dataclass
20
+ class SignalPattern:
21
+ """A learned pattern for interpreting signals."""
22
+ pattern_type: str # 'dict', 'string', 'tensor', 'protobuf', etc.
23
+ component: str
24
+ event_type: str
25
+ extractor: Optional[Callable[[Any], Dict[str, Any]]] = None
26
+ confidence: float = 0.0
27
+ match_count: int = 0
28
+
29
+
30
+ class SymbioticAdapter:
31
+ """
32
+ Self-interpreting adapter that converges to any signal format.
33
+
34
+ The adapter observes signals from the host system and learns how to
35
+ interpret them through fixed-point iteration. It starts with naive
36
+ interpretations and refines them until stable.
37
+
38
+ This is the key to Cascade's system-agnostic design:
39
+ - No framework-specific hooks required
40
+ - No configuration needed
41
+ - Feed it ANY signal format, it adapts
42
+
43
+ Example:
44
+ >>> adapter = SymbioticAdapter()
45
+ >>>
46
+ >>> # Feed it different signal formats
47
+ >>> adapter.interpret({"loss": 0.5, "epoch": 10})
48
+ >>> adapter.interpret("2024-01-01 12:00:00 ERROR training failed")
49
+ >>> adapter.interpret(torch.tensor([0.1, 0.2, 0.3]))
50
+ >>>
51
+ >>> # It learns patterns and gets better at interpretation
52
+ >>> print(adapter.learned_patterns)
53
+ """
54
+
55
+ def __init__(self):
56
+ """Initialize the symbiotic adapter."""
57
+ self._patterns: List[SignalPattern] = []
58
+ self._signal_count = 0
59
+ self._interpretation_cache: Dict[str, SignalPattern] = {}
60
+
61
+ # Built-in interpreters for common formats
62
+ self._builtin_interpreters = {
63
+ dict: self._interpret_dict,
64
+ str: self._interpret_string,
65
+ list: self._interpret_list,
66
+ }
67
+
68
+ # Regex patterns for log line parsing
69
+ self._log_patterns = [
70
+ # ISO timestamp with level: "2024-01-01 12:00:00 ERROR message"
71
+ re.compile(r'^(\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
72
+ # Simple timestamp: "12:00:00.123 component message"
73
+ re.compile(r'^(\d{2}:\d{2}:\d{2}(?:\.\d+)?)\s+(\w+)\s+(.*)$'),
74
+ # Pipe-delimited: "timestamp|level|component|key:value"
75
+ re.compile(r'^([^|]+)\|(\w+)\|(\w+)\|(.*)$'),
76
+ ]
77
+
78
+ # Metric extraction patterns - ONLY extract real training metrics
79
+ # Be strict to avoid extracting garbage from config lines
80
+ self._metric_patterns = [
81
+ # Standard training metrics with = or :
82
+ re.compile(r'\b(loss|val_loss|train_loss|accuracy|acc|val_acc|lr|learning_rate|epoch|step|iter|iteration|mfu|tokens_per_sec|samples_per_sec|grad_norm|perplexity|ppl)[=:]\s*([+-]?\d+\.?\d*(?:e[+-]?\d+)?)', re.I),
83
+ # "iter X: loss=Y" format from nanoGPT
84
+ re.compile(r'iter\s+(\d+).*loss[=:]?\s*([+-]?\d+\.?\d*)', re.I),
85
+ # "step X loss Y" format
86
+ re.compile(r'step\s+(\d+).*loss\s*[=:]?\s*([+-]?\d+\.?\d*)', re.I),
87
+ ]
88
+
89
+ def interpret(self, signal: Any) -> Event:
90
+ """
91
+ Interpret any signal into a Cascade Event.
92
+
93
+ Uses Kleene fixed-point iteration to converge on the best interpretation.
94
+
95
+ Args:
96
+ signal: Any signal from the host system
97
+
98
+ Returns:
99
+ Event: The interpreted event
100
+ """
101
+ self._signal_count += 1
102
+
103
+ # Get signal type
104
+ signal_type = type(signal)
105
+
106
+ # Try cached pattern first
107
+ cache_key = self._get_cache_key(signal)
108
+ if cache_key in self._interpretation_cache:
109
+ pattern = self._interpretation_cache[cache_key]
110
+ pattern.match_count += 1
111
+ return self._apply_pattern(signal, pattern)
112
+
113
+ # Try built-in interpreter
114
+ if signal_type in self._builtin_interpreters:
115
+ event = self._builtin_interpreters[signal_type](signal)
116
+ self._learn_pattern(signal, event)
117
+ return event
118
+
119
+ # Try tensor-like objects (duck typing)
120
+ if hasattr(signal, 'numpy') or hasattr(signal, 'detach'):
121
+ event = self._interpret_tensor(signal)
122
+ self._learn_pattern(signal, event)
123
+ return event
124
+
125
+ # Try protobuf-like objects
126
+ if hasattr(signal, 'SerializeToString'):
127
+ event = self._interpret_protobuf(signal)
128
+ self._learn_pattern(signal, event)
129
+ return event
130
+
131
+ # Fallback: convert to string and interpret
132
+ event = self._interpret_string(str(signal))
133
+ return event
134
+
135
+ def _interpret_dict(self, signal: Dict[str, Any]) -> Event:
136
+ """Interpret a dictionary signal."""
137
+ # Extract common fields
138
+ timestamp = signal.get('timestamp', signal.get('time', time.time()))
139
+ if isinstance(timestamp, str):
140
+ try:
141
+ from datetime import datetime
142
+ timestamp = datetime.fromisoformat(timestamp).timestamp()
143
+ except (ValueError, TypeError):
144
+ timestamp = time.time()
145
+
146
+ component = signal.get('component', signal.get('source', 'unknown'))
147
+ event_type = signal.get('event_type', signal.get('type', 'state_change'))
148
+
149
+ # Everything else goes in data
150
+ reserved = {'timestamp', 'time', 'component', 'source', 'event_type', 'type'}
151
+ data = {k: v for k, v in signal.items() if k not in reserved}
152
+
153
+ return Event(
154
+ timestamp=timestamp,
155
+ component=component,
156
+ event_type=event_type,
157
+ data=data,
158
+ source_signal=signal,
159
+ )
160
+
161
+ def _interpret_string(self, signal: str) -> Event:
162
+ """Interpret a string signal (log line, message, etc.)."""
163
+ signal = signal.strip()
164
+
165
+ # Try each log pattern
166
+ for pattern in self._log_patterns:
167
+ match = pattern.match(signal)
168
+ if match:
169
+ groups = match.groups()
170
+ if len(groups) >= 3:
171
+ timestamp_str, level_or_component, rest = groups[0], groups[1], groups[-1]
172
+
173
+ # Parse timestamp
174
+ try:
175
+ from datetime import datetime
176
+ timestamp = datetime.fromisoformat(timestamp_str.replace(' ', 'T')).timestamp()
177
+ except (ValueError, TypeError):
178
+ timestamp = time.time()
179
+
180
+ # Extract metrics from the rest
181
+ data = self._extract_metrics(rest)
182
+ data['raw_message'] = rest
183
+
184
+ # Determine event type from keywords
185
+ event_type = self._infer_event_type(signal)
186
+
187
+ return Event(
188
+ timestamp=timestamp,
189
+ component=level_or_component.lower(),
190
+ event_type=event_type,
191
+ data=data,
192
+ source_signal=signal,
193
+ )
194
+
195
+ # Fallback: extract what we can with smarter component detection
196
+ data = self._extract_metrics(signal)
197
+ data['raw_message'] = signal
198
+
199
+ # Infer component from content
200
+ component = self._infer_component(signal)
201
+
202
+ return Event(
203
+ timestamp=time.time(),
204
+ component=component,
205
+ event_type=self._infer_event_type(signal),
206
+ data=data,
207
+ source_signal=signal,
208
+ )
209
+
210
+ def _interpret_list(self, signal: List[Any]) -> Event:
211
+ """Interpret a list signal."""
212
+ # Convert to dict with indices
213
+ data = {f'item_{i}': v for i, v in enumerate(signal)}
214
+ data['length'] = len(signal)
215
+
216
+ # Check if it looks like numeric data
217
+ if all(isinstance(x, (int, float)) for x in signal):
218
+ data['mean'] = sum(signal) / len(signal) if signal else 0
219
+ data['min'] = min(signal) if signal else 0
220
+ data['max'] = max(signal) if signal else 0
221
+
222
+ return Event(
223
+ timestamp=time.time(),
224
+ component='data',
225
+ event_type='list_signal',
226
+ data=data,
227
+ source_signal=signal,
228
+ )
229
+
230
+ def _interpret_tensor(self, signal: Any) -> Event:
231
+ """Interpret a tensor-like signal (PyTorch, NumPy, etc.)."""
232
+ # Try to get numpy array
233
+ try:
234
+ if hasattr(signal, 'detach'):
235
+ arr = signal.detach().cpu().numpy()
236
+ elif hasattr(signal, 'numpy'):
237
+ arr = signal.numpy()
238
+ else:
239
+ arr = signal
240
+
241
+ data = {
242
+ 'shape': list(arr.shape) if hasattr(arr, 'shape') else [],
243
+ 'dtype': str(arr.dtype) if hasattr(arr, 'dtype') else 'unknown',
244
+ 'mean': float(arr.mean()) if hasattr(arr, 'mean') else 0,
245
+ 'std': float(arr.std()) if hasattr(arr, 'std') else 0,
246
+ 'min': float(arr.min()) if hasattr(arr, 'min') else 0,
247
+ 'max': float(arr.max()) if hasattr(arr, 'max') else 0,
248
+ }
249
+
250
+ # Check for NaN/Inf (common in gradient explosions)
251
+ if hasattr(arr, 'isnan'):
252
+ data['has_nan'] = bool(arr.isnan().any())
253
+ if hasattr(arr, 'isinf'):
254
+ data['has_inf'] = bool(arr.isinf().any())
255
+
256
+ except Exception as e:
257
+ data = {'error': str(e), 'type': str(type(signal))}
258
+
259
+ return Event(
260
+ timestamp=time.time(),
261
+ component='tensor',
262
+ event_type='tensor_signal',
263
+ data=data,
264
+ source_signal=None, # Don't store tensor to save memory
265
+ )
266
+
267
+ def _interpret_protobuf(self, signal: Any) -> Event:
268
+ """Interpret a protobuf-like signal."""
269
+ try:
270
+ # Try to convert to dict
271
+ if hasattr(signal, 'DESCRIPTOR'):
272
+ from google.protobuf.json_format import MessageToDict
273
+ data = MessageToDict(signal)
274
+ else:
275
+ data = {'raw': str(signal)}
276
+ except Exception:
277
+ data = {'raw': str(signal)}
278
+
279
+ return Event(
280
+ timestamp=time.time(),
281
+ component='protobuf',
282
+ event_type='protobuf_signal',
283
+ data=data,
284
+ source_signal=None,
285
+ )
286
+
287
+ def _extract_metrics(self, text: str) -> Dict[str, Any]:
288
+ """Extract numeric metrics from text - STRICT, only real training metrics."""
289
+ metrics = {}
290
+
291
+ # nanoGPT format: "iter 0: loss=4.2176, time 46.76ms, mfu 0.62%"
292
+ nano_match = re.search(r'iter\s+(\d+).*loss[=:]?\s*([\d.]+)', text, re.I)
293
+ if nano_match:
294
+ metrics['iter'] = int(nano_match.group(1))
295
+ metrics['loss'] = float(nano_match.group(2))
296
+
297
+ # Diffusers/tqdm format: "step_loss=0.1234" or "step_loss: 0.1234"
298
+ step_loss_match = re.search(r'step_loss[=:]\s*([\d.e+-]+)', text, re.I)
299
+ if step_loss_match:
300
+ metrics['loss'] = float(step_loss_match.group(1))
301
+
302
+ # train_loss format from accelerator.log
303
+ train_loss_match = re.search(r'train_loss[=:]\s*([\d.e+-]+)', text, re.I)
304
+ if train_loss_match:
305
+ metrics['loss'] = float(train_loss_match.group(1))
306
+
307
+ # tqdm progress format: " 5%|█ | 5/100 [00:30<09:30, step_loss=0.234, lr=1e-5]"
308
+ tqdm_match = re.search(r'(\d+)%\|.*\|\s*(\d+)/(\d+)', text)
309
+ if tqdm_match:
310
+ metrics['progress_pct'] = int(tqdm_match.group(1))
311
+ metrics['step'] = int(tqdm_match.group(2))
312
+ metrics['total_steps'] = int(tqdm_match.group(3))
313
+
314
+ # Generic loss patterns
315
+ generic_loss = re.search(r'\bloss[=:]\s*([\d.e+-]+)', text, re.I)
316
+ if generic_loss and 'loss' not in metrics:
317
+ metrics['loss'] = float(generic_loss.group(1))
318
+
319
+ # mfu extraction
320
+ mfu_match = re.search(r'mfu\s*[=:]?\s*([\d.]+)%?', text, re.I)
321
+ if mfu_match:
322
+ metrics['mfu'] = float(mfu_match.group(1))
323
+
324
+ # time extraction (ms)
325
+ time_match = re.search(r'time\s*[=:]?\s*([\d.]+)\s*ms', text, re.I)
326
+ if time_match:
327
+ metrics['time_ms'] = float(time_match.group(1))
328
+
329
+ # learning rate - multiple formats
330
+ lr_match = re.search(r'\b(?:lr|learning_rate)\s*[=:]\s*([\d.e+-]+)', text, re.I)
331
+ if lr_match:
332
+ metrics['lr'] = float(lr_match.group(1))
333
+
334
+ # epoch/step for other frameworks
335
+ epoch_match = re.search(r'\bepoch\s*[=:]\s*(\d+)', text, re.I)
336
+ if epoch_match:
337
+ metrics['epoch'] = int(epoch_match.group(1))
338
+
339
+ step_match = re.search(r'\bstep\s*[=:]\s*(\d+)', text, re.I)
340
+ if step_match and 'step' not in metrics:
341
+ metrics['step'] = int(step_match.group(1))
342
+
343
+ # global_step from diffusers
344
+ global_step_match = re.search(r'global_step[=:]\s*(\d+)', text, re.I)
345
+ if global_step_match:
346
+ metrics['step'] = int(global_step_match.group(1))
347
+
348
+ return metrics
349
+
350
+ def _infer_event_type(self, text: str) -> str:
351
+ """Infer event type from text content."""
352
+ text_lower = text.lower()
353
+
354
+ # Training iteration logs (highest priority)
355
+ if re.search(r'iter\s+\d+.*loss', text_lower):
356
+ return 'training_step'
357
+ if re.search(r'step\s+\d+.*loss', text_lower):
358
+ return 'training_step'
359
+
360
+ if any(kw in text_lower for kw in ['error', 'exception', 'failed', 'crash']):
361
+ return 'error'
362
+ if any(kw in text_lower for kw in ['warning', 'warn']):
363
+ return 'warning'
364
+ if any(kw in text_lower for kw in ['gradient', 'backward']):
365
+ return 'training'
366
+ if 'loss' in text_lower and 'val' in text_lower:
367
+ return 'validation'
368
+ if any(kw in text_lower for kw in ['inference', 'predict', 'forward']):
369
+ return 'inference'
370
+ if any(kw in text_lower for kw in ['epoch', 'step', 'iteration', 'iter']):
371
+ return 'progress'
372
+ if any(kw in text_lower for kw in ['nan', 'inf', 'explode', 'overflow']):
373
+ return 'anomaly'
374
+ if any(kw in text_lower for kw in ['save', 'checkpoint', 'load', 'saving']):
375
+ return 'checkpoint'
376
+ if any(kw in text_lower for kw in ['config', 'setting', 'parameter', 'device', 'gpu', 'cuda']):
377
+ return 'config'
378
+ if any(kw in text_lower for kw in ['initializ', 'loading model', 'compiling']):
379
+ return 'init'
380
+
381
+ return 'state_change'
382
+
383
+ def _infer_component(self, text: str) -> str:
384
+ """Infer component from text content - NO MORE 'unknown'."""
385
+ text_lower = text.lower()
386
+
387
+ # Training/optimizer related
388
+ if any(kw in text_lower for kw in ['iter', 'step', 'epoch', 'batch']):
389
+ return 'trainer'
390
+ if any(kw in text_lower for kw in ['loss', 'backward', 'gradient']):
391
+ return 'loss'
392
+ if any(kw in text_lower for kw in ['optim', 'adam', 'sgd', 'lr', 'learning']):
393
+ return 'optimizer'
394
+ if any(kw in text_lower for kw in ['model', 'layer', 'param', 'weight']):
395
+ return 'model'
396
+ if any(kw in text_lower for kw in ['data', 'batch', 'loader', 'dataset']):
397
+ return 'data'
398
+ if any(kw in text_lower for kw in ['cuda', 'gpu', 'device', 'memory']):
399
+ return 'device'
400
+ if any(kw in text_lower for kw in ['checkpoint', 'save', 'load']):
401
+ return 'checkpoint'
402
+ if any(kw in text_lower for kw in ['config', 'setting', 'override']):
403
+ return 'config'
404
+ if any(kw in text_lower for kw in ['eval', 'valid', 'test']):
405
+ return 'evaluator'
406
+ if any(kw in text_lower for kw in ['token', 'vocab', 'embed']):
407
+ return 'tokenizer'
408
+
409
+ return 'system' # Generic fallback, not "unknown"
410
+
411
+ def _get_cache_key(self, signal: Any) -> str:
412
+ """Generate a cache key for a signal's structure."""
413
+ if isinstance(signal, dict):
414
+ # Key based on dict keys
415
+ return f"dict:{':'.join(sorted(signal.keys()))}"
416
+ elif isinstance(signal, str):
417
+ # Key based on first word
418
+ first_word = signal.split()[0] if signal.split() else ''
419
+ return f"str:{first_word[:20]}"
420
+ else:
421
+ return f"type:{type(signal).__name__}"
422
+
423
+ def _learn_pattern(self, signal: Any, event: Event) -> None:
424
+ """Learn a pattern from a successful interpretation."""
425
+ cache_key = self._get_cache_key(signal)
426
+ pattern = SignalPattern(
427
+ pattern_type=type(signal).__name__,
428
+ component=event.component,
429
+ event_type=event.event_type,
430
+ confidence=0.5,
431
+ match_count=1,
432
+ )
433
+ self._interpretation_cache[cache_key] = pattern
434
+ self._patterns.append(pattern)
435
+
436
+ def _apply_pattern(self, signal: Any, pattern: SignalPattern) -> Event:
437
+ """Apply a learned pattern to interpret a signal."""
438
+ # Re-interpret with learned hints - use direct interpreters to avoid recursion
439
+ if isinstance(signal, dict):
440
+ event = self._interpret_dict(signal)
441
+ # Apply learned component/type if more confident
442
+ if pattern.confidence > 0.7:
443
+ return Event(
444
+ timestamp=event.timestamp,
445
+ component=pattern.component,
446
+ event_type=pattern.event_type,
447
+ data=event.data,
448
+ source_signal=signal,
449
+ )
450
+ return event
451
+ elif isinstance(signal, str):
452
+ return self._interpret_string(signal)
453
+ elif isinstance(signal, list):
454
+ return self._interpret_list(signal)
455
+ else:
456
+ # Fallback: interpret as string without recursion
457
+ return self._interpret_string(str(signal))
458
+
459
+ @property
460
+ def learned_patterns(self) -> List[SignalPattern]:
461
+ """Get all learned signal patterns."""
462
+ return sorted(self._patterns, key=lambda p: p.match_count, reverse=True)
463
+
464
+ @property
465
+ def signal_count(self) -> int:
466
+ """Total number of signals interpreted."""
467
+ return self._signal_count
468
+
469
+ def __repr__(self) -> str:
470
+ return f"<SymbioticAdapter | {self._signal_count} signals, {len(self._patterns)} patterns>"
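# Illustrative sketch only (not part of this commit): feeding the adapter two of
# the signal shapes handled above. The log line is made up; the values noted in
# the comments follow from the regexes and inference rules defined in this file.
from cascade.core.adapter import SymbioticAdapter

adapter = SymbioticAdapter()

# A nanoGPT-style log line takes the fallback string path:
evt = adapter.interpret("iter 100: loss=2.3100, time 45.20ms, mfu 38.20%")
# evt.component == "trainer", evt.event_type == "training_step"
# evt.data includes iter=100, loss=2.31, time_ms=45.2, mfu=38.2

# A dict signal maps reserved keys onto Event fields and keeps the rest as data:
evt2 = adapter.interpret({"loss": 0.42, "epoch": 3, "component": "trainer"})
# evt2.component == "trainer", evt2.data == {"loss": 0.42, "epoch": 3}

print(adapter)   # <SymbioticAdapter | 2 signals, 2 patterns>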
cascade/core/event.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ Cascade Core - Event and CausationLink primitives.
3
+
4
+ These are the fundamental data structures that represent causation.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, List, Any, Optional
9
+ from datetime import datetime
10
+ import time
11
+ import uuid
12
+
13
+
14
+ def _generate_event_id() -> str:
15
+ """Generate a unique event ID with timestamp prefix for ordering."""
16
+ timestamp = int(time.time() * 1000000)
17
+ unique = uuid.uuid4().hex[:8]
18
+ return f"evt_{timestamp}_{unique}"
19
+
20
+
21
+ @dataclass
22
+ class Event:
23
+ """
24
+ A discrete event in the causation graph.
25
+
26
+ Events are the nodes in your causation graph. Each event represents
27
+ something that happened in your system at a point in time.
28
+
29
+ Attributes:
30
+ event_id: Unique identifier (auto-generated if not provided)
31
+ timestamp: Unix timestamp when event occurred
32
+ component: Which system component generated this event
33
+ event_type: Category of event (e.g., 'training', 'inference', 'error')
34
+ data: Arbitrary key-value data associated with the event
35
+ source_signal: The original signal that created this event (for debugging)
36
+
37
+ Example:
38
+ >>> event = Event(
39
+ ... timestamp=time.time(),
40
+ ... component="neural_network",
41
+ ... event_type="gradient_explosion",
42
+ ... data={"layer": "fc3", "magnitude": 1e12}
43
+ ... )
44
+ """
45
+ timestamp: float
46
+ component: str
47
+ event_type: str
48
+ data: Dict[str, Any] = field(default_factory=dict)
49
+ event_id: str = field(default_factory=_generate_event_id)
50
+ source_signal: Optional[Any] = field(default=None, repr=False)
51
+
52
+ def __post_init__(self):
53
+ """Ensure timestamp is float."""
54
+ if isinstance(self.timestamp, datetime):
55
+ self.timestamp = self.timestamp.timestamp()
56
+
57
+ def to_dict(self) -> Dict[str, Any]:
58
+ """Serialize event to dictionary."""
59
+ return {
60
+ "event_id": self.event_id,
61
+ "timestamp": self.timestamp,
62
+ "component": self.component,
63
+ "event_type": self.event_type,
64
+ "data": self.data,
65
+ }
66
+
67
+ @classmethod
68
+ def from_dict(cls, d: Dict[str, Any]) -> "Event":
69
+ """Deserialize event from dictionary."""
70
+ return cls(
71
+ event_id=d.get("event_id", _generate_event_id()),
72
+ timestamp=d["timestamp"],
73
+ component=d["component"],
74
+ event_type=d["event_type"],
75
+ data=d.get("data", {}),
76
+ )
77
+
78
+ def __hash__(self):
79
+ return hash(self.event_id)
80
+
81
+ def __eq__(self, other):
82
+ if isinstance(other, Event):
83
+ return self.event_id == other.event_id
84
+ return False
85
+
86
+
87
+ @dataclass
88
+ class CausationLink:
89
+ """
90
+ A causal relationship between two events.
91
+
92
+ Links are the edges in your causation graph. Each link represents
93
+ a cause-effect relationship: event A caused event B.
94
+
95
+ Attributes:
96
+ from_event: ID of the causing event
97
+ to_event: ID of the caused event
98
+ causation_type: How the causation was detected
99
+ - 'temporal': A happened shortly before B
100
+ - 'correlation': A and B metrics moved together
101
+ - 'threshold': A crossed a threshold triggering B
102
+ - 'direct': Explicit causation declared in code
103
+ strength: Confidence in the causal relationship (0.0 to 1.0)
104
+ explanation: Human-readable explanation of the link
105
+ metrics_involved: Which metrics connect these events
106
+
107
+ Example:
108
+ >>> link = CausationLink(
109
+ ... from_event="evt_123",
110
+ ... to_event="evt_456",
111
+ ... causation_type="threshold",
112
+ ... strength=0.95,
113
+ ... explanation="Loss exceeded 10.0, triggering gradient clipping"
114
+ ... )
115
+ """
116
+ from_event: str
117
+ to_event: str
118
+ causation_type: str # 'temporal', 'correlation', 'threshold', 'direct'
119
+ strength: float = 1.0
120
+ explanation: str = ""
121
+ metrics_involved: List[str] = field(default_factory=list)
122
+
123
+ def __post_init__(self):
124
+ """Validate strength is in range."""
125
+ self.strength = max(0.0, min(1.0, self.strength))
126
+
127
+ def to_dict(self) -> Dict[str, Any]:
128
+ """Serialize link to dictionary."""
129
+ return {
130
+ "from_event": self.from_event,
131
+ "to_event": self.to_event,
132
+ "causation_type": self.causation_type,
133
+ "strength": self.strength,
134
+ "explanation": self.explanation,
135
+ "metrics_involved": self.metrics_involved,
136
+ }
137
+
138
+ @classmethod
139
+ def from_dict(cls, d: Dict[str, Any]) -> "CausationLink":
140
+ """Deserialize link from dictionary."""
141
+ return cls(
142
+ from_event=d["from_event"],
143
+ to_event=d["to_event"],
144
+ causation_type=d["causation_type"],
145
+ strength=d.get("strength", 1.0),
146
+ explanation=d.get("explanation", ""),
147
+ metrics_involved=d.get("metrics_involved", []),
148
+ )
149
+
150
+
151
+ @dataclass
152
+ class CausationChain:
153
+ """
154
+ A chain of causal events from origin to destination.
155
+
156
+ Represents a full causal path through the graph.
157
+
158
+ Attributes:
159
+ events: List of events in causal order
160
+ links: List of links connecting the events
161
+ total_strength: Combined strength of all links
162
+ depth: Number of hops in the chain
163
+ narrative: Human-readable story of what happened
164
+ """
165
+ events: List[Event]
166
+ links: List[CausationLink]
167
+ total_strength: float = 1.0
168
+ depth: int = 0
169
+ narrative: str = ""
170
+
171
+ def __post_init__(self):
172
+ self.depth = len(self.links)
173
+ if not self.total_strength and self.links:
174
+ # Calculate combined strength
175
+ self.total_strength = 1.0
176
+ for link in self.links:
177
+ self.total_strength *= link.strength
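# Illustrative sketch only (not part of this commit): constructing the primitives
# above and round-tripping an Event through its dict form. All values are made up.
import time
from cascade.core.event import Event, CausationLink, CausationChain

spike = Event(
    timestamp=time.time(),
    component="loss",
    event_type="anomaly",
    data={"loss": 1e6},
)
clip = Event(
    timestamp=time.time(),
    component="optimizer",
    event_type="state_change",
    data={"action": "clip_gradients"},
)
link = CausationLink(
    from_event=spike.event_id,
    to_event=clip.event_id,
    causation_type="threshold",
    strength=1.7,                       # clamped to 1.0 in __post_init__
    explanation="loss spike triggered gradient clipping",
)
assert link.strength == 1.0
assert Event.from_dict(spike.to_dict()) == spike    # equality is by event_id

chain = CausationChain(events=[spike, clip], links=[link])
assert chain.depth == 1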
cascade/core/graph.py ADDED
@@ -0,0 +1,292 @@
1
+ """
2
+ Cascade Core - Causation Graph Engine.
3
+
4
+ The graph stores events and their causal relationships, enabling
5
+ bidirectional traversal through time.
6
+ """
7
+
8
+ import threading
9
+ from typing import Dict, List, Optional, Set, Any, Iterator, Tuple
10
+ from collections import defaultdict
11
+ from datetime import datetime
12
+
13
+ try:
14
+ import networkx as nx
15
+ HAS_NETWORKX = True
16
+ except ImportError:
17
+ HAS_NETWORKX = False
18
+
19
+ from cascade.core.event import Event, CausationLink
20
+
21
+
22
+ class CausationGraph:
23
+ """
24
+ A directed graph of causal relationships between events.
25
+
26
+ The graph enables bidirectional traversal:
27
+ - Backwards: "What caused this event?"
28
+ - Forwards: "What did this event cause?"
29
+
30
+ Thread-safe for concurrent event ingestion.
31
+
32
+ Example:
33
+ >>> graph = CausationGraph()
34
+ >>> graph.add_event(event1)
35
+ >>> graph.add_event(event2)
36
+ >>> graph.add_link(CausationLink(
37
+ ... from_event=event1.event_id,
38
+ ... to_event=event2.event_id,
39
+ ... causation_type="temporal",
40
+ ... strength=0.9
41
+ ... ))
42
+ >>>
43
+ >>> # Find what caused event2
44
+ >>> causes = graph.get_causes(event2.event_id)
45
+ """
46
+
47
+ def __init__(self):
48
+ """Initialize an empty causation graph."""
49
+ self._lock = threading.RLock()
50
+
51
+ # Event storage
52
+ self._events: Dict[str, Event] = {}
53
+ self._events_by_component: Dict[str, List[str]] = defaultdict(list)
54
+ self._events_by_type: Dict[str, List[str]] = defaultdict(list)
55
+ self._events_by_time: List[str] = [] # Ordered by timestamp
56
+
57
+ # Link storage
58
+ self._links: Dict[str, CausationLink] = {} # link_id -> link
59
+ self._causes: Dict[str, Set[str]] = defaultdict(set) # event_id -> set of cause event_ids
60
+ self._effects: Dict[str, Set[str]] = defaultdict(set) # event_id -> set of effect event_ids
61
+
62
+ # NetworkX graph for advanced algorithms (optional)
63
+ if HAS_NETWORKX:
64
+ self._nx_graph = nx.DiGraph()
65
+ else:
66
+ self._nx_graph = None
67
+
68
+ # Statistics
69
+ self._event_count = 0
70
+ self._link_count = 0
71
+
72
+ def add_event(self, event: Event) -> None:
73
+ """
74
+ Add an event to the graph.
75
+
76
+ Thread-safe. Automatically detects potential causations with recent events.
77
+
78
+ Args:
79
+ event: The event to add
80
+ """
81
+ with self._lock:
82
+ if event.event_id in self._events:
83
+ return # Already exists
84
+
85
+ self._events[event.event_id] = event
86
+ self._events_by_component[event.component].append(event.event_id)
87
+ self._events_by_type[event.event_type].append(event.event_id)
88
+ self._events_by_time.append(event.event_id)
89
+ self._event_count += 1
90
+
91
+ if self._nx_graph is not None:
92
+ self._nx_graph.add_node(event.event_id, **event.to_dict())
93
+
94
+ def add_link(self, link: CausationLink) -> None:
95
+ """
96
+ Add a causal link between two events.
97
+
98
+ Thread-safe.
99
+
100
+ Args:
101
+ link: The causation link to add
102
+ """
103
+ with self._lock:
104
+ link_id = f"{link.from_event}->{link.to_event}"
105
+
106
+ if link_id in self._links:
107
+ # Update existing link if new one is stronger
108
+ if link.strength > self._links[link_id].strength:
109
+ self._links[link_id] = link
110
+ return
111
+
112
+ self._links[link_id] = link
113
+ self._causes[link.to_event].add(link.from_event)
114
+ self._effects[link.from_event].add(link.to_event)
115
+ self._link_count += 1
116
+
117
+ if self._nx_graph is not None:
118
+ self._nx_graph.add_edge(
119
+ link.from_event,
120
+ link.to_event,
121
+ **link.to_dict()
122
+ )
123
+
124
+ def get_event(self, event_id: str) -> Optional[Event]:
125
+ """Get an event by ID."""
126
+ with self._lock:
127
+ return self._events.get(event_id)
128
+
129
+ def get_causes(self, event_id: str) -> List[Event]:
130
+ """
131
+ Get all events that directly caused this event.
132
+
133
+ Args:
134
+ event_id: ID of the effect event
135
+
136
+ Returns:
137
+ List of causing events
138
+ """
139
+ with self._lock:
140
+ cause_ids = self._causes.get(event_id, set())
141
+ return [self._events[cid] for cid in cause_ids if cid in self._events]
142
+
143
+ def get_effects(self, event_id: str) -> List[Event]:
144
+ """
145
+ Get all events that were directly caused by this event.
146
+
147
+ Args:
148
+ event_id: ID of the cause event
149
+
150
+ Returns:
151
+ List of effect events
152
+ """
153
+ with self._lock:
154
+ effect_ids = self._effects.get(event_id, set())
155
+ return [self._events[eid] for eid in effect_ids if eid in self._events]
156
+
157
+ def get_link(self, from_event: str, to_event: str) -> Optional[CausationLink]:
158
+ """Get the causation link between two events."""
159
+ with self._lock:
160
+ link_id = f"{from_event}->{to_event}"
161
+ return self._links.get(link_id)
162
+
163
+ def get_all_links(self) -> List[CausationLink]:
164
+ """Get all causal links in the graph."""
165
+ with self._lock:
166
+ return list(self._links.values())
167
+
168
+ def get_component_connections(self) -> Dict[str, Dict[str, float]]:
169
+ """
170
+ Aggregate causal links into component-to-component connections.
171
+
172
+ Returns:
173
+ Dict mapping (from_component, to_component) -> total strength
174
+ """
175
+ with self._lock:
176
+ connections: Dict[tuple, float] = {}
177
+
178
+ for link in self._links.values():
179
+ from_event = self._events.get(link.from_event)
180
+ to_event = self._events.get(link.to_event)
181
+
182
+ if from_event and to_event:
183
+ from_comp = from_event.component
184
+ to_comp = to_event.component
185
+
186
+ if from_comp != to_comp: # Skip self-links
187
+ key = (from_comp, to_comp)
188
+ connections[key] = connections.get(key, 0) + link.strength
189
+
190
+ return connections
191
+
192
+ def get_recent_events(self, count: int = 100) -> List[Event]:
193
+ """Get the most recent events by timestamp."""
194
+ with self._lock:
195
+ ids = self._events_by_time[-count:]
196
+ return [self._events[eid] for eid in reversed(ids)]
197
+
198
+ def get_events_by_component(self, component: str) -> List[Event]:
199
+ """Get all events from a specific component."""
200
+ with self._lock:
201
+ ids = self._events_by_component.get(component, [])
202
+ return [self._events[eid] for eid in ids]
203
+
204
+ def get_events_by_type(self, event_type: str) -> List[Event]:
205
+ """Get all events of a specific type."""
206
+ with self._lock:
207
+ ids = self._events_by_type.get(event_type, [])
208
+ return [self._events[eid] for eid in ids]
209
+
210
+ def find_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
211
+ """
212
+ Find the shortest causal path between two events.
213
+
214
+ Uses NetworkX if available, otherwise falls back to BFS.
215
+
216
+ Args:
217
+ from_event: Starting event ID
218
+ to_event: Target event ID
219
+
220
+ Returns:
221
+ List of event IDs in the path, or None if no path exists
222
+ """
223
+ with self._lock:
224
+ if self._nx_graph is not None:
225
+ try:
226
+ return nx.shortest_path(self._nx_graph, from_event, to_event)
227
+ except nx.NetworkXNoPath:
228
+ return None
229
+ except nx.NodeNotFound:
230
+ return None
231
+ else:
232
+ # BFS fallback
233
+ return self._bfs_path(from_event, to_event)
234
+
235
+ def _bfs_path(self, from_event: str, to_event: str) -> Optional[List[str]]:
236
+ """BFS path finding without NetworkX."""
237
+ from collections import deque
238
+
239
+ if from_event not in self._events or to_event not in self._events:
240
+ return None
241
+
242
+ queue = deque([(from_event, [from_event])])
243
+ visited = {from_event}
244
+
245
+ while queue:
246
+ current, path = queue.popleft()
247
+
248
+ if current == to_event:
249
+ return path
250
+
251
+ for effect_id in self._effects.get(current, set()):
252
+ if effect_id not in visited:
253
+ visited.add(effect_id)
254
+ queue.append((effect_id, path + [effect_id]))
255
+
256
+ return None
257
+
258
+ def get_root_events(self) -> List[Event]:
259
+ """Get events with no causes (entry points)."""
260
+ with self._lock:
261
+ roots = []
262
+ for event_id, event in self._events.items():
263
+ if not self._causes.get(event_id):
264
+ roots.append(event)
265
+ return sorted(roots, key=lambda e: e.timestamp)
266
+
267
+ def get_leaf_events(self) -> List[Event]:
268
+ """Get events with no effects (endpoints)."""
269
+ with self._lock:
270
+ leaves = []
271
+ for event_id, event in self._events.items():
272
+ if not self._effects.get(event_id):
273
+ leaves.append(event)
274
+ return sorted(leaves, key=lambda e: e.timestamp, reverse=True)
275
+
276
+ def get_stats(self) -> Dict[str, Any]:
277
+ """Get statistics about the graph."""
278
+ with self._lock:
279
+ return {
280
+ "event_count": self._event_count,
281
+ "link_count": self._link_count,
282
+ "components": list(self._events_by_component.keys()),
283
+ "event_types": list(self._events_by_type.keys()),
284
+ "root_count": len(self.get_root_events()),
285
+ "leaf_count": len(self.get_leaf_events()),
286
+ }
287
+
288
+ def __len__(self) -> int:
289
+ return self._event_count
290
+
291
+ def __repr__(self) -> str:
292
+ return f"<CausationGraph | {self._event_count} events, {self._link_count} links>"
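# Illustrative sketch only (not part of this commit): a three-event chain walked
# in both directions. Events, links, and strengths are made up.
import time
from cascade.core.event import Event, CausationLink
from cascade.core.graph import CausationGraph

graph = CausationGraph()
events = [
    Event(timestamp=time.time(), component="data", event_type="state_change", data={"batch": 7}),
    Event(timestamp=time.time(), component="loss", event_type="anomaly", data={"loss": float("inf")}),
    Event(timestamp=time.time(), component="trainer", event_type="error", data={"msg": "aborted"}),
]
for e in events:
    graph.add_event(e)
graph.add_link(CausationLink(events[0].event_id, events[1].event_id, "temporal", 0.8))
graph.add_link(CausationLink(events[1].event_id, events[2].event_id, "threshold", 0.95))

print([e.component for e in graph.get_causes(events[2].event_id)])    # ['loss']
print(graph.find_path(events[0].event_id, events[2].event_id))        # list of the three event IDs
print(graph.get_stats()["root_count"], graph.get_stats()["leaf_count"])  # 1 1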
cascade/core/provenance.py ADDED
@@ -0,0 +1,601 @@
1
+ """
2
+ CASCADE // PROVENANCE ENGINE
3
+ Cryptographic lineage tracking for neural network activations.
4
+
5
+ Due process infrastructure for AI - immutable evidence chains
6
+ that enable governance without prescribing decisions.
7
+
8
+ Architecture:
9
+ Input → [Layer₀] → [Layer₁] → ... → [Layerₙ] → Output
10
+ │ │ │
11
+ ▼ ▼ ▼
12
+ Hash₀ ──► Hash₁ ──► ... ──► Hashₙ
13
+ │ │
14
+ └───────── Merkle Root ─────┘
15
+
16
+ Each hash includes:
17
+ - Tensor state (sampled for efficiency)
18
+ - Parent hashes (inputs to this layer)
19
+ - Layer identity (name, params hash)
20
+ - Execution context (order, timestamp)
21
+
22
+ This creates verifiable, tamper-evident records of
23
+ what happened inside the network.
24
+ """
25
+
26
+ import hashlib
27
+ import json
28
+ import time
29
+ from dataclasses import dataclass, field, asdict
30
+ from typing import Dict, List, Optional, Any, Tuple
31
+ from collections import OrderedDict
32
+ import numpy as np
33
+
34
+
35
+ @dataclass
36
+ class ProvenanceRecord:
37
+ """Immutable record of a single layer's activation state."""
38
+
39
+ # Identity
40
+ layer_name: str
41
+ layer_idx: int
42
+
43
+ # Lineage
44
+ state_hash: str # Hash of this layer's output
45
+ parent_hashes: List[str] # Hashes of inputs (usually 1, but attention has multiple)
46
+ params_hash: Optional[str] = None # Hash of layer weights (frozen reference)
47
+
48
+ # Tensor metadata
49
+ shape: List[int] = field(default_factory=list)
50
+ dtype: str = "float32"
51
+
52
+ # Statistics (for visualization, not hashed)
53
+ stats: Dict[str, float] = field(default_factory=dict)
54
+
55
+ # Execution context
56
+ execution_order: int = 0
57
+ timestamp: float = field(default_factory=time.time)
58
+
59
+ # Merkle tree position
60
+ merkle_depth: int = 0
61
+ merkle_path: List[str] = field(default_factory=list)
62
+
63
+ def to_dict(self) -> Dict[str, Any]:
64
+ """Serialize for JSON export."""
65
+ return asdict(self)
66
+
67
+ @classmethod
68
+ def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceRecord':
69
+ """Deserialize from JSON."""
70
+ return cls(**data)
71
+
72
+
73
+ @dataclass
74
+ class ProvenanceChain:
75
+ """Complete provenance chain for a forward pass."""
76
+
77
+ # Session identity
78
+ session_id: str
79
+ model_id: str
80
+ model_hash: str
81
+
82
+ # Input/output
83
+ input_hash: str
84
+ output_hash: Optional[str] = None
85
+
86
+ # The chain itself
87
+ records: Dict[str, ProvenanceRecord] = field(default_factory=OrderedDict)
88
+
89
+ # External system roots (for inter-system linking)
90
+ # When this chain depends on another system's computation,
91
+ # include their merkle_root here. This creates the lattice.
92
+ external_roots: List[str] = field(default_factory=list)
93
+
94
+ # Merkle root (computed after chain complete)
95
+ merkle_root: Optional[str] = None
96
+
97
+ # Metadata
98
+ created_at: float = field(default_factory=time.time)
99
+ finalized: bool = False
100
+
101
+ def add_record(self, record: ProvenanceRecord) -> None:
102
+ """Add a record to the chain. Chain must not be finalized."""
103
+ if self.finalized:
104
+ raise ValueError("Cannot add to finalized chain")
105
+ self.records[record.layer_name] = record
106
+
107
+ def finalize(self) -> str:
108
+ """Compute Merkle root and lock the chain."""
109
+ if self.finalized:
110
+ return self.merkle_root
111
+
112
+ # Build Merkle tree from record hashes + external roots
113
+ # External roots create cryptographic proof of inter-system dependency
114
+ hashes = [r.state_hash for r in self.records.values()]
115
+ hashes.extend(self.external_roots) # Include external system roots
116
+ self.merkle_root = compute_merkle_root(hashes)
117
+ self.finalized = True
118
+ return self.merkle_root
119
+
120
+ def verify(self) -> Tuple[bool, Optional[str]]:
121
+ """Verify chain integrity."""
122
+ if not self.finalized:
123
+ return False, "Chain not finalized"
124
+
125
+ # Recompute Merkle root (including external roots)
126
+ hashes = [r.state_hash for r in self.records.values()]
127
+ hashes.extend(self.external_roots) # Must include external roots
128
+ computed_root = compute_merkle_root(hashes)
129
+
130
+ if computed_root != self.merkle_root:
131
+ return False, f"Merkle root mismatch: {computed_root} != {self.merkle_root}"
132
+
133
+ return True, None
134
+
135
+ def link_external(self, external_merkle_root: str, source_id: Optional[str] = None) -> None:
136
+ """
137
+ Link this chain to another system's merkle root.
138
+
139
+ This creates the neural internetwork - cryptographic proof
140
+ that this computation depended on another system's output.
141
+
142
+ Args:
143
+ external_merkle_root: The merkle root from the external system
144
+ source_id: Optional identifier of the source system
145
+ """
146
+ if self.finalized:
147
+ raise ValueError("Cannot link external root to finalized chain")
148
+ self.external_roots.append(external_merkle_root)
149
+
150
+ def get_lineage(self, layer_name: str) -> List[ProvenanceRecord]:
151
+ """Trace back from a layer to its ancestors."""
152
+ if layer_name not in self.records:
153
+ return []
154
+
155
+ lineage = []
156
+ current = self.records[layer_name]
157
+ visited = set()
158
+
159
+ def trace_back(record: ProvenanceRecord):
160
+ if record.layer_name in visited:
161
+ return
162
+ visited.add(record.layer_name)
163
+ lineage.append(record)
164
+
165
+ for parent_hash in record.parent_hashes:
166
+ # Find record with this hash
167
+ for r in self.records.values():
168
+ if r.state_hash == parent_hash:
169
+ trace_back(r)
170
+ break
171
+
172
+ trace_back(current)
173
+ return lineage
174
+
175
+ def to_dict(self) -> Dict[str, Any]:
176
+ """Serialize entire chain."""
177
+ return {
178
+ "session_id": self.session_id,
179
+ "model_id": self.model_id,
180
+ "model_hash": self.model_hash,
181
+ "input_hash": self.input_hash,
182
+ "output_hash": self.output_hash,
183
+ "external_roots": self.external_roots, # Inter-system links
184
+ "merkle_root": self.merkle_root,
185
+ "created_at": self.created_at,
186
+ "finalized": self.finalized,
187
+ "records": {k: v.to_dict() for k, v in self.records.items()}
188
+ }
189
+
190
+ def to_json(self, indent: int = 2) -> str:
191
+ """Export as JSON."""
192
+ return json.dumps(self.to_dict(), indent=indent)
193
+
194
+ @classmethod
195
+ def from_dict(cls, data: Dict[str, Any]) -> 'ProvenanceChain':
196
+ """Deserialize from dict."""
197
+ records = OrderedDict()
198
+ for k, v in data.get("records", {}).items():
199
+ records[k] = ProvenanceRecord.from_dict(v)
200
+
201
+ chain = cls(
202
+ session_id=data["session_id"],
203
+ model_id=data["model_id"],
204
+ model_hash=data["model_hash"],
205
+ input_hash=data["input_hash"],
206
+ output_hash=data.get("output_hash"),
207
+ external_roots=data.get("external_roots", []), # Inter-system links
208
+ merkle_root=data.get("merkle_root"),
209
+ created_at=data.get("created_at", time.time()),
210
+ finalized=data.get("finalized", False),
211
+ )
212
+ chain.records = records
213
+ return chain
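# Illustrative sketch only (not part of this commit): two hand-built chains where
# the downstream chain commits to the upstream chain's Merkle root via
# link_external(). All identifiers and hashes here are arbitrary placeholders,
# not real layer states.
from cascade.core.provenance import ProvenanceChain, ProvenanceRecord

upstream = ProvenanceChain(session_id="up01", model_id="encoder",
                           model_hash="m-enc", input_hash="in-enc")
upstream.add_record(ProvenanceRecord(layer_name="enc.block0", layer_idx=0,
                                     state_hash="h-enc-0", parent_hashes=["in-enc"]))
upstream_root = upstream.finalize()

downstream = ProvenanceChain(session_id="dn01", model_id="decoder",
                             model_hash="m-dec", input_hash="in-dec")
downstream.link_external(upstream_root, source_id="encoder")
downstream.add_record(ProvenanceRecord(layer_name="dec.block0", layer_idx=0,
                                       state_hash="h-dec-0", parent_hashes=["in-dec"]))
downstream.finalize()

print(downstream.external_roots == [upstream_root])    # True
print(downstream.verify())                             # (True, None)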
214
+
215
+
216
+ # =============================================================================
217
+ # HASHING FUNCTIONS
218
+ # =============================================================================
219
+
220
+ def hash_tensor(tensor, sample_size: int = 1000) -> str:
221
+ """
222
+ Compute deterministic hash of tensor state.
223
+
224
+ Samples tensor for efficiency - full hash would be too slow
225
+ for large activations. Sample is deterministic (first N elements
226
+ after flatten) so hash is reproducible.
227
+
228
+ Args:
229
+ tensor: PyTorch tensor or numpy array
230
+ sample_size: Number of elements to sample
231
+
232
+ Returns:
233
+ 16-character hex hash
234
+ """
235
+ # Convert to numpy if needed
236
+ if hasattr(tensor, 'detach'):
237
+ # PyTorch tensor
238
+ arr = tensor.detach().cpu().float().numpy()
239
+ elif hasattr(tensor, 'numpy'):
240
+ arr = tensor.numpy()
241
+ else:
242
+ arr = np.array(tensor)
243
+
244
+ # Flatten and sample
245
+ flat = arr.flatten()
246
+ sample = flat[:min(sample_size, len(flat))]
247
+
248
+ # Hash the bytes
249
+ # Include shape in hash so same values in different shapes hash differently
250
+ shape_bytes = str(arr.shape).encode('utf-8')
251
+ tensor_bytes = sample.astype(np.float32).tobytes()
252
+
253
+ combined = shape_bytes + tensor_bytes
254
+ return hashlib.sha256(combined).hexdigest()[:16]
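# Illustrative sketch only (not part of this commit): the hash is deterministic
# for the same values and shape, but shape participates in the hash, so a reshape
# changes it. Plain NumPy arrays are used; no deep-learning framework required.
import numpy as np
from cascade.core.provenance import hash_tensor

a = np.arange(12, dtype=np.float32)
b = a.reshape(3, 4)

assert hash_tensor(a) == hash_tensor(a.copy())   # same values, same shape
assert hash_tensor(a) != hash_tensor(b)          # same values, different shape
print(hash_tensor(a))                            # 16-char hex digest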
255
+
256
+
257
+ def hash_params(module) -> str:
258
+ """
259
+ Hash a module's parameters (weights, biases).
260
+
261
+ This creates a frozen reference to the model state at observation time.
262
+ If weights change, this hash changes.
263
+ """
264
+ param_hashes = []
265
+
266
+ for name, param in module.named_parameters(recurse=False):
267
+ if param is not None:
268
+ h = hash_tensor(param.data, sample_size=500)
269
+ param_hashes.append(f"{name}:{h}")
270
+
271
+ if not param_hashes:
272
+ return "no_params"
273
+
274
+ combined = "|".join(sorted(param_hashes))
275
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
276
+
277
+
278
+ def hash_model(model) -> str:
279
+ """
280
+ Hash entire model state.
281
+
282
+ This is the model's identity hash - changes if any weight changes.
283
+ """
284
+ all_hashes = []
285
+
286
+ for name, param in model.named_parameters():
287
+ h = hash_tensor(param.data, sample_size=100)
288
+ all_hashes.append(f"{name}:{h}")
289
+
290
+ combined = "|".join(all_hashes)
291
+ return hashlib.sha256(combined.encode()).hexdigest()[:32]
292
+
293
+
294
+ def hash_input(data: Any) -> str:
295
+ """
296
+ Hash input data (text, tokens, images, etc).
297
+ """
298
+ if isinstance(data, str):
299
+ return hashlib.sha256(data.encode('utf-8')).hexdigest()[:16]
300
+ elif hasattr(data, 'detach'):
301
+ return hash_tensor(data)
302
+ elif isinstance(data, dict):
303
+ # Tokenizer output
304
+ combined = json.dumps({k: str(v) for k, v in sorted(data.items())})
305
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
306
+ else:
307
+ return hashlib.sha256(str(data).encode()).hexdigest()[:16]
308
+
309
+
310
+ def compute_merkle_root(hashes: List[str]) -> str:
311
+ """
312
+ Compute Merkle root from list of hashes.
313
+
314
+ Standard Merkle tree construction - pairs hashes bottom-up
315
+ until single root remains.
316
+ """
317
+ if not hashes:
318
+ return hashlib.sha256(b"empty").hexdigest()[:16]
319
+
320
+ if len(hashes) == 1:
321
+ return hashes[0]
322
+
323
+ # Pad to even length
324
+ if len(hashes) % 2 == 1:
325
+ hashes = hashes + [hashes[-1]]
326
+
327
+ # Compute next level
328
+ next_level = []
329
+ for i in range(0, len(hashes), 2):
330
+ combined = hashes[i] + hashes[i + 1]
331
+ next_hash = hashlib.sha256(combined.encode()).hexdigest()[:16]
332
+ next_level.append(next_hash)
333
+
334
+ return compute_merkle_root(next_level)
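# Illustrative sketch only (not part of this commit): the root is stable for a
# given ordered list of layer hashes and changes if any single hash is altered.
# The input hashes here are arbitrary hex strings, not real layer states.
from cascade.core.provenance import compute_merkle_root

layer_hashes = ["aa11", "bb22", "cc33"]          # odd count -> last hash is duplicated
root = compute_merkle_root(layer_hashes)

assert root == compute_merkle_root(list(layer_hashes))        # deterministic
assert root != compute_merkle_root(["aa11", "bb22", "cc44"])  # tamper-evident
print(root)   # 16-char hex digest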
335
+
336
+
337
+ # =============================================================================
338
+ # PROVENANCE TRACKER (attaches to model)
339
+ # =============================================================================
340
+
341
+ class ProvenanceTracker:
342
+ """
343
+ Tracks provenance during model forward pass.
344
+
345
+ Usage:
346
+ tracker = ProvenanceTracker(model, model_id="gpt2")
347
+ tracker.start_session(input_text)
348
+
349
+ # Run forward pass - hooks capture everything
350
+ output = model(**inputs)
351
+
352
+ chain = tracker.finalize_session()
353
+ print(chain.merkle_root)
354
+
355
+ NEW: Now writes to tape file (JSONL) for redundant logging!
356
+ Complements the Live Tracer - both systems log independently.
357
+ """
358
+
359
+ def __init__(self, model, model_id: str, log_dir: str = "./logs"):
360
+ self.model = model
361
+ self.model_id = model_id
362
+ self.model_hash = hash_model(model)
363
+
364
+ self.hooks = []
365
+ self.current_chain: Optional[ProvenanceChain] = None
366
+ self.execution_counter = 0
367
+ self.last_hash = None # Track for parent linking
368
+ self.layer_hashes: Dict[str, str] = {} # layer_name -> hash
369
+
370
+ # === TAPE FILE FOR REDUNDANT LOGGING ===
371
+ from pathlib import Path
372
+ from threading import Lock
373
+ self._log_dir = Path(log_dir)
374
+ self._log_dir.mkdir(parents=True, exist_ok=True)
375
+ self._session_id = int(time.time())
376
+ self._tape_path = self._log_dir / f"provenance_tape_{self._session_id}.jsonl"
377
+ self._tape_file = None
378
+ self._tape_lock = Lock()
379
+ self._record_count = 0
380
+
381
+ def start_session(self, input_data: Any) -> str:
382
+ """Start a new provenance tracking session."""
383
+ import uuid
384
+
385
+ session_id = str(uuid.uuid4())[:8]
386
+ input_hash = hash_input(input_data)
387
+
388
+ self.current_chain = ProvenanceChain(
389
+ session_id=session_id,
390
+ model_id=self.model_id,
391
+ model_hash=self.model_hash,
392
+ input_hash=input_hash
393
+ )
394
+
395
+ self.execution_counter = 0
396
+ self.last_hash = input_hash
397
+ self.layer_hashes = {"input": input_hash}
398
+
399
+ # Register hooks
400
+ self._register_hooks()
401
+
402
+ return session_id
403
+
404
+ def _register_hooks(self):
405
+ """Register forward hooks on all modules."""
406
+ self._remove_hooks() # Clean up any existing
407
+
408
+ for name, module in self.model.named_modules():
409
+ if name: # Skip root
410
+ hook = module.register_forward_hook(
411
+ self._make_hook(name)
412
+ )
413
+ self.hooks.append(hook)
414
+
415
+ def _make_hook(self, layer_name: str):
416
+ """Create a forward hook for a specific layer."""
417
+ def hook(module, inp, out):
418
+ # Extract tensor
419
+ tensor = None
420
+ if hasattr(out, 'detach'):
421
+ tensor = out
422
+ elif isinstance(out, tuple) and len(out) > 0 and hasattr(out[0], 'detach'):
423
+ tensor = out[0]
424
+ elif hasattr(out, 'last_hidden_state'):
425
+ tensor = out.last_hidden_state
426
+ elif hasattr(out, 'logits'):
427
+ tensor = out.logits
428
+
429
+ if tensor is None or not hasattr(tensor, 'numel') or tensor.numel() == 0:
430
+ return
431
+
432
+ # Compute hashes
433
+ state_hash = hash_tensor(tensor)
434
+ params_hash = hash_params(module)
435
+
436
+ # Determine parent hashes
437
+ # For now, use last layer's hash. More sophisticated: track actual data flow.
438
+ parent_hashes = [self.last_hash] if self.last_hash else []
439
+
440
+ # Compute stats
441
+ t = tensor.float()
442
+ stats = {
443
+ "mean": t.mean().item(),
444
+ "std": t.std().item(),
445
+ "min": t.min().item(),
446
+ "max": t.max().item(),
447
+ "sparsity": (tensor == 0).float().mean().item(),
448
+ }
449
+
450
+ # Create record
451
+ record = ProvenanceRecord(
452
+ layer_name=layer_name,
453
+ layer_idx=self.execution_counter,
454
+ state_hash=state_hash,
455
+ parent_hashes=parent_hashes,
456
+ params_hash=params_hash,
457
+ shape=list(tensor.shape),
458
+ dtype=str(tensor.dtype),
459
+ stats=stats,
460
+ execution_order=self.execution_counter,
461
+ )
462
+
463
+ # Add to chain
464
+ if self.current_chain:
465
+ self.current_chain.add_record(record)
466
+
467
+ # === WRITE TO TAPE (REDUNDANT LOGGING) ===
468
+ self._write_to_tape(record)
469
+
470
+ # Update tracking
471
+ self.last_hash = state_hash
472
+ self.layer_hashes[layer_name] = state_hash
473
+ self.execution_counter += 1
474
+ self._record_count += 1
475
+
476
+ return hook
477
+
478
+ def _write_to_tape(self, record: ProvenanceRecord):
479
+ """Write provenance record to tape file for redundant logging."""
480
+ import json
481
+ try:
482
+ with self._tape_lock:
483
+ if self._tape_file is None:
484
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
485
+ print(f"[CASCADE] 📼 Provenance tape started: {self._tape_path}")
486
+
487
+ tape_record = {
488
+ "seq": self._record_count,
489
+ "record": record.to_dict(),
490
+ "session_id": self._session_id,
491
+ "model_id": self.model_id,
492
+ }
493
+ self._tape_file.write(json.dumps(tape_record, default=str) + "\n")
494
+ self._tape_file.flush()
495
+ except Exception:
496
+ pass # Don't let tape errors break the main flow
497
+
498
+ def close_tape(self):
499
+ """Close the tape file."""
500
+ with self._tape_lock:
501
+ if self._tape_file:
502
+ self._tape_file.close()
503
+ self._tape_file = None
504
+ print(f"[CASCADE] 📼 Provenance tape closed: {self._record_count} records → {self._tape_path}")
505
+
506
+ def get_tape_path(self):
507
+ """Get the current tape file path."""
508
+ return self._tape_path
509
+
510
+ def _remove_hooks(self):
511
+ """Remove all registered hooks."""
512
+ for hook in self.hooks:
513
+ hook.remove()
514
+ self.hooks = []
515
+
516
+ def finalize_session(self, output_data: Any = None) -> ProvenanceChain:
517
+ """Finalize session, compute Merkle root, return chain."""
518
+ self._remove_hooks()
519
+
520
+ if self.current_chain is None:
521
+ raise ValueError("No active session")
522
+
523
+ if output_data is not None:
524
+ self.current_chain.output_hash = hash_input(output_data)
525
+
526
+ self.current_chain.finalize()
527
+
528
+ # Close tape (session complete)
529
+ self.close_tape()
530
+
531
+ chain = self.current_chain
532
+ self.current_chain = None
533
+
534
+ return chain
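
A minimal sketch of replaying the tape file after a session, assuming a tape produced by ProvenanceTracker; the field names ("seq", "record", "state_hash") mirror what _write_to_tape emits, and the path below is illustrative.

import json

def replay_tape(tape_path: str):
    """Re-read a provenance tape and return (seq, layer_name, state_hash) tuples in order."""
    rows = []
    with open(tape_path, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)       # keys: seq, record, session_id, model_id
            rec = entry["record"]          # ProvenanceRecord.to_dict() payload
            rows.append((entry["seq"], rec["layer_name"], rec["state_hash"]))
    return rows

# Illustrative path from a tracker run:
# for seq, layer, h in replay_tape("./logs/provenance_tape_1700000000.jsonl"):
#     print(seq, layer, h[:12])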
535
+
536
+
537
+ # =============================================================================
538
+ # VERIFICATION & COMPARISON
539
+ # =============================================================================
540
+
541
+ def verify_chain(chain: ProvenanceChain) -> Tuple[bool, str]:
542
+ """Verify a provenance chain's integrity."""
543
+ return chain.verify()
544
+
545
+
546
+ def compare_chains(chain_a: ProvenanceChain, chain_b: ProvenanceChain) -> Dict[str, Any]:
547
+ """
548
+ Compare two provenance chains.
549
+
550
+ Useful for:
551
+ - Same model, different inputs (where did outputs diverge?)
552
+ - Different models, same input (structural comparison)
553
+ - Same everything (reproducibility check)
554
+ """
555
+ result = {
556
+ "model_match": chain_a.model_hash == chain_b.model_hash,
557
+ "input_match": chain_a.input_hash == chain_b.input_hash,
558
+ "output_match": chain_a.output_hash == chain_b.output_hash,
559
+ "merkle_match": chain_a.merkle_root == chain_b.merkle_root,
560
+ "divergence_points": [],
561
+ "a_only_layers": [],
562
+ "b_only_layers": [],
563
+ "matching_layers": [],
564
+ }
565
+
566
+ a_layers = set(chain_a.records.keys())
567
+ b_layers = set(chain_b.records.keys())
568
+
569
+ result["a_only_layers"] = list(a_layers - b_layers)
570
+ result["b_only_layers"] = list(b_layers - a_layers)
571
+
572
+ # Compare matching layers
573
+ for layer in a_layers & b_layers:
574
+ rec_a = chain_a.records[layer]
575
+ rec_b = chain_b.records[layer]
576
+
577
+ if rec_a.state_hash == rec_b.state_hash:
578
+ result["matching_layers"].append(layer)
579
+ else:
580
+ result["divergence_points"].append({
581
+ "layer": layer,
582
+ "hash_a": rec_a.state_hash,
583
+ "hash_b": rec_b.state_hash,
584
+ "stats_a": rec_a.stats,
585
+ "stats_b": rec_b.stats,
586
+ })
587
+
588
+ return result
589
+
590
+
591
+ def export_chain_for_audit(chain: ProvenanceChain, filepath: str) -> None:
592
+ """Export chain to file for external audit."""
593
+ with open(filepath, 'w') as f:
594
+ f.write(chain.to_json(indent=2))
595
+
596
+
597
+ def import_chain_for_audit(filepath: str) -> ProvenanceChain:
598
+ """Import chain from audit file."""
599
+ with open(filepath, 'r') as f:
600
+ data = json.load(f)
601
+ return ProvenanceChain.from_dict(data)
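
A short usage sketch of the verification helpers above; chain_a and chain_b are assumed to be finalized ProvenanceChain objects from two tracker sessions.

# Integrity check on a single chain.
ok, reason = verify_chain(chain_a)
print("chain_a valid:", ok, reason)

# Layer-by-layer comparison of two runs.
diff = compare_chains(chain_a, chain_b)
print("merkle roots match:", diff["merkle_match"])
for point in diff["divergence_points"]:
    print("diverged at", point["layer"], point["hash_a"][:12], "!=", point["hash_b"][:12])

# Round-trip through the audit file format.
export_chain_for_audit(chain_a, "chain_a_audit.json")
restored = import_chain_for_audit("chain_a_audit.json")
assert restored.merkle_root == chain_a.merkle_root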
cascade/core/web3_bridge.py ADDED
@@ -0,0 +1,846 @@
1
+ """
2
+ CASCADE // WEB3 BRIDGE
3
+ Blockchain integration for AI provenance.
4
+
5
+ The bridge between neural networks and decentralized infrastructure.
6
+
7
+ ┌─────────────────────────────────────────────────────────────────┐
8
+ │ THE IMMUTABLE RECORD │
9
+ │ │
10
+ │ AI Inference ──► Provenance Chain ──► Merkle Root ──► Chain │
11
+ │ │ │
12
+ │ ▼ │
13
+ │ ┌─────────────────────────────────┐ │
14
+ │ │ ETHEREUM / SOLANA / etc │ │
15
+ │ │ ┌───────────────────────────┐ │ │
16
+ │ │ │ Attestation Contract │ │ │
17
+ │ │ │ - Model hash │ │ │
18
+ │ │ │ - Input hash │ │ │
19
+ │ │ │ - Merkle root │ │ │
20
+ │ │ │ - Timestamp │ │ │
21
+ │ │ └───────────────────────────┘ │ │
22
+ │ └─────────────────────────────────┘ │
23
+ │ │ │
24
+ │ ▼ │
25
+ │ IPFS / Arweave / Filecoin │
26
+ │ (Full provenance chain storage) │
27
+ └─────────────────────────────────────────────────────────────────┘
28
+
29
+ Web3 provides:
30
+ - Timestamping (block finality)
31
+ - Immutability (blockchain consensus)
32
+ - Decentralized storage (IPFS)
33
+ - Public verifiability (anyone can audit)
34
+ - Economic incentives (staking, reputation)
35
+
36
+ This module provides:
37
+ - EIP-712 typed data signatures (Ethereum standard)
38
+ - IPFS CID computation (content addressing)
39
+ - Smart contract ABI for attestation
40
+ - Multi-chain attestation format
41
+ - NFT metadata for provenance tokens
42
+ """
43
+
44
+ import hashlib
45
+ import json
46
+ import time
47
+ import struct
48
+ from typing import Dict, List, Optional, Any, Tuple
49
+ from dataclasses import dataclass, field, asdict
50
+ import base64
51
+
52
+ try:
53
+ from .provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root
54
+ except ImportError:
55
+ from provenance import ProvenanceChain, ProvenanceRecord, compute_merkle_root
56
+
57
+
58
+ # =============================================================================
59
+ # CONSTANTS
60
+ # =============================================================================
61
+
62
+ # EIP-712 Domain for CASCADE attestations
63
+ CASCADE_DOMAIN = {
64
+ "name": "CASCADE Provenance",
65
+ "version": "1",
66
+ "chainId": 1, # Ethereum mainnet, override for other chains
67
+ "verifyingContract": "0x0000000000000000000000000000000000000000", # Set on deployment
68
+ }
69
+
70
+ # Attestation type definition for EIP-712
71
+ ATTESTATION_TYPES = {
72
+ "Attestation": [
73
+ {"name": "model_hash", "type": "bytes32"},
74
+ {"name": "input_hash", "type": "bytes32"},
75
+ {"name": "merkle_root", "type": "bytes32"},
76
+ {"name": "timestamp", "type": "uint256"},
77
+ {"name": "session_id", "type": "string"},
78
+ {"name": "layer_count", "type": "uint256"},
79
+ ]
80
+ }
81
+
82
+
83
+ # =============================================================================
84
+ # ATTESTATION RECORD
85
+ # =============================================================================
86
+
87
+ @dataclass
88
+ class Web3Attestation:
89
+ """
90
+ Blockchain-ready attestation of AI inference provenance.
91
+
92
+ This is the "receipt" that can be posted on-chain.
93
+ Minimal data for on-chain storage, full data on IPFS.
94
+ """
95
+
96
+ # Core identity
97
+ model_hash: str # 32-byte hash of model weights
98
+ input_hash: str # 32-byte hash of input data
99
+ output_hash: str # 32-byte hash of output
100
+ merkle_root: str # Merkle root of provenance chain
101
+
102
+ # Metadata
103
+ session_id: str # Unique session identifier
104
+ timestamp: int # Unix timestamp
105
+ layer_count: int # Number of layers in chain
106
+
107
+ # Content addressing
108
+ ipfs_cid: Optional[str] = None # IPFS CID for full chain
109
+ arweave_id: Optional[str] = None # Arweave transaction ID
110
+
111
+ # Signatures (set by wallet)
112
+ signature: Optional[str] = None # EIP-712 signature
113
+ signer: Optional[str] = None # Ethereum address
114
+
115
+ # Chain info
116
+ chain_id: int = 1 # 1=Ethereum, 137=Polygon, etc.
117
+ contract_address: Optional[str] = None
118
+ tx_hash: Optional[str] = None # Transaction hash after posting
119
+
120
+ def to_eip712_message(self, domain: Optional[Dict] = None) -> Dict[str, Any]:
121
+ """
122
+ Format as EIP-712 typed data for signing.
123
+
124
+ This is the standard Ethereum signing format that wallets understand.
125
+ """
126
+ domain = domain or CASCADE_DOMAIN
127
+
128
+ return {
129
+ "types": {
130
+ "EIP712Domain": [
131
+ {"name": "name", "type": "string"},
132
+ {"name": "version", "type": "string"},
133
+ {"name": "chainId", "type": "uint256"},
134
+ {"name": "verifyingContract", "type": "address"},
135
+ ],
136
+ **ATTESTATION_TYPES
137
+ },
138
+ "primaryType": "Attestation",
139
+ "domain": domain,
140
+ "message": {
141
+ "model_hash": self._to_bytes32(self.model_hash),
142
+ "input_hash": self._to_bytes32(self.input_hash),
143
+ "merkle_root": self._to_bytes32(self.merkle_root),
144
+ "timestamp": self.timestamp,
145
+ "session_id": self.session_id,
146
+ "layer_count": self.layer_count,
147
+ }
148
+ }
149
+
150
+ def _to_bytes32(self, hex_str: str) -> str:
151
+ """Pad hash to bytes32 format."""
152
+ # Remove 0x prefix if present
153
+ clean = hex_str.replace("0x", "")
154
+ # Pad to 64 chars (32 bytes)
155
+ padded = clean.zfill(64)
156
+ return "0x" + padded
157
+
158
+ def to_contract_args(self) -> Tuple:
159
+ """
160
+ Format for smart contract function call.
161
+
162
+ Returns tuple matching:
163
+ function attest(bytes32 modelHash, bytes32 inputHash, bytes32 merkleRoot,
164
+ string memory sessionId, uint256 layerCount)
165
+ """
166
+ return (
167
+ bytes.fromhex(self.model_hash.replace("0x", "").zfill(64)),
168
+ bytes.fromhex(self.input_hash.replace("0x", "").zfill(64)),
169
+ bytes.fromhex(self.merkle_root.replace("0x", "").zfill(64)),
170
+ self.session_id,
171
+ self.layer_count,
172
+ )
173
+
174
+ def to_dict(self) -> Dict[str, Any]:
175
+ """Serialize for storage/transmission."""
176
+ return asdict(self)
177
+
178
+ def to_json(self) -> str:
179
+ """JSON export."""
180
+ return json.dumps(self.to_dict(), indent=2)
181
+
182
+ @classmethod
183
+ def from_chain(cls, chain: ProvenanceChain) -> 'Web3Attestation':
184
+ """Create attestation from provenance chain."""
185
+ if not chain.finalized:
186
+ chain.finalize()
187
+
188
+ return cls(
189
+ model_hash=chain.model_hash,
190
+ input_hash=chain.input_hash,
191
+ output_hash=chain.output_hash or "0" * 16,
192
+ merkle_root=chain.merkle_root,
193
+ session_id=chain.session_id,
194
+ timestamp=int(chain.created_at),
195
+ layer_count=len(chain.records),
196
+ )
197
+
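
A sketch of turning a finalized chain into a signing-ready payload; `chain` is an assumed finalized ProvenanceChain, and the chainId override (137 for Polygon) follows the CHAIN_CONFIGS table later in this module.

attestation = Web3Attestation.from_chain(chain)

# EIP-712 payload suitable for eth_signTypedData_v4.
typed_data = attestation.to_eip712_message({**CASCADE_DOMAIN, "chainId": 137})
print(typed_data["primaryType"])             # "Attestation"
print(typed_data["message"]["merkle_root"])  # 0x-prefixed bytes32

# Positional arguments for the on-chain attest(...) call.
args = attestation.to_contract_args()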
198
+
199
+ # =============================================================================
200
+ # IPFS CONTENT ADDRESSING
201
+ # =============================================================================
202
+
203
+ def compute_ipfs_cid_v0(data: bytes) -> str:
204
+ """
205
+ Compute IPFS CID v0 (Qm...) for data.
206
+
207
+ This is a simplified computation; a real IPFS node applies UnixFS
+ wrapping and chunking, so the CID here may not exactly match the CID
+ reported by an IPFS node. Intended for content-addressing JSON chain data.
209
+
210
+ CIDv0 format: Base58(0x12 || 0x20 || SHA256(data))
211
+ """
212
+ # SHA-256 hash
213
+ sha_hash = hashlib.sha256(data).digest()
214
+
215
+ # Multihash prefix: 0x12 (sha2-256), 0x20 (32 bytes)
216
+ multihash = bytes([0x12, 0x20]) + sha_hash
217
+
218
+ # Base58 encode (Bitcoin alphabet)
219
+ return base58_encode(multihash)
220
+
221
+
222
+ def compute_ipfs_cid_v1(data: bytes) -> str:
223
+ """
224
+ Compute IPFS CID v1 (bafkrei..., raw codec) for data.
225
+
226
+ CIDv1 format: multibase || version || codec || multihash
227
+ """
228
+ # SHA-256 hash
229
+ sha_hash = hashlib.sha256(data).digest()
230
+
231
+ # Build CIDv1:
232
+ # 0x01 = CID version 1
233
+ # 0x55 = raw binary codec (could also use 0x71 for dag-cbor)
234
+ # 0x12 = sha2-256
235
+ # 0x20 = 32 bytes
236
+ cid_bytes = bytes([0x01, 0x55, 0x12, 0x20]) + sha_hash
237
+
238
+ # Base32 lower with 'b' prefix (multibase)
239
+ import base64
240
+ b32 = base64.b32encode(cid_bytes).decode('ascii').lower().rstrip('=')
241
+ return 'b' + b32
242
+
243
+
244
+ def base58_encode(data: bytes) -> str:
245
+ """Base58 encoding (Bitcoin alphabet)."""
246
+ ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
247
+
248
+ # Count leading zeros
249
+ leading_zeros = 0
250
+ for byte in data:
251
+ if byte == 0:
252
+ leading_zeros += 1
253
+ else:
254
+ break
255
+
256
+ # Convert to integer
257
+ num = int.from_bytes(data, 'big')
258
+
259
+ # Convert to base58
260
+ result = ""
261
+ while num > 0:
262
+ num, remainder = divmod(num, 58)
263
+ result = ALPHABET[remainder] + result
264
+
265
+ # Add leading '1's for each leading zero byte
266
+ return '1' * leading_zeros + result
267
+
268
+
269
+ def chain_to_ipfs_ready(chain: ProvenanceChain) -> Tuple[bytes, str]:
270
+ """
271
+ Prepare provenance chain for IPFS upload.
272
+
273
+ Returns:
274
+ (data_bytes, cid) - The data to upload and its expected CID
275
+ """
276
+ json_data = chain.to_json().encode('utf-8')
277
+ cid = compute_ipfs_cid_v0(json_data)
278
+ return json_data, cid
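
A small check of the CID helpers on arbitrary example bytes; as noted above, these simplified CIDs may not match what an IPFS node reports for the same content.

payload = b'{"hello": "cascade"}'

cid_v0 = compute_ipfs_cid_v0(payload)   # "Qm..."      (base58btc multihash)
cid_v1 = compute_ipfs_cid_v1(payload)   # "bafkrei..." (base32, raw codec)
print(cid_v0)
print(cid_v1)

# For a finalized chain (assumed variable `chain`):
# data, cid = chain_to_ipfs_ready(chain)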
279
+
280
+
281
+ # =============================================================================
282
+ # SMART CONTRACT ABI
283
+ # =============================================================================
284
+
285
+ CASCADE_ATTESTATION_ABI = [
286
+ {
287
+ "name": "Attest",
288
+ "type": "event",
289
+ "inputs": [
290
+ {"name": "attester", "type": "address", "indexed": True},
291
+ {"name": "modelHash", "type": "bytes32", "indexed": True},
292
+ {"name": "merkleRoot", "type": "bytes32", "indexed": False},
293
+ {"name": "sessionId", "type": "string", "indexed": False},
294
+ {"name": "timestamp", "type": "uint256", "indexed": False},
295
+ ]
296
+ },
297
+ {
298
+ "name": "attest",
299
+ "type": "function",
300
+ "stateMutability": "nonpayable",
301
+ "inputs": [
302
+ {"name": "modelHash", "type": "bytes32"},
303
+ {"name": "inputHash", "type": "bytes32"},
304
+ {"name": "merkleRoot", "type": "bytes32"},
305
+ {"name": "sessionId", "type": "string"},
306
+ {"name": "layerCount", "type": "uint256"},
307
+ ],
308
+ "outputs": [{"name": "attestationId", "type": "uint256"}]
309
+ },
310
+ {
311
+ "name": "verify",
312
+ "type": "function",
313
+ "stateMutability": "view",
314
+ "inputs": [
315
+ {"name": "attestationId", "type": "uint256"},
316
+ ],
317
+ "outputs": [
318
+ {"name": "valid", "type": "bool"},
319
+ {"name": "attester", "type": "address"},
320
+ {"name": "modelHash", "type": "bytes32"},
321
+ {"name": "merkleRoot", "type": "bytes32"},
322
+ ]
323
+ },
324
+ {
325
+ "name": "getAttestation",
326
+ "type": "function",
327
+ "stateMutability": "view",
328
+ "inputs": [
329
+ {"name": "attestationId", "type": "uint256"},
330
+ ],
331
+ "outputs": [
332
+ {"name": "attester", "type": "address"},
333
+ {"name": "modelHash", "type": "bytes32"},
334
+ {"name": "inputHash", "type": "bytes32"},
335
+ {"name": "merkleRoot", "type": "bytes32"},
336
+ {"name": "sessionId", "type": "string"},
337
+ {"name": "layerCount", "type": "uint256"},
338
+ {"name": "timestamp", "type": "uint256"},
339
+ ]
340
+ },
341
+ {
342
+ "name": "attestationsByModel",
343
+ "type": "function",
344
+ "stateMutability": "view",
345
+ "inputs": [
346
+ {"name": "modelHash", "type": "bytes32"},
347
+ ],
348
+ "outputs": [
349
+ {"name": "attestationIds", "type": "uint256[]"},
350
+ ]
351
+ },
352
+ ]
353
+
354
+
355
+ # Solidity source for the attestation contract
356
+ CASCADE_ATTESTATION_SOLIDITY = '''
357
+ // SPDX-License-Identifier: MIT
358
+ pragma solidity ^0.8.19;
359
+
360
+ /**
361
+ * @title CascadeAttestation
362
+ * @notice On-chain attestation of AI inference provenance
363
+ * @dev Stores Merkle roots for off-chain provenance chains
364
+ */
365
+ contract CascadeAttestation {
366
+
367
+ struct Attestation {
368
+ address attester;
369
+ bytes32 modelHash;
370
+ bytes32 inputHash;
371
+ bytes32 merkleRoot;
372
+ string sessionId;
373
+ uint256 layerCount;
374
+ uint256 timestamp;
375
+ string ipfsCid; // Optional: full chain on IPFS
376
+ }
377
+
378
+ // Attestation storage
379
+ mapping(uint256 => Attestation) public attestations;
380
+ uint256 public attestationCount;
381
+
382
+ // Index by model
383
+ mapping(bytes32 => uint256[]) public attestationsByModel;
384
+
385
+ // Index by attester
386
+ mapping(address => uint256[]) public attestationsByAttester;
387
+
388
+ // Events
389
+ event Attested(
390
+ uint256 indexed attestationId,
391
+ address indexed attester,
392
+ bytes32 indexed modelHash,
393
+ bytes32 merkleRoot,
394
+ string sessionId
395
+ );
396
+
397
+ /**
398
+ * @notice Create a new attestation
399
+ * @param modelHash Hash of the model weights
400
+ * @param inputHash Hash of the input data
401
+ * @param merkleRoot Merkle root of the provenance chain
402
+ * @param sessionId Unique session identifier
403
+ * @param layerCount Number of layers in the chain
404
+ * @return attestationId The ID of the new attestation
405
+ */
406
+ function attest(
407
+ bytes32 modelHash,
408
+ bytes32 inputHash,
409
+ bytes32 merkleRoot,
410
+ string memory sessionId,
411
+ uint256 layerCount
412
+ ) public returns (uint256 attestationId) {
413
+ attestationId = attestationCount++;
414
+
415
+ attestations[attestationId] = Attestation({
416
+ attester: msg.sender,
417
+ modelHash: modelHash,
418
+ inputHash: inputHash,
419
+ merkleRoot: merkleRoot,
420
+ sessionId: sessionId,
421
+ layerCount: layerCount,
422
+ timestamp: block.timestamp,
423
+ ipfsCid: ""
424
+ });
425
+
426
+ attestationsByModel[modelHash].push(attestationId);
427
+ attestationsByAttester[msg.sender].push(attestationId);
428
+
429
+ emit Attested(attestationId, msg.sender, modelHash, merkleRoot, sessionId);
430
+
431
+ return attestationId;
432
+ }
433
+
434
+ /**
435
+ * @notice Attest with IPFS CID for full chain data
436
+ */
437
+ function attestWithIPFS(
438
+ bytes32 modelHash,
439
+ bytes32 inputHash,
440
+ bytes32 merkleRoot,
441
+ string memory sessionId,
442
+ uint256 layerCount,
443
+ string memory ipfsCid
444
+ ) external returns (uint256 attestationId) {
445
+ // Direct internal call (not this.attest) so msg.sender stays the original attester.
+ attestationId = attest(modelHash, inputHash, merkleRoot, sessionId, layerCount);
446
+ attestations[attestationId].ipfsCid = ipfsCid;
447
+ return attestationId;
448
+ }
449
+
450
+ /**
451
+ * @notice Verify an attestation exists and return core data
452
+ */
453
+ function verify(uint256 attestationId) external view returns (
454
+ bool valid,
455
+ address attester,
456
+ bytes32 modelHash,
457
+ bytes32 merkleRoot
458
+ ) {
459
+ if (attestationId >= attestationCount) {
460
+ return (false, address(0), bytes32(0), bytes32(0));
461
+ }
462
+
463
+ Attestation storage a = attestations[attestationId];
464
+ return (true, a.attester, a.modelHash, a.merkleRoot);
465
+ }
466
+
467
+ /**
468
+ * @notice Get all attestations for a model
469
+ */
470
+ function getModelAttestations(bytes32 modelHash) external view returns (uint256[] memory) {
471
+ return attestationsByModel[modelHash];
472
+ }
473
+
474
+ /**
475
+ * @notice Get all attestations by an address
476
+ */
477
+ function getAttesterAttestations(address attester) external view returns (uint256[] memory) {
478
+ return attestationsByAttester[attester];
479
+ }
480
+ }
481
+ '''
482
+
483
+
484
+ # =============================================================================
485
+ # NFT METADATA (for provenance tokens)
486
+ # =============================================================================
487
+
488
+ def generate_nft_metadata(chain: ProvenanceChain,
489
+ image_url: Optional[str] = None,
490
+ animation_url: Optional[str] = None) -> Dict[str, Any]:
491
+ """
492
+ Generate ERC-721 compatible metadata for a provenance NFT.
493
+
494
+ Each unique model×input×output combination could be an NFT,
495
+ proving that this specific inference happened.
496
+ """
497
+ if not chain.finalized:
498
+ chain.finalize()
499
+
500
+ # Generate attributes from chain
501
+ attributes = [
502
+ {"trait_type": "Model Hash", "value": chain.model_hash[:16]},
503
+ {"trait_type": "Input Hash", "value": chain.input_hash},
504
+ {"trait_type": "Merkle Root", "value": chain.merkle_root},
505
+ {"trait_type": "Layer Count", "value": len(chain.records)},
506
+ {"trait_type": "Timestamp", "value": int(chain.created_at)},
507
+ ]
508
+
509
+ # Add layer statistics as traits
510
+ if chain.records:
511
+ total_params = 0
512
+ layer_types = set()
513
+ for record in chain.records.values():
514
+ if record.params_hash != "no_params":
515
+ total_params += 1
516
+ # Extract layer type from name
517
+ parts = record.layer_name.split('.')
518
+ if len(parts) >= 2:
519
+ layer_types.add(parts[-1])
520
+
521
+ attributes.append({"trait_type": "Parameterized Layers", "value": total_params})
522
+ for lt in list(layer_types)[:5]: # Max 5 layer types
523
+ attributes.append({"trait_type": f"Has {lt}", "value": "Yes"})
524
+
525
+ return {
526
+ "name": f"CASCADE Provenance #{chain.session_id}",
527
+ "description": f"Cryptographic proof of AI inference. Model: {chain.model_id}. "
528
+ f"This NFT attests that a specific input was processed through "
529
+ f"the model, producing a verifiable Merkle root of all layer activations.",
530
+ "image": image_url or "ipfs://QmDefaultCascadeImage", # Placeholder
531
+ "animation_url": animation_url, # Could link to 3D visualization
532
+ "external_url": f"https://cascade.ai/verify/{chain.session_id}",
533
+ "attributes": attributes,
534
+ "properties": {
535
+ "model_id": chain.model_id,
536
+ "model_hash": chain.model_hash,
537
+ "input_hash": chain.input_hash,
538
+ "output_hash": chain.output_hash,
539
+ "merkle_root": chain.merkle_root,
540
+ "session_id": chain.session_id,
541
+ "layer_count": len(chain.records),
542
+ "created_at": chain.created_at,
543
+ }
544
+ }
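
A sketch of writing the ERC-721 metadata to disk for a finalized chain; the image CID and output filename are illustrative placeholders.

import json

metadata = generate_nft_metadata(chain, image_url="ipfs://<image-cid>")  # placeholder image CID
with open(f"provenance_{chain.session_id}.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)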
545
+
546
+
547
+ # =============================================================================
548
+ # MULTI-CHAIN SUPPORT
549
+ # =============================================================================
550
+
551
+ CHAIN_CONFIGS = {
552
+ "ethereum": {
553
+ "chain_id": 1,
554
+ "name": "Ethereum Mainnet",
555
+ "explorer": "https://etherscan.io",
556
+ "native_token": "ETH",
557
+ },
558
+ "polygon": {
559
+ "chain_id": 137,
560
+ "name": "Polygon",
561
+ "explorer": "https://polygonscan.com",
562
+ "native_token": "MATIC",
563
+ },
564
+ "arbitrum": {
565
+ "chain_id": 42161,
566
+ "name": "Arbitrum One",
567
+ "explorer": "https://arbiscan.io",
568
+ "native_token": "ETH",
569
+ },
570
+ "optimism": {
571
+ "chain_id": 10,
572
+ "name": "Optimism",
573
+ "explorer": "https://optimistic.etherscan.io",
574
+ "native_token": "ETH",
575
+ },
576
+ "base": {
577
+ "chain_id": 8453,
578
+ "name": "Base",
579
+ "explorer": "https://basescan.org",
580
+ "native_token": "ETH",
581
+ },
582
+ "solana": {
583
+ "chain_id": -1, # Not EVM
584
+ "name": "Solana",
585
+ "explorer": "https://solscan.io",
586
+ "native_token": "SOL",
587
+ },
588
+ }
589
+
590
+
591
+ def get_chain_config(chain_name: str) -> Dict[str, Any]:
592
+ """Get configuration for a specific blockchain."""
593
+ return CHAIN_CONFIGS.get(chain_name.lower(), CHAIN_CONFIGS["ethereum"])
594
+
595
+
596
+ # =============================================================================
597
+ # WEB3 EXPORT UTILITIES
598
+ # =============================================================================
599
+
600
+ def export_for_web3(chain: ProvenanceChain,
601
+ chain_name: str = "ethereum",
602
+ include_full_chain: bool = True) -> Dict[str, Any]:
603
+ """
604
+ Export provenance chain in Web3-ready format.
605
+
606
+ Returns everything needed to post attestation on-chain.
607
+ """
608
+ attestation = Web3Attestation.from_chain(chain)
609
+ chain_config = get_chain_config(chain_name)
610
+
611
+ result = {
612
+ "attestation": attestation.to_dict(),
613
+ "eip712": attestation.to_eip712_message({
614
+ **CASCADE_DOMAIN,
615
+ "chainId": chain_config["chain_id"]
616
+ }),
617
+ "contract_abi": CASCADE_ATTESTATION_ABI,
618
+ "chain_config": chain_config,
619
+ }
620
+
621
+ if include_full_chain:
622
+ data, cid = chain_to_ipfs_ready(chain)
623
+ result["ipfs"] = {
624
+ "data": base64.b64encode(data).decode('ascii'),
625
+ "cid": cid,
626
+ "size_bytes": len(data),
627
+ }
628
+
629
+ return result
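
Putting the export together for an assumed finalized chain; the keys printed below are exactly the ones assembled above.

bundle = export_for_web3(chain, chain_name="polygon")

print(bundle["chain_config"]["name"])           # "Polygon"
print(bundle["attestation"]["merkle_root"])
print(bundle["ipfs"]["cid"], bundle["ipfs"]["size_bytes"])

# bundle["eip712"]       -> payload for eth_signTypedData_v4
# bundle["contract_abi"] -> ABI for building the attest() transaction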
630
+
631
+
632
+ def generate_verification_page(attestation: Web3Attestation,
633
+ chain: Optional[ProvenanceChain] = None) -> str:
634
+ """
635
+ Generate an HTML verification page for an attestation.
636
+
637
+ This can be hosted anywhere and allows public verification.
638
+ """
639
+ records_html = ""
640
+ if chain:
641
+ for record in chain.records.values():
642
+ records_html += f"""
643
+ <tr>
644
+ <td>{record.layer_name}</td>
645
+ <td><code>{record.state_hash}</code></td>
646
+ <td>{record.shape}</td>
647
+ <td>{record.stats.get('mean', 0):.4f}</td>
648
+ </tr>
649
+ """
650
+
651
+ return f"""<!DOCTYPE html>
652
+ <html>
653
+ <head>
654
+ <title>CASCADE Provenance Verification</title>
655
+ <meta charset="utf-8">
656
+ <style>
657
+ body {{ font-family: 'Courier New', monospace; background: #0a0a0a; color: #00ff88; padding: 40px; }}
658
+ .container {{ max-width: 900px; margin: 0 auto; }}
659
+ h1 {{ color: #00ffcc; border-bottom: 2px solid #00ff88; padding-bottom: 10px; }}
660
+ .hash {{ font-family: monospace; background: #1a1a2e; padding: 10px; border-radius: 4px; word-break: break-all; }}
661
+ .verified {{ color: #00ff88; }}
662
+ .label {{ color: #888; font-size: 0.9em; }}
663
+ table {{ width: 100%; border-collapse: collapse; margin-top: 20px; }}
664
+ th, td {{ padding: 8px; border: 1px solid #333; text-align: left; }}
665
+ th {{ background: #1a1a2e; }}
666
+ code {{ background: #1a1a2e; padding: 2px 6px; border-radius: 3px; }}
667
+ .merkle {{ font-size: 1.5em; color: #ffcc00; text-align: center; padding: 20px; background: #1a1a2e; border-radius: 8px; margin: 20px 0; }}
668
+ </style>
669
+ </head>
670
+ <body>
671
+ <div class="container">
672
+ <h1>🔗 CASCADE Provenance Verification</h1>
673
+
674
+ <div class="merkle">
675
+ Merkle Root: <code>{attestation.merkle_root}</code>
676
+ </div>
677
+
678
+ <h2>Attestation Details</h2>
679
+ <p class="label">Session ID</p>
680
+ <div class="hash">{attestation.session_id}</div>
681
+
682
+ <p class="label">Model Hash</p>
683
+ <div class="hash">{attestation.model_hash}</div>
684
+
685
+ <p class="label">Input Hash</p>
686
+ <div class="hash">{attestation.input_hash}</div>
687
+
688
+ <p class="label">Output Hash</p>
689
+ <div class="hash">{attestation.output_hash}</div>
690
+
691
+ <p class="label">Timestamp</p>
692
+ <div class="hash">{attestation.timestamp} ({time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(attestation.timestamp))})</div>
693
+
694
+ <p class="label">Layer Count</p>
695
+ <div class="hash">{attestation.layer_count} layers</div>
696
+
697
+ {"<h2>Provenance Chain</h2><table><tr><th>Layer</th><th>State Hash</th><th>Shape</th><th>Mean</th></tr>" + records_html + "</table>" if chain else ""}
698
+
699
+ <h2>On-Chain Verification</h2>
700
+ <p>{"<span class='verified'>✓ Verified on " + get_chain_config('ethereum')['name'] + "</span>" if attestation.tx_hash else "⏳ Pending on-chain attestation"}</p>
701
+ {f"<p class='label'>Transaction</p><div class='hash'><a href='{get_chain_config('ethereum')['explorer']}/tx/{attestation.tx_hash}' style='color: #00ff88;'>{attestation.tx_hash}</a></div>" if attestation.tx_hash else ""}
702
+
703
+ <h2>IPFS Storage</h2>
704
+ <p>{f"<a href='https://ipfs.io/ipfs/{attestation.ipfs_cid}' style='color: #00ff88;'>{attestation.ipfs_cid}</a>" if attestation.ipfs_cid else "Full chain not yet pinned to IPFS"}</p>
705
+
706
+ <hr style="border-color: #333; margin: 40px 0;">
707
+ <p style="color: #666; text-align: center;">CASCADE Provenance Engine • Due process infrastructure for AI</p>
708
+ </div>
709
+ </body>
710
+ </html>
711
+ """
712
+
713
+
714
+ # =============================================================================
715
+ # SIGNATURE UTILITIES (for wallet integration)
716
+ # =============================================================================
717
+
718
+ def prepare_for_signing(attestation: Web3Attestation,
719
+ chain_name: str = "ethereum") -> Dict[str, Any]:
720
+ """
721
+ Prepare attestation for wallet signing (MetaMask, etc).
722
+
723
+ Returns the EIP-712 message that wallets can sign.
724
+ """
725
+ chain_config = get_chain_config(chain_name)
726
+
727
+ eip712 = attestation.to_eip712_message({
728
+ **CASCADE_DOMAIN,
729
+ "chainId": chain_config["chain_id"]
730
+ })
731
+
732
+ return {
733
+ "method": "eth_signTypedData_v4",
734
+ "params": [
735
+ None, # Address filled by wallet
736
+ json.dumps(eip712)
737
+ ],
738
+ "display": {
739
+ "title": "Sign CASCADE Attestation",
740
+ "description": f"Attest that model {attestation.model_hash[:16]}... "
741
+ f"processed input {attestation.input_hash[:16]}...",
742
+ "merkle_root": attestation.merkle_root,
743
+ }
744
+ }
745
+
746
+
747
+ def verify_signature(attestation: Web3Attestation,
748
+ signature: str,
749
+ expected_signer: str) -> Tuple[bool, str]:
750
+ """
751
+ Verify an EIP-712 signature.
752
+
753
+ Note: Full verification requires eth_utils/web3.py.
754
+ This is a structural check only.
755
+ """
756
+ if not signature or len(signature) < 130:
757
+ return False, "Invalid signature length"
758
+
759
+ if not signature.startswith("0x"):
760
+ return False, "Signature must start with 0x"
761
+
762
+ # Extract r, s, v components
763
+ try:
764
+ sig_bytes = bytes.fromhex(signature[2:])
765
+ if len(sig_bytes) != 65:
766
+ return False, f"Signature must be 65 bytes, got {len(sig_bytes)}"
767
+
768
+ r = sig_bytes[:32]
769
+ s = sig_bytes[32:64]
770
+ v = sig_bytes[64]
771
+
772
+ # v should be 27 or 28 (or 0/1 for some implementations)
773
+ if v not in [0, 1, 27, 28]:
774
+ return False, f"Invalid v value: {v}"
775
+
776
+ # Structural validation passed
777
+ # Full cryptographic verification requires ecrecover
778
+ return True, "Signature structure valid (full verification requires web3.py)"
779
+
780
+ except Exception as e:
781
+ return False, f"Signature parsing error: {str(e)}"
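
An illustration of the structural check only: the signature below is a dummy 65-byte value, not a real ECDSA signature, and `attestation` is assumed from earlier.

dummy_sig = "0x" + "11" * 64 + "1b"   # 32-byte r + 32-byte s + v=27, structurally valid only
ok, reason = verify_signature(
    attestation,
    dummy_sig,
    expected_signer="0x0000000000000000000000000000000000000000",
)
print(ok, reason)   # True, "Signature structure valid (full verification requires web3.py)"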
782
+
783
+
784
+ # =============================================================================
785
+ # CONVENIENCE FUNCTIONS
786
+ # =============================================================================
787
+
788
+ def attest_inference(chain: ProvenanceChain,
789
+ chain_name: str = "ethereum") -> Web3Attestation:
790
+ """
791
+ One-liner to create attestation from provenance chain.
792
+
793
+ Usage:
794
+ attestation = attest_inference(chain)
795
+ print(attestation.merkle_root)
796
+ """
797
+ if not chain.finalized:
798
+ chain.finalize()
799
+
800
+ attestation = Web3Attestation.from_chain(chain)
801
+
802
+ # Compute IPFS CID
803
+ data, cid = chain_to_ipfs_ready(chain)
804
+ attestation.ipfs_cid = cid
805
+
806
+ # Set chain
807
+ attestation.chain_id = get_chain_config(chain_name)["chain_id"]
808
+
809
+ return attestation
810
+
811
+
812
+ def quick_verify(merkle_root: str, layer_hashes: List[str]) -> bool:
813
+ """
814
+ Quick verification that layer hashes produce expected Merkle root.
815
+ """
816
+ computed = compute_merkle_root(layer_hashes)
817
+ return computed == merkle_root
818
+
819
+
820
+ # =============================================================================
821
+ # COMMAND LINE INTERFACE
822
+ # =============================================================================
823
+
824
+ if __name__ == "__main__":
825
+ import sys
826
+
827
+ print("CASCADE // WEB3 BRIDGE")
828
+ print("=" * 50)
829
+ print()
830
+ print("Smart Contract (Solidity):")
831
+ print("-" * 50)
832
+ print(CASCADE_ATTESTATION_SOLIDITY[:500] + "...")
833
+ print()
834
+ print("Contract ABI:")
835
+ print("-" * 50)
836
+ print(json.dumps(CASCADE_ATTESTATION_ABI, indent=2)[:500] + "...")
837
+ print()
838
+ print("Supported Chains:")
839
+ print("-" * 50)
840
+ for name, config in CHAIN_CONFIGS.items():
841
+ print(f" {name}: Chain ID {config['chain_id']}")
842
+ print()
843
+ print("Usage:")
844
+ print(" from cascade.core.web3_bridge import attest_inference, export_for_web3")
845
+ print(" attestation = attest_inference(provenance_chain)")
846
+ print(" web3_data = export_for_web3(provenance_chain, 'polygon')")
cascade/data/__init__.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ CASCADE Data Observatory
3
+
4
+ Dataset observation with the same rigor as model observation.
5
+ Tracks provenance, schema, lineage using W3C PROV-O standard.
6
+ """
7
+
8
+ from .entities import (
9
+ DatasetEntity,
10
+ Activity,
11
+ Agent,
12
+ Relationship,
13
+ RelationType,
14
+ ActivityType,
15
+ AgentType,
16
+ create_system_agent,
17
+ create_model_agent,
18
+ create_user_agent,
19
+ )
20
+ from .observer import DatasetObserver, ObservationContext
21
+ from .provenance import ProvenanceGraph
22
+ from .schema import SchemaObserver, DatasetSchema, FieldSchema, hash_content
23
+ from .croissant import CroissantExporter, export_to_croissant
24
+ from .hub import HubIntegration, AccountabilityBundle, push_to_hub, pull_from_hub
25
+ from .license import (
26
+ SPDXLicense,
27
+ LicenseCategory,
28
+ LicenseRestriction,
29
+ LicenseCompatibility,
30
+ LicenseAnalyzer,
31
+ SPDX_LICENSES,
32
+ get_license,
33
+ check_license_compatibility,
34
+ get_derived_license,
35
+ )
36
+ from .pii import (
37
+ PIIType,
38
+ PIISeverity,
39
+ PIIMatch,
40
+ PIIScanResult,
41
+ PIIScanner,
42
+ scan_for_pii,
43
+ quick_pii_check,
44
+ )
45
+ from .live import (
46
+ LiveDocumentTracer,
47
+ TraceEvent,
48
+ TraceEventType,
49
+ DocumentSpan,
50
+ DocumentAssociation,
51
+ ConsoleTraceRenderer,
52
+ create_live_tracer,
53
+ )
54
+
55
+ __all__ = [
56
+ # Entities (PROV-O)
57
+ "DatasetEntity",
58
+ "Activity",
59
+ "Agent",
60
+ "Relationship",
61
+ "RelationType",
62
+ "ActivityType",
63
+ "AgentType",
64
+ "create_system_agent",
65
+ "create_model_agent",
66
+ "create_user_agent",
67
+ # Observer
68
+ "DatasetObserver",
69
+ "ObservationContext",
70
+ # Provenance
71
+ "ProvenanceGraph",
72
+ # Schema
73
+ "SchemaObserver",
74
+ "DatasetSchema",
75
+ "FieldSchema",
76
+ "hash_content",
77
+ # Export
78
+ "CroissantExporter",
79
+ "export_to_croissant",
80
+ # Accountability
81
+ "AccountabilityBundle",
82
+ # Hub
83
+ "HubIntegration",
84
+ "push_to_hub",
85
+ "pull_from_hub",
86
+ # License
87
+ "SPDXLicense",
88
+ "LicenseCategory",
89
+ "LicenseRestriction",
90
+ "LicenseCompatibility",
91
+ "LicenseAnalyzer",
92
+ "SPDX_LICENSES",
93
+ "get_license",
94
+ "check_license_compatibility",
95
+ "get_derived_license",
96
+ # PII Detection
97
+ "PIIType",
98
+ "PIISeverity",
99
+ "PIIMatch",
100
+ "PIIScanResult",
101
+ "PIIScanner",
102
+ "scan_for_pii",
103
+ "quick_pii_check",
104
+ # Live Document Tracing
105
+ "LiveDocumentTracer",
106
+ "TraceEvent",
107
+ "TraceEventType",
108
+ "DocumentSpan",
109
+ "DocumentAssociation",
110
+ "ConsoleTraceRenderer",
111
+ "create_live_tracer",
112
+ ]
cascade/data/croissant.py ADDED
@@ -0,0 +1,289 @@
1
+ """
2
+ Croissant Exporter
3
+
4
+ Exports provenance graph to MLCommons Croissant format.
5
+ Croissant is the emerging standard for ML dataset metadata.
6
+
7
+ Reference: https://github.com/mlcommons/croissant
8
+ """
9
+
10
+ import json
11
+ import time
12
+ from datetime import datetime
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from .entities import DatasetEntity, Activity, Agent
16
+ from .provenance import ProvenanceGraph
17
+
18
+
19
+ class CroissantExporter:
20
+ """
21
+ Export provenance to Croissant JSON-LD format.
22
+
23
+ Croissant layers:
24
+ 1. Metadata - description, license, citation
25
+ 2. Resources - file descriptions
26
+ 3. Structure - record sets and fields
27
+ 4. ML Semantics - task types, splits
28
+
29
+ We add provenance as an extension.
30
+ """
31
+
32
+ CROISSANT_VERSION = "1.0"
33
+ CROISSANT_CONTEXT = "http://mlcommons.org/croissant/1.0"
34
+
35
+ def __init__(self, graph: ProvenanceGraph):
36
+ self.graph = graph
37
+
38
+ def export(
39
+ self,
40
+ name: str = None,
41
+ description: str = None,
42
+ license_url: str = None,
43
+ citation: str = None,
44
+ url: str = None,
45
+ include_provenance: bool = True,
46
+ ) -> Dict[str, Any]:
47
+ """
48
+ Export to Croissant JSON-LD.
49
+
50
+ Args:
51
+ name: Dataset name (defaults to graph name)
52
+ description: Dataset description
53
+ license_url: License URL
54
+ citation: Citation text
55
+ url: Dataset URL
56
+ include_provenance: Whether to include CASCADE provenance extension
57
+
58
+ Returns:
59
+ Croissant JSON-LD document
60
+ """
61
+ name = name or self.graph.name
62
+
63
+ doc = {
64
+ "@context": {
65
+ "@vocab": "http://schema.org/",
66
+ "sc": "http://schema.org/",
67
+ "cr": "http://mlcommons.org/croissant/",
68
+ "rai": "http://mlcommons.org/croissant/RAI/",
69
+ "spdx": "http://spdx.org/rdf/terms#",
70
+ },
71
+ "@type": "sc:Dataset",
72
+ "name": name,
73
+ "conformsTo": self.CROISSANT_CONTEXT,
74
+ "dateCreated": datetime.fromtimestamp(self.graph.created_at).isoformat(),
75
+ "dateModified": datetime.now().isoformat(),
76
+ }
77
+
78
+ if description:
79
+ doc["description"] = description
80
+ if license_url:
81
+ doc["license"] = license_url
82
+ if citation:
83
+ doc["citation"] = citation
84
+ if url:
85
+ doc["url"] = url
86
+
87
+ # Add distributions (file objects)
88
+ doc["distribution"] = self._build_distributions()
89
+
90
+ # Add record sets
91
+ doc["recordSet"] = self._build_record_sets()
92
+
93
+ # Add provenance extension
94
+ if include_provenance:
95
+ doc["cr:provenance"] = self._build_provenance_extension()
96
+
97
+ return doc
98
+
99
+ def _build_distributions(self) -> List[Dict[str, Any]]:
100
+ """Build distribution (FileObject) entries."""
101
+ distributions = []
102
+
103
+ for entity in self.graph.list_entities():
104
+ dist = {
105
+ "@type": "cr:FileObject",
106
+ "@id": entity.id,
107
+ "name": entity.name,
108
+ }
109
+
110
+ if entity.source_uri:
111
+ dist["contentUrl"] = entity.source_uri
112
+
113
+ if entity.content_hash:
114
+ dist["sha256"] = entity.content_hash
115
+
116
+ # License information (SPDX)
117
+ if entity.license_id:
118
+ dist["spdx:license"] = entity.license_id
119
+ if entity.license_url:
120
+ dist["sc:license"] = entity.license_url
121
+ else:
122
+ # Auto-generate SPDX license URL
123
+ dist["sc:license"] = f"https://spdx.org/licenses/{entity.license_id}.html"
124
+
125
+ # Infer encoding format from source type
126
+ format_map = {
127
+ "hf_dataset": "application/x-arrow",
128
+ "hf_hub": "application/x-arrow",
129
+ "parquet": "application/x-parquet",
130
+ "csv": "text/csv",
131
+ "json": "application/json",
132
+ "jsonl": "application/x-jsonlines",
133
+ }
134
+ if entity.source_type in format_map:
135
+ dist["encodingFormat"] = format_map[entity.source_type]
136
+
137
+ if entity.size_bytes:
138
+ dist["contentSize"] = f"{entity.size_bytes} bytes"
139
+
140
+ distributions.append(dist)
141
+
142
+ return distributions
143
+
144
+ def _build_record_sets(self) -> List[Dict[str, Any]]:
145
+ """Build RecordSet entries from entity schemas."""
146
+ record_sets = []
147
+
148
+ for entity in self.graph.list_entities():
149
+ schema = entity.attributes.get("schema")
150
+ if not schema:
151
+ continue
152
+
153
+ fields = []
154
+ for field_name, field_info in schema.get("fields", {}).items():
155
+ field_entry = {
156
+ "@type": "cr:Field",
157
+ "name": field_name,
158
+ "dataType": self._map_dtype_to_croissant(field_info.get("dtype", "string")),
159
+ }
160
+
161
+ if field_info.get("description"):
162
+ field_entry["description"] = field_info["description"]
163
+
164
+ # Source reference
165
+ field_entry["source"] = {
166
+ "fileObject": {"@id": entity.id},
167
+ "extract": {"column": field_name},
168
+ }
169
+
170
+ fields.append(field_entry)
171
+
172
+ if fields:
173
+ record_set = {
174
+ "@type": "cr:RecordSet",
175
+ "@id": f"recordset_{entity.id}",
176
+ "name": f"{entity.name}_records",
177
+ "field": fields,
178
+ }
179
+
180
+ if entity.record_count:
181
+ record_set["cr:recordCount"] = entity.record_count
182
+
183
+ record_sets.append(record_set)
184
+
185
+ return record_sets
186
+
187
+ def _map_dtype_to_croissant(self, dtype: str) -> str:
188
+ """Map internal dtype to Croissant/schema.org type."""
189
+ type_map = {
190
+ "string": "sc:Text",
191
+ "int8": "sc:Integer",
192
+ "int16": "sc:Integer",
193
+ "int32": "sc:Integer",
194
+ "int64": "sc:Integer",
195
+ "uint8": "sc:Integer",
196
+ "uint16": "sc:Integer",
197
+ "uint32": "sc:Integer",
198
+ "uint64": "sc:Integer",
199
+ "float16": "sc:Float",
200
+ "float32": "sc:Float",
201
+ "float64": "sc:Float",
202
+ "bool": "sc:Boolean",
203
+ "binary": "sc:Text", # Base64 encoded
204
+ "image": "sc:ImageObject",
205
+ "audio": "sc:AudioObject",
206
+ "categorical": "sc:Text", # With enumeration
207
+ "list": "sc:ItemList",
208
+ "struct": "sc:StructuredValue",
209
+ }
210
+ return type_map.get(dtype, "sc:Text")
211
+
212
+ def _build_provenance_extension(self) -> Dict[str, Any]:
213
+ """Build CASCADE provenance extension."""
214
+ return {
215
+ "@type": "cascade:ProvenanceGraph",
216
+ "cascade:rootHash": self.graph.root_hash,
217
+ "cascade:createdAt": datetime.fromtimestamp(self.graph.created_at).isoformat(),
218
+
219
+ # Entities with lineage
220
+ "cascade:entities": [
221
+ {
222
+ "@id": e.id,
223
+ "cascade:name": e.name,
224
+ "cascade:contentHash": e.content_hash,
225
+ "cascade:schemaHash": e.schema_hash,
226
+ "cascade:version": e.version,
227
+ "cascade:recordCount": e.record_count,
228
+ "cascade:derivedFrom": self.graph.get_lineage(e.id, "upstream"),
229
+ }
230
+ for e in self.graph.list_entities()
231
+ ],
232
+
233
+ # Activities
234
+ "cascade:activities": [
235
+ {
236
+ "@id": a.id,
237
+ "cascade:type": a.activity_type.value,
238
+ "cascade:name": a.name,
239
+ "cascade:startedAt": datetime.fromtimestamp(a.started_at).isoformat() if a.started_at else None,
240
+ "cascade:endedAt": datetime.fromtimestamp(a.ended_at).isoformat() if a.ended_at else None,
241
+ "cascade:inputs": a.inputs,
242
+ "cascade:outputs": a.outputs,
243
+ "cascade:parameters": a.parameters,
244
+ }
245
+ for a in self.graph.list_activities()
246
+ ],
247
+
248
+ # Agents
249
+ "cascade:agents": [
250
+ {
251
+ "@id": a.id,
252
+ "cascade:type": a.agent_type.value,
253
+ "cascade:name": a.name,
254
+ "cascade:version": a.version,
255
+ }
256
+ for a in self.graph.list_agents()
257
+ ],
258
+ }
259
+
260
+ def to_json(self, **kwargs) -> str:
261
+ """Export to JSON string."""
262
+ return json.dumps(self.export(**kwargs), indent=2, default=str)
263
+
264
+ def save(self, path: str, **kwargs):
265
+ """Save to file."""
266
+ with open(path, "w", encoding="utf-8") as f:
267
+ f.write(self.to_json(**kwargs))
268
+
269
+
270
+ def export_to_croissant(
271
+ graph: ProvenanceGraph,
272
+ name: str = None,
273
+ description: str = None,
274
+ **kwargs,
275
+ ) -> Dict[str, Any]:
276
+ """
277
+ Convenience function to export provenance to Croissant.
278
+
279
+ Args:
280
+ graph: The provenance graph to export
281
+ name: Dataset name
282
+ description: Dataset description
283
+ **kwargs: Additional export options
284
+
285
+ Returns:
286
+ Croissant JSON-LD document
287
+ """
288
+ exporter = CroissantExporter(graph)
289
+ return exporter.export(name=name, description=description, **kwargs)
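
A sketch of exporting an observed graph, assuming a populated ProvenanceGraph named `graph` (for example one built by DatasetObserver); the name, description, and license URL are illustrative.

doc = export_to_croissant(
    graph,
    name="my-dataset",
    description="Cleaned and deduplicated text corpus.",
    license_url="https://spdx.org/licenses/CC-BY-4.0.html",
    include_provenance=True,
)
print(doc["@type"], doc["conformsTo"])

# Or write straight to disk:
CroissantExporter(graph).save("croissant.json", name="my-dataset")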
cascade/data/entities.py ADDED
@@ -0,0 +1,349 @@
1
+ """
2
+ PROV-O Entities for Dataset Observation
3
+
4
+ W3C PROV Data Model:
5
+ - Entity: A physical, digital, or conceptual thing (the dataset)
6
+ - Activity: Something that occurs over time and acts upon entities
7
+ - Agent: Something that bears responsibility for an activity
8
+
9
+ Relationships:
10
+ - wasGeneratedBy: Entity → Activity
11
+ - wasDerivedFrom: Entity → Entity
12
+ - wasAttributedTo: Entity → Agent
13
+ - used: Activity → Entity
14
+ - wasAssociatedWith: Activity → Agent
15
+ """
16
+
17
+ import hashlib
18
+ import json
19
+ import time
20
+ from dataclasses import dataclass, field
21
+ from datetime import datetime
22
+ from enum import Enum
23
+ from typing import Any, Dict, List, Optional, Union
24
+
25
+
26
+ class RelationType(Enum):
27
+ """W3C PROV-O relationship types."""
28
+ # Entity relationships
29
+ WAS_GENERATED_BY = "wasGeneratedBy" # Entity → Activity
30
+ WAS_DERIVED_FROM = "wasDerivedFrom" # Entity → Entity
31
+ WAS_ATTRIBUTED_TO = "wasAttributedTo" # Entity → Agent
32
+ WAS_REVISION_OF = "wasRevisionOf" # Entity → Entity (versioning)
33
+ HAD_PRIMARY_SOURCE = "hadPrimarySource" # Entity → Entity
34
+
35
+ # Activity relationships
36
+ USED = "used" # Activity → Entity
37
+ WAS_ASSOCIATED_WITH = "wasAssociatedWith" # Activity → Agent
38
+ WAS_INFORMED_BY = "wasInformedBy" # Activity → Activity
39
+ WAS_STARTED_BY = "wasStartedBy" # Activity → Entity
40
+ WAS_ENDED_BY = "wasEndedBy" # Activity → Entity
41
+
42
+ # Agent relationships
43
+ ACTED_ON_BEHALF_OF = "actedOnBehalfOf" # Agent → Agent
44
+
45
+
46
+ @dataclass
47
+ class Relationship:
48
+ """A provenance relationship between two nodes."""
49
+ relation_type: RelationType
50
+ source_id: str
51
+ target_id: str
52
+ timestamp: float = field(default_factory=time.time)
53
+ attributes: Dict[str, Any] = field(default_factory=dict)
54
+
55
+ def to_dict(self) -> Dict[str, Any]:
56
+ return {
57
+ "type": self.relation_type.value,
58
+ "source": self.source_id,
59
+ "target": self.target_id,
60
+ "timestamp": self.timestamp,
61
+ "attributes": self.attributes,
62
+ }
63
+
64
+ def to_prov_n(self) -> str:
65
+ """Export as PROV-N notation."""
66
+ return f"{self.relation_type.value}({self.source_id}, {self.target_id})"
67
+
68
+
69
+ @dataclass
70
+ class DatasetEntity:
71
+ """
72
+ A dataset entity in the provenance graph.
73
+
74
+ Corresponds to prov:Entity - any physical, digital, or conceptual thing.
75
+ In our case: a dataset, a version of a dataset, or a split.
76
+ """
77
+ id: str
78
+ name: str
79
+
80
+ # Content identification
81
+ content_hash: Optional[str] = None # SHA-256 of data content
82
+ schema_hash: Optional[str] = None # SHA-256 of schema/features
83
+
84
+ # Versioning
85
+ version: Optional[str] = None
86
+ previous_version: Optional[str] = None
87
+
88
+ # Source
89
+ source_type: str = "unknown" # hf_hub, local, s3, gcs, etc.
90
+ source_uri: Optional[str] = None
91
+
92
+ # License (SPDX identifier)
93
+ license_id: Optional[str] = None # e.g., "MIT", "CC-BY-4.0", "Apache-2.0"
94
+ license_url: Optional[str] = None # URL to license text
95
+
96
+ # Statistics
97
+ record_count: Optional[int] = None
98
+ size_bytes: Optional[int] = None
99
+ splits: Dict[str, int] = field(default_factory=dict) # split_name → count
100
+
101
+ # Metadata
102
+ attributes: Dict[str, Any] = field(default_factory=dict)
103
+
104
+ # Timestamps
105
+ created_at: float = field(default_factory=time.time)
106
+
107
+ def __post_init__(self):
108
+ """Generate ID if not provided."""
109
+ if not self.id:
110
+ self.id = f"entity:{self.name}:{int(self.created_at * 1000)}"
111
+
112
+ def compute_hash(self) -> str:
113
+ """Compute entity hash from content."""
114
+ content = json.dumps({
115
+ "id": self.id,
116
+ "name": self.name,
117
+ "content_hash": self.content_hash,
118
+ "schema_hash": self.schema_hash,
119
+ "version": self.version,
120
+ "record_count": self.record_count,
121
+ }, sort_keys=True)
122
+ return hashlib.sha256(content.encode()).hexdigest()
123
+
124
+ def to_dict(self) -> Dict[str, Any]:
125
+ return {
126
+ "@type": "prov:Entity",
127
+ "@id": self.id,
128
+ "name": self.name,
129
+ "content_hash": self.content_hash,
130
+ "schema_hash": self.schema_hash,
131
+ "version": self.version,
132
+ "previous_version": self.previous_version,
133
+ "source_type": self.source_type,
134
+ "source_uri": self.source_uri,
135
+ "license_id": self.license_id,
136
+ "license_url": self.license_url,
137
+ "record_count": self.record_count,
138
+ "size_bytes": self.size_bytes,
139
+ "splits": self.splits,
140
+ "attributes": self.attributes,
141
+ "created_at": self.created_at,
142
+ }
143
+
144
+ def to_prov_n(self) -> str:
145
+ """Export as PROV-N notation."""
146
+ attrs = ", ".join([
147
+ f'prov:label="{self.name}"',
148
+ f'cascade:contentHash="{self.content_hash or "unknown"}"',
149
+ f'cascade:recordCount="{self.record_count or 0}"',
150
+ f'cascade:license="{self.license_id or "unknown"}"',
151
+ ])
152
+ return f"entity({self.id}, [{attrs}])"
153
+
154
+
155
+ class ActivityType(Enum):
156
+ """Types of dataset activities."""
157
+ INGEST = "ingest" # Load from source
158
+ TRANSFORM = "transform" # Filter, map, join, etc.
159
+ SPLIT = "split" # Train/test/val split
160
+ AUGMENT = "augment" # Data augmentation
161
+ CLEAN = "clean" # Cleaning/preprocessing
162
+ MERGE = "merge" # Combining datasets
163
+ SAMPLE = "sample" # Sampling/subsetting
164
+ EXPORT = "export" # Export to format
165
+ TRAIN = "train" # Model training (consumption)
166
+ EVALUATE = "evaluate" # Model evaluation
167
+ INFERENCE = "inference" # Model inference
168
+ ENTITY_RESOLUTION = "entity_resolution" # Data Unity matching
169
+
170
+
171
+ @dataclass
172
+ class Activity:
173
+ """
174
+ An activity in the provenance graph.
175
+
176
+ Corresponds to prov:Activity - something that occurs over time
177
+ and acts upon or with entities.
178
+ """
179
+ id: str
180
+ activity_type: ActivityType
181
+ name: str
182
+
183
+ # Timing
184
+ started_at: Optional[float] = None
185
+ ended_at: Optional[float] = None
186
+
187
+ # Input/Output tracking
188
+ inputs: List[str] = field(default_factory=list) # Entity IDs
189
+ outputs: List[str] = field(default_factory=list) # Entity IDs
190
+
191
+ # Agent who performed this
192
+ agent_id: Optional[str] = None
193
+
194
+ # Parameters/configuration used
195
+ parameters: Dict[str, Any] = field(default_factory=dict)
196
+
197
+ # Metadata
198
+ attributes: Dict[str, Any] = field(default_factory=dict)
199
+
200
+ def __post_init__(self):
201
+ if not self.id:
202
+ self.id = f"activity:{self.activity_type.value}:{int(time.time() * 1000)}"
203
+ if self.started_at is None:
204
+ self.started_at = time.time()
205
+
206
+ def start(self):
207
+ """Mark activity as started."""
208
+ self.started_at = time.time()
209
+
210
+ def end(self):
211
+ """Mark activity as ended."""
212
+ self.ended_at = time.time()
213
+
214
+ @property
215
+ def duration(self) -> Optional[float]:
216
+ """Duration in seconds."""
217
+ if self.started_at and self.ended_at:
218
+ return self.ended_at - self.started_at
219
+ return None
220
+
221
+ def add_input(self, entity_id: str):
222
+ """Record an input entity."""
223
+ if entity_id not in self.inputs:
224
+ self.inputs.append(entity_id)
225
+
226
+ def add_output(self, entity_id: str):
227
+ """Record an output entity."""
228
+ if entity_id not in self.outputs:
229
+ self.outputs.append(entity_id)
230
+
231
+ def to_dict(self) -> Dict[str, Any]:
232
+ return {
233
+ "@type": "prov:Activity",
234
+ "@id": self.id,
235
+ "activity_type": self.activity_type.value,
236
+ "name": self.name,
237
+ "started_at": self.started_at,
238
+ "ended_at": self.ended_at,
239
+ "duration": self.duration,
240
+ "inputs": self.inputs,
241
+ "outputs": self.outputs,
242
+ "agent_id": self.agent_id,
243
+ "parameters": self.parameters,
244
+ "attributes": self.attributes,
245
+ }
246
+
247
+ def to_prov_n(self) -> str:
248
+ """Export as PROV-N notation."""
249
+ start = datetime.fromtimestamp(self.started_at).isoformat() if self.started_at else "-"
250
+ end = datetime.fromtimestamp(self.ended_at).isoformat() if self.ended_at else "-"
251
+ attrs = f'prov:label="{self.name}", cascade:type="{self.activity_type.value}"'
252
+ return f"activity({self.id}, {start}, {end}, [{attrs}])"
253
+
254
+
255
+ class AgentType(Enum):
256
+ """Types of agents."""
257
+ PERSON = "person"
258
+ ORGANIZATION = "organization"
259
+ SOFTWARE = "software"
260
+ MODEL = "model"
261
+ PIPELINE = "pipeline"
262
+ SYSTEM = "system"
263
+
264
+
265
+ @dataclass
266
+ class Agent:
267
+ """
268
+ An agent in the provenance graph.
269
+
270
+ Corresponds to prov:Agent - something that bears responsibility
271
+ for an activity taking place.
272
+ """
273
+ id: str
274
+ agent_type: AgentType
275
+ name: str
276
+
277
+ # For software/model agents
278
+ version: Optional[str] = None
279
+
280
+ # For organizational hierarchy
281
+ parent_agent_id: Optional[str] = None
282
+
283
+ # Contact/identification
284
+ identifier: Optional[str] = None # HF username, email, etc.
285
+
286
+ # Metadata
287
+ attributes: Dict[str, Any] = field(default_factory=dict)
288
+
289
+ # Timestamp
290
+ created_at: float = field(default_factory=time.time)
291
+
292
+ def __post_init__(self):
293
+ if not self.id:
294
+ self.id = f"agent:{self.agent_type.value}:{self.name}".replace(" ", "_").lower()
295
+
296
+ def to_dict(self) -> Dict[str, Any]:
297
+ return {
298
+ "@type": "prov:Agent",
299
+ "@id": self.id,
300
+ "agent_type": self.agent_type.value,
301
+ "name": self.name,
302
+ "version": self.version,
303
+ "parent_agent_id": self.parent_agent_id,
304
+ "identifier": self.identifier,
305
+ "attributes": self.attributes,
306
+ "created_at": self.created_at,
307
+ }
308
+
309
+ def to_prov_n(self) -> str:
310
+ """Export as PROV-N notation."""
311
+ attrs = f'prov:label="{self.name}", cascade:type="{self.agent_type.value}"'
312
+ if self.version:
313
+ attrs += f', cascade:version="{self.version}"'
314
+ return f"agent({self.id}, [{attrs}])"
315
+
316
+
317
+ # Convenience factory functions
318
+ def create_system_agent(name: str = "cascade", version: str = "1.0.0") -> Agent:
319
+ """Create a system agent for automated operations."""
320
+ return Agent(
321
+ id=f"agent:system:{name}",
322
+ agent_type=AgentType.SYSTEM,
323
+ name=name,
324
+ version=version,
325
+ )
326
+
327
+
328
+ def create_model_agent(model_id: str, version: Optional[str] = None) -> Agent:
329
+ """Create an agent representing an ML model."""
330
+ return Agent(
331
+ id=f"agent:model:{model_id.replace('/', '_')}",
332
+ agent_type=AgentType.MODEL,
333
+ name=model_id,
334
+ version=version,
335
+ identifier=model_id,
336
+ )
337
+
338
+
339
+ def create_user_agent(username: str, org: Optional[str] = None) -> Agent:
340
+ """Create an agent representing a user."""
341
+ agent = Agent(
342
+ id=f"agent:person:{username}",
343
+ agent_type=AgentType.PERSON,
344
+ name=username,
345
+ identifier=username,
346
+ )
347
+ if org:
348
+ agent.parent_agent_id = f"agent:organization:{org}"
349
+ return agent
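
A minimal usage sketch of the factory functions above; the import path cascade.data.entities is an assumption based on the file layout shown in this diff, not something stated in the code itself.

# Sketch only: module path assumed from the diff layout above.
from cascade.data.entities import create_user_agent, create_model_agent, create_system_agent

user = create_user_agent("alice", org="example-org")
model = create_model_agent("org/some-model", version="1.0")
system = create_system_agent()                      # defaults: name="cascade", version="1.0.0"

print(user.id)                  # agent:person:alice
print(user.parent_agent_id)     # agent:organization:example-org
print(model.to_prov_n())        # agent(agent:model:org_some-model, [prov:label="org/some-model", ...])
print(system.to_dict()["@type"])  # prov:Agent
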
cascade/data/hub.py ADDED
@@ -0,0 +1,533 @@
1
+ """
2
+ HuggingFace Hub Integration
3
+
4
+ Push and pull dataset provenance to/from HuggingFace Hub.
5
+
6
+ Exports complete W3C PROV-O accountability bundle:
7
+ - cascade_provenance.json (CASCADE native format)
8
+ - prov_o.jsonld (W3C PROV-O JSON-LD - interoperable)
9
+ - prov_n.txt (W3C PROV-N notation - human readable)
10
+ - activities.jsonl (Activity log for audit)
11
+ - agents.json (Agent attributions)
12
+ - integrity_manifest.json (Hash verification manifest)
+ - croissant.json (MLCommons Croissant)
13
+ """
14
+
15
+ import json
16
+ import time
17
+ from datetime import datetime, timezone
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ from .provenance import ProvenanceGraph
21
+ from .croissant import CroissantExporter
22
+
23
+
24
+ class AccountabilityBundle:
25
+ """
26
+ Complete W3C PROV-O accountability package.
27
+
28
+ When a dataset is extracted, this bundle provides a full audit trail:
29
+ - Who created/modified it (agents)
30
+ - What transformations occurred (activities)
31
+ - Where it came from (entity lineage)
32
+ - When everything happened (timestamps)
33
+ - How to verify integrity (hashes)
34
+ """
35
+
36
+ def __init__(self, graph: ProvenanceGraph):
37
+ self.graph = graph
38
+ self.created_at = datetime.now(timezone.utc).isoformat()
39
+
40
+ def to_prov_o_jsonld(self) -> Dict[str, Any]:
41
+ """Export W3C PROV-O JSON-LD (interoperable standard)."""
42
+ return self.graph.to_prov_jsonld()
43
+
44
+ def to_prov_n(self) -> str:
45
+ """Export W3C PROV-N notation (human readable)."""
46
+ return self.graph.to_prov_n()
47
+
48
+ def to_activity_log(self) -> List[Dict[str, Any]]:
49
+ """Export activity log for audit (JSONL format)."""
50
+ activities = []
51
+ for activity in self.graph.list_activities():
52
+ activities.append({
53
+ "id": activity.id,
54
+ "name": activity.name,
55
+ "type": activity.activity_type.value,
56
+ "started_at": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
57
+ "ended_at": datetime.fromtimestamp(activity.ended_at).isoformat() if activity.ended_at else None,
58
+ "duration_seconds": activity.duration,
59
+ "inputs": activity.inputs,
60
+ "outputs": activity.outputs,
61
+ "parameters": activity.parameters,
62
+ "attributes": activity.attributes,
63
+ })
64
+ return activities
65
+
66
+ def to_agent_attributions(self) -> Dict[str, Any]:
67
+ """Export agent attributions for accountability."""
68
+ agents = {}
69
+ for agent in self.graph.list_agents():
70
+ agents[agent.id] = {
71
+ "name": agent.name,
72
+ "type": agent.agent_type.value,
73
+ "version": agent.version,
74
+ "identifier": agent.identifier,
75
+ "attributes": agent.attributes,
76
+ }
77
+
78
+ # Build attribution matrix: which agent did what
79
+ attributions = []
80
+ for rel in self.graph.list_relationships():
81
+ if rel.relation_type.value == "wasAssociatedWith":
82
+ activity = self.graph.get_activity(rel.source_id)
83
+ agent = self.graph.get_agent(rel.target_id)
84
+ if activity and agent:
85
+ attributions.append({
86
+ "activity_id": activity.id,
87
+ "activity_name": activity.name,
88
+ "agent_id": agent.id,
89
+ "agent_name": agent.name,
90
+ "timestamp": datetime.fromtimestamp(activity.started_at).isoformat() if activity.started_at else None,
91
+ })
92
+
93
+ return {
94
+ "agents": agents,
95
+ "attributions": attributions,
96
+ "total_agents": len(agents),
97
+ "total_attributions": len(attributions),
98
+ }
99
+
100
+ def to_integrity_manifest(self) -> Dict[str, Any]:
101
+ """Export integrity manifest for verification."""
102
+ is_valid, invalid_ids = self.graph.verify_integrity()
103
+
104
+ return {
105
+ "root_hash": self.graph.root_hash,
106
+ "created_at": self.created_at,
107
+ "is_valid": is_valid,
108
+ "invalid_entity_ids": invalid_ids,
109
+ "entity_hashes": {
110
+ entity.id: {
111
+ "content_hash": entity.content_hash,
112
+ "schema_hash": entity.schema_hash,
113
+ }
114
+ for entity in self.graph.list_entities()
115
+ },
116
+ "verification_note": (
117
+ "To verify: recompute content hashes and compare against this manifest. "
118
+ "Any mismatch indicates data tampering."
119
+ ),
120
+ }
121
+
122
+ def export(self, output_dir: str):
123
+ """Export all accountability artifacts to a directory."""
124
+ import os
125
+ os.makedirs(output_dir, exist_ok=True)
126
+
127
+ # 1. CASCADE provenance JSON
128
+ with open(os.path.join(output_dir, "cascade_provenance.json"), "w") as f:
129
+ json.dump(self.graph.to_dict(), f, indent=2, default=str)
130
+
131
+ # 2. W3C PROV-O JSON-LD
132
+ with open(os.path.join(output_dir, "prov_o.jsonld"), "w") as f:
133
+ json.dump(self.to_prov_o_jsonld(), f, indent=2, default=str)
134
+
135
+ # 3. W3C PROV-N notation
136
+ with open(os.path.join(output_dir, "prov_n.txt"), "w") as f:
137
+ f.write(self.to_prov_n())
138
+
139
+ # 4. Activity log
140
+ with open(os.path.join(output_dir, "activities.jsonl"), "w") as f:
141
+ for activity in self.to_activity_log():
142
+ f.write(json.dumps(activity, default=str) + "\n")
143
+
144
+ # 5. Agent attributions
145
+ with open(os.path.join(output_dir, "agents.json"), "w") as f:
146
+ json.dump(self.to_agent_attributions(), f, indent=2, default=str)
147
+
148
+ # 6. Integrity manifest
149
+ with open(os.path.join(output_dir, "integrity_manifest.json"), "w") as f:
150
+ json.dump(self.to_integrity_manifest(), f, indent=2, default=str)
151
+
152
+ # 7. Croissant metadata
153
+ exporter = CroissantExporter(self.graph)
154
+ croissant_content = exporter.to_json(name="dataset", url="local://")
155
+ with open(os.path.join(output_dir, "croissant.json"), "w") as f:
156
+ f.write(croissant_content)
157
+
158
+ def summary(self) -> Dict[str, Any]:
159
+ """Summary of the accountability bundle."""
160
+ stats = self.graph.stats
161
+ return {
162
+ "bundle_created_at": self.created_at,
163
+ "graph_name": self.graph.name,
164
+ "root_hash": self.graph.root_hash,
165
+ "entities": stats["entities"],
166
+ "activities": stats["activities"],
167
+ "agents": stats["agents"],
168
+ "relationships": stats["relationships"],
169
+ "files_included": [
170
+ "cascade_provenance.json",
171
+ "prov_o.jsonld",
172
+ "prov_n.txt",
173
+ "activities.jsonl",
174
+ "agents.json",
175
+ "integrity_manifest.json",
176
+ "croissant.json",
177
+ ],
178
+ }
179
+
180
+
181
+ class HubIntegration:
182
+ """
183
+ Integration with HuggingFace Hub for dataset provenance.
184
+
185
+ Stores complete accountability bundle:
186
+ 1. cascade_provenance.json - CASCADE native format
187
+ 2. prov_o.jsonld - W3C PROV-O JSON-LD (interoperable)
188
+ 3. prov_n.txt - W3C PROV-N notation (human readable)
189
+ 4. activities.jsonl - Activity log for audit
190
+ 5. agents.json - Agent attributions
191
+ 6. integrity_manifest.json - Hash verification
192
+ 7. croissant.json - MLCommons Croissant
193
+ 8. README.md - Human-readable provenance section
194
+ """
195
+
196
+ PROVENANCE_FILENAME = "cascade_provenance.json"
197
+ PROV_O_FILENAME = "prov_o.jsonld"
198
+ PROV_N_FILENAME = "prov_n.txt"
199
+ ACTIVITIES_FILENAME = "activities.jsonl"
200
+ AGENTS_FILENAME = "agents.json"
201
+ INTEGRITY_FILENAME = "integrity_manifest.json"
202
+ CROISSANT_FILENAME = "croissant.json"
203
+
204
+ def __init__(self, token: Optional[str] = None):
205
+ """
206
+ Initialize Hub integration.
207
+
208
+ Args:
209
+ token: HuggingFace API token (optional, uses cached token if not provided)
210
+ """
211
+ self.token = token
212
+
213
+ def push_provenance(
214
+ self,
215
+ graph: ProvenanceGraph,
216
+ repo_id: str,
217
+ commit_message: str = "Update provenance",
218
+ private: bool = False,
219
+ include_croissant: bool = True,
220
+ full_accountability: bool = True,
221
+ ) -> str:
222
+ """
223
+ Push complete accountability bundle to HuggingFace Hub.
224
+
225
+ Args:
226
+ graph: The provenance graph to push
227
+ repo_id: HuggingFace repo ID (e.g., "username/dataset-name")
228
+ commit_message: Commit message
229
+ private: Whether the repo should be private
230
+ include_croissant: Whether to include Croissant JSON-LD
231
+ full_accountability: Whether to include full W3C PROV-O bundle
232
+
233
+ Returns:
234
+ URL of the pushed provenance
235
+ """
236
+ from huggingface_hub import HfApi, CommitOperationAdd
237
+
238
+ api = HfApi(token=self.token)
239
+
240
+ # Ensure repo exists
241
+ api.create_repo(
242
+ repo_id=repo_id,
243
+ repo_type="dataset",
244
+ private=private,
245
+ exist_ok=True,
246
+ )
247
+
248
+ operations = []
249
+ bundle = AccountabilityBundle(graph)
250
+
251
+ # 1. CASCADE provenance JSON (native format)
252
+ provenance_content = json.dumps(graph.to_dict(), indent=2, default=str)
253
+ operations.append(CommitOperationAdd(
254
+ path_in_repo=self.PROVENANCE_FILENAME,
255
+ path_or_fileobj=provenance_content.encode("utf-8"),
256
+ ))
257
+
258
+ if full_accountability:
259
+ # 2. W3C PROV-O JSON-LD (interoperable standard)
260
+ prov_o_content = json.dumps(bundle.to_prov_o_jsonld(), indent=2, default=str)
261
+ operations.append(CommitOperationAdd(
262
+ path_in_repo=self.PROV_O_FILENAME,
263
+ path_or_fileobj=prov_o_content.encode("utf-8"),
264
+ ))
265
+
266
+ # 3. W3C PROV-N notation (human readable)
267
+ prov_n_content = bundle.to_prov_n()
268
+ operations.append(CommitOperationAdd(
269
+ path_in_repo=self.PROV_N_FILENAME,
270
+ path_or_fileobj=prov_n_content.encode("utf-8"),
271
+ ))
272
+
273
+ # 4. Activity log (JSONL for easy grep/audit)
274
+ activities = bundle.to_activity_log()
275
+ activities_content = "\n".join(json.dumps(a, default=str) for a in activities)
276
+ operations.append(CommitOperationAdd(
277
+ path_in_repo=self.ACTIVITIES_FILENAME,
278
+ path_or_fileobj=activities_content.encode("utf-8"),
279
+ ))
280
+
281
+ # 5. Agent attributions
282
+ agents_content = json.dumps(bundle.to_agent_attributions(), indent=2, default=str)
283
+ operations.append(CommitOperationAdd(
284
+ path_in_repo=self.AGENTS_FILENAME,
285
+ path_or_fileobj=agents_content.encode("utf-8"),
286
+ ))
287
+
288
+ # 6. Integrity manifest (for verification)
289
+ integrity_content = json.dumps(bundle.to_integrity_manifest(), indent=2, default=str)
290
+ operations.append(CommitOperationAdd(
291
+ path_in_repo=self.INTEGRITY_FILENAME,
292
+ path_or_fileobj=integrity_content.encode("utf-8"),
293
+ ))
294
+
295
+ # 7. Croissant JSON-LD (MLCommons standard)
296
+ if include_croissant:
297
+ exporter = CroissantExporter(graph)
298
+ croissant_content = exporter.to_json(
299
+ name=repo_id.split("/")[-1],
300
+ url=f"https://huggingface.co/datasets/{repo_id}",
301
+ )
302
+ operations.append(CommitOperationAdd(
303
+ path_in_repo=self.CROISSANT_FILENAME,
304
+ path_or_fileobj=croissant_content.encode("utf-8"),
305
+ ))
306
+
307
+ # Commit all accountability artifacts
308
+ api.create_commit(
309
+ repo_id=repo_id,
310
+ repo_type="dataset",
311
+ operations=operations,
312
+ commit_message=commit_message,
313
+ )
314
+
315
+ return f"https://huggingface.co/datasets/{repo_id}"
316
+
317
+ def pull_provenance(self, repo_id: str) -> Optional[ProvenanceGraph]:
318
+ """
319
+ Pull provenance from HuggingFace Hub.
320
+
321
+ Args:
322
+ repo_id: HuggingFace repo ID
323
+
324
+ Returns:
325
+ ProvenanceGraph if found, None otherwise
326
+ """
327
+ from huggingface_hub import hf_hub_download
328
+
329
+ try:
330
+ # Download provenance file
331
+ local_path = hf_hub_download(
332
+ repo_id=repo_id,
333
+ filename=self.PROVENANCE_FILENAME,
334
+ repo_type="dataset",
335
+ token=self.token,
336
+ )
337
+
338
+ with open(local_path, "r", encoding="utf-8") as f:
339
+ data = json.load(f)
340
+
341
+ return ProvenanceGraph.from_dict(data)
342
+
343
+ except Exception as e:
344
+ print(f"Could not pull provenance from {repo_id}: {e}")
345
+ return None
346
+
347
+ def get_dataset_provenance_url(self, repo_id: str) -> str:
348
+ """Get URL to provenance file in Hub."""
349
+ return f"https://huggingface.co/datasets/{repo_id}/blob/main/{self.PROVENANCE_FILENAME}"
350
+
351
+ def update_dataset_card(
352
+ self,
353
+ repo_id: str,
354
+ graph: ProvenanceGraph,
355
+ ) -> str:
356
+ """
357
+ Update dataset card with provenance summary.
358
+
359
+ Adds/updates YAML front-matter with:
360
+ - Lineage information
361
+ - Root hash
362
+ - Entity/activity counts
363
+
364
+ Args:
365
+ repo_id: HuggingFace repo ID
366
+ graph: Provenance graph
367
+
368
+ Returns:
369
+ URL of the updated dataset
370
+ """
371
+ from huggingface_hub import HfApi, hf_hub_download
372
+
373
+ api = HfApi(token=self.token)
374
+
375
+ # Build provenance section for README
376
+ provenance_section = self._build_readme_section(graph)
377
+
378
+ # Get current README
379
+ try:
380
+ readme_path = hf_hub_download(
381
+ repo_id=repo_id,
382
+ filename="README.md",
383
+ repo_type="dataset",
384
+ token=self.token,
385
+ )
386
+ with open(readme_path, "r", encoding="utf-8") as f:
387
+ current_readme = f.read()
388
+ except Exception:
389
+ current_readme = f"# {repo_id.split('/')[-1]}\n\n"
390
+
391
+ # Update or append provenance section
392
+ marker_start = "<!-- CASCADE_PROVENANCE_START -->"
393
+ marker_end = "<!-- CASCADE_PROVENANCE_END -->"
394
+
395
+ if marker_start in current_readme:
396
+ # Replace existing section
397
+ import re
398
+ pattern = re.escape(marker_start) + r".*?" + re.escape(marker_end)
399
+ new_readme = re.sub(
400
+ pattern,
401
+ f"{marker_start}\n{provenance_section}\n{marker_end}",
402
+ current_readme,
403
+ flags=re.DOTALL,
404
+ )
405
+ else:
406
+ # Append section
407
+ new_readme = current_readme.rstrip() + f"\n\n{marker_start}\n{provenance_section}\n{marker_end}\n"
408
+
409
+ # Push updated README
410
+ api.upload_file(
411
+ path_or_fileobj=new_readme.encode("utf-8"),
412
+ path_in_repo="README.md",
413
+ repo_id=repo_id,
414
+ repo_type="dataset",
415
+ commit_message="Update provenance in README",
416
+ )
417
+
418
+ return f"https://huggingface.co/datasets/{repo_id}"
419
+
420
+ def _build_readme_section(self, graph: ProvenanceGraph) -> str:
421
+ """Build provenance section for README."""
422
+ stats = graph.stats
423
+ bundle = AccountabilityBundle(graph)
424
+
425
+ lines = [
426
+ "## 🔗 Provenance & Accountability",
427
+ "",
428
+ "This dataset has CASCADE provenance tracking enabled with full W3C PROV-O compliance.",
429
+ "",
430
+ "### Integrity",
431
+ "",
432
+ f"| Metric | Value |",
433
+ f"|--------|-------|",
434
+ f"| Root Hash | `{graph.root_hash[:16]}...` |",
435
+ f"| Entities | {stats['entities']} |",
436
+ f"| Activities | {stats['activities']} |",
437
+ f"| Agents | {stats['agents']} |",
438
+ f"| Relationships | {stats['relationships']} |",
439
+ "",
440
+ ]
441
+
442
+ # Add lineage summary
443
+ entities = graph.list_entities()
444
+ if entities:
445
+ lines.append("### Lineage")
446
+ lines.append("")
447
+ for entity in entities[:5]: # Show first 5
448
+ upstream = graph.get_lineage(entity.id, "upstream")
449
+ if upstream:
450
+ lines.append(f"- **{entity.name}** derived from: {', '.join(upstream[:3])}")
451
+ else:
452
+ lines.append(f"- **{entity.name}** (source)")
453
+ if len(entities) > 5:
454
+ lines.append(f"- ... and {len(entities) - 5} more entities")
455
+ lines.append("")
456
+
457
+ # Add activities summary
458
+ activities = graph.list_activities()
459
+ if activities:
460
+ lines.append("### Activities")
461
+ lines.append("")
462
+ for activity in activities[:5]:
463
+ duration = f" ({activity.duration:.2f}s)" if activity.duration else ""
464
+ lines.append(f"- **{activity.name}** [{activity.activity_type.value}]{duration}")
465
+ if len(activities) > 5:
466
+ lines.append(f"- ... and {len(activities) - 5} more activities")
467
+ lines.append("")
468
+
469
+ # Add agents summary
470
+ agents = graph.list_agents()
471
+ if agents:
472
+ lines.append("### Agents (Accountability)")
473
+ lines.append("")
474
+ for agent in agents[:5]:
475
+ lines.append(f"- **{agent.name}** [{agent.agent_type.value}]")
476
+ if len(agents) > 5:
477
+ lines.append(f"- ... and {len(agents) - 5} more agents")
478
+ lines.append("")
479
+
480
+ # Accountability bundle files
481
+ lines.extend([
482
+ "### Accountability Bundle",
483
+ "",
484
+ "| File | Standard | Description |",
485
+ "|------|----------|-------------|",
486
+ f"| [{self.PROVENANCE_FILENAME}]({self.PROVENANCE_FILENAME}) | CASCADE | Native provenance format |",
487
+ f"| [{self.PROV_O_FILENAME}]({self.PROV_O_FILENAME}) | W3C PROV-O | Interoperable JSON-LD |",
488
+ f"| [{self.PROV_N_FILENAME}]({self.PROV_N_FILENAME}) | W3C PROV-N | Human-readable notation |",
489
+ f"| [{self.ACTIVITIES_FILENAME}]({self.ACTIVITIES_FILENAME}) | JSONL | Activity audit log |",
490
+ f"| [{self.AGENTS_FILENAME}]({self.AGENTS_FILENAME}) | JSON | Agent attributions |",
491
+ f"| [{self.INTEGRITY_FILENAME}]({self.INTEGRITY_FILENAME}) | JSON | Hash verification manifest |",
492
+ f"| [{self.CROISSANT_FILENAME}]({self.CROISSANT_FILENAME}) | MLCommons | Croissant metadata |",
493
+ "",
494
+ ])
495
+
496
+ return "\n".join(lines)
497
+
498
+
499
+ def push_to_hub(
500
+ graph: ProvenanceGraph,
501
+ repo_id: str,
502
+ token: Optional[str] = None,
503
+ private: bool = False,
504
+ ) -> str:
505
+ """
506
+ Convenience function to push provenance to Hub.
507
+
508
+ Args:
509
+ graph: Provenance graph to push
510
+ repo_id: HuggingFace repo ID
511
+ token: HF token (optional)
512
+ private: Whether repo should be private
513
+
514
+ Returns:
515
+ URL of the pushed provenance
516
+ """
517
+ hub = HubIntegration(token=token)
518
+ return hub.push_provenance(graph, repo_id, private=private)
519
+
520
+
521
+ def pull_from_hub(repo_id: str, token: str = None) -> Optional[ProvenanceGraph]:
522
+ """
523
+ Convenience function to pull provenance from Hub.
524
+
525
+ Args:
526
+ repo_id: HuggingFace repo ID
527
+ token: HF token (optional)
528
+
529
+ Returns:
530
+ ProvenanceGraph if found
531
+ """
532
+ hub = HubIntegration(token=token)
533
+ return hub.pull_provenance(repo_id)
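
A hedged usage sketch for the bundle and the module-level helpers above. The ProvenanceGraph construction, the repo id, and the token are placeholders: the graph API lives in cascade/data/provenance.py, which is not part of this excerpt.

# Sketch only: assumes `graph` is a populated ProvenanceGraph built elsewhere,
# and that a HuggingFace token / dataset repo are available.
from cascade.data.hub import AccountabilityBundle, push_to_hub, pull_from_hub

bundle = AccountabilityBundle(graph)           # graph: a populated ProvenanceGraph (assumed)
bundle.export("./accountability_bundle")       # writes the seven artifact files locally
print(bundle.summary()["files_included"])

url = push_to_hub(graph, "username/my-dataset", token="hf_xxx", private=True)
restored = pull_from_hub("username/my-dataset", token="hf_xxx")  # None if no provenance file exists
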
cascade/data/license.py ADDED
@@ -0,0 +1,635 @@
1
+ """
2
+ SPDX License Tracking for CASCADE
3
+
4
+ Industry standard license tracking based on:
5
+ - SPDX (Software Package Data Exchange) - Linux Foundation
6
+ - HuggingFace Dataset Cards license field
7
+ - Croissant metadata license property
8
+
9
+ License Compatibility Rules:
10
+ - Permissive (MIT, Apache-2.0) → Can derive into restrictive
11
+ - Copyleft (GPL-3.0) → Derivatives must also be copyleft
12
+ - NonCommercial (CC-BY-NC-*) → Propagates non-commercial restriction
13
+ - ShareAlike (CC-BY-SA-*) → Derivatives must use same license
14
+ - NoDerivatives (CC-BY-ND-*) → Cannot create derivatives
15
+
16
+ References:
17
+ - https://spdx.org/licenses/
18
+ - https://creativecommons.org/licenses/
19
+ """
20
+
21
+ from dataclasses import dataclass, field
22
+ from enum import Enum
23
+ from typing import Dict, List, Optional, Set, Tuple, Any
24
+
25
+
26
+ class LicenseCategory(Enum):
27
+ """License categories for compatibility analysis."""
28
+ PERMISSIVE = "permissive" # MIT, Apache, BSD
29
+ WEAK_COPYLEFT = "weak-copyleft" # LGPL, MPL
30
+ STRONG_COPYLEFT = "strong-copyleft" # GPL, AGPL
31
+ CREATIVE_COMMONS = "creative-commons"
32
+ PUBLIC_DOMAIN = "public-domain" # CC0, Unlicense
33
+ PROPRIETARY = "proprietary"
34
+ UNKNOWN = "unknown"
35
+
36
+
37
+ class LicenseRestriction(Enum):
38
+ """License restrictions that propagate to derivatives."""
39
+ NONE = "none"
40
+ ATTRIBUTION = "attribution" # Must credit original
41
+ SHARE_ALIKE = "share-alike" # Derivatives same license
42
+ NON_COMMERCIAL = "non-commercial" # No commercial use
43
+ NO_DERIVATIVES = "no-derivatives" # Cannot modify
44
+ COPYLEFT = "copyleft" # Must open source derivatives
45
+
46
+
47
+ @dataclass
48
+ class SPDXLicense:
49
+ """
50
+ SPDX License Information.
51
+
52
+ Based on SPDX License List: https://spdx.org/licenses/
53
+ """
54
+ id: str # SPDX identifier (e.g., "MIT", "Apache-2.0")
55
+ name: str # Full name
56
+ category: LicenseCategory = LicenseCategory.UNKNOWN
57
+ restrictions: Set[LicenseRestriction] = field(default_factory=set)
58
+ osi_approved: bool = False # Open Source Initiative approved
59
+ fsf_libre: bool = False # FSF Free/Libre
60
+ url: Optional[str] = None # License text URL
61
+
62
+ def allows_commercial(self) -> bool:
63
+ """Check if license allows commercial use."""
64
+ return LicenseRestriction.NON_COMMERCIAL not in self.restrictions
65
+
66
+ def allows_derivatives(self) -> bool:
67
+ """Check if license allows creating derivatives."""
68
+ return LicenseRestriction.NO_DERIVATIVES not in self.restrictions
69
+
70
+ def requires_attribution(self) -> bool:
71
+ """Check if license requires attribution."""
72
+ return LicenseRestriction.ATTRIBUTION in self.restrictions
73
+
74
+ def requires_share_alike(self) -> bool:
75
+ """Check if license requires same license for derivatives."""
76
+ return (
77
+ LicenseRestriction.SHARE_ALIKE in self.restrictions or
78
+ LicenseRestriction.COPYLEFT in self.restrictions
79
+ )
80
+
81
+ def to_dict(self) -> Dict[str, Any]:
82
+ return {
83
+ "spdx_id": self.id,
84
+ "name": self.name,
85
+ "category": self.category.value,
86
+ "restrictions": [r.value for r in self.restrictions],
87
+ "osi_approved": self.osi_approved,
88
+ "fsf_libre": self.fsf_libre,
89
+ "url": self.url,
90
+ }
91
+
92
+
93
+ # SPDX License Registry - Common ML/Data licenses
94
+ SPDX_LICENSES: Dict[str, SPDXLicense] = {
95
+ # Public Domain
96
+ "CC0-1.0": SPDXLicense(
97
+ id="CC0-1.0",
98
+ name="Creative Commons Zero v1.0 Universal",
99
+ category=LicenseCategory.PUBLIC_DOMAIN,
100
+ restrictions=set(),
101
+ osi_approved=False,
102
+ fsf_libre=True,
103
+ url="https://creativecommons.org/publicdomain/zero/1.0/",
104
+ ),
105
+ "Unlicense": SPDXLicense(
106
+ id="Unlicense",
107
+ name="The Unlicense",
108
+ category=LicenseCategory.PUBLIC_DOMAIN,
109
+ restrictions=set(),
110
+ osi_approved=True,
111
+ fsf_libre=True,
112
+ url="https://unlicense.org/",
113
+ ),
114
+
115
+ # Permissive
116
+ "MIT": SPDXLicense(
117
+ id="MIT",
118
+ name="MIT License",
119
+ category=LicenseCategory.PERMISSIVE,
120
+ restrictions={LicenseRestriction.ATTRIBUTION},
121
+ osi_approved=True,
122
+ fsf_libre=True,
123
+ url="https://opensource.org/licenses/MIT",
124
+ ),
125
+ "Apache-2.0": SPDXLicense(
126
+ id="Apache-2.0",
127
+ name="Apache License 2.0",
128
+ category=LicenseCategory.PERMISSIVE,
129
+ restrictions={LicenseRestriction.ATTRIBUTION},
130
+ osi_approved=True,
131
+ fsf_libre=True,
132
+ url="https://www.apache.org/licenses/LICENSE-2.0",
133
+ ),
134
+ "BSD-2-Clause": SPDXLicense(
135
+ id="BSD-2-Clause",
136
+ name='BSD 2-Clause "Simplified" License',
137
+ category=LicenseCategory.PERMISSIVE,
138
+ restrictions={LicenseRestriction.ATTRIBUTION},
139
+ osi_approved=True,
140
+ fsf_libre=True,
141
+ url="https://opensource.org/licenses/BSD-2-Clause",
142
+ ),
143
+ "BSD-3-Clause": SPDXLicense(
144
+ id="BSD-3-Clause",
145
+ name='BSD 3-Clause "New" or "Revised" License',
146
+ category=LicenseCategory.PERMISSIVE,
147
+ restrictions={LicenseRestriction.ATTRIBUTION},
148
+ osi_approved=True,
149
+ fsf_libre=True,
150
+ url="https://opensource.org/licenses/BSD-3-Clause",
151
+ ),
152
+
153
+ # Creative Commons
154
+ "CC-BY-4.0": SPDXLicense(
155
+ id="CC-BY-4.0",
156
+ name="Creative Commons Attribution 4.0",
157
+ category=LicenseCategory.CREATIVE_COMMONS,
158
+ restrictions={LicenseRestriction.ATTRIBUTION},
159
+ osi_approved=False,
160
+ fsf_libre=True,
161
+ url="https://creativecommons.org/licenses/by/4.0/",
162
+ ),
163
+ "CC-BY-SA-4.0": SPDXLicense(
164
+ id="CC-BY-SA-4.0",
165
+ name="Creative Commons Attribution ShareAlike 4.0",
166
+ category=LicenseCategory.CREATIVE_COMMONS,
167
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.SHARE_ALIKE},
168
+ osi_approved=False,
169
+ fsf_libre=True,
170
+ url="https://creativecommons.org/licenses/by-sa/4.0/",
171
+ ),
172
+ "CC-BY-NC-4.0": SPDXLicense(
173
+ id="CC-BY-NC-4.0",
174
+ name="Creative Commons Attribution NonCommercial 4.0",
175
+ category=LicenseCategory.CREATIVE_COMMONS,
176
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NON_COMMERCIAL},
177
+ osi_approved=False,
178
+ fsf_libre=False,
179
+ url="https://creativecommons.org/licenses/by-nc/4.0/",
180
+ ),
181
+ "CC-BY-NC-SA-4.0": SPDXLicense(
182
+ id="CC-BY-NC-SA-4.0",
183
+ name="Creative Commons Attribution NonCommercial ShareAlike 4.0",
184
+ category=LicenseCategory.CREATIVE_COMMONS,
185
+ restrictions={
186
+ LicenseRestriction.ATTRIBUTION,
187
+ LicenseRestriction.NON_COMMERCIAL,
188
+ LicenseRestriction.SHARE_ALIKE,
189
+ },
190
+ osi_approved=False,
191
+ fsf_libre=False,
192
+ url="https://creativecommons.org/licenses/by-nc-sa/4.0/",
193
+ ),
194
+ "CC-BY-ND-4.0": SPDXLicense(
195
+ id="CC-BY-ND-4.0",
196
+ name="Creative Commons Attribution NoDerivatives 4.0",
197
+ category=LicenseCategory.CREATIVE_COMMONS,
198
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.NO_DERIVATIVES},
199
+ osi_approved=False,
200
+ fsf_libre=False,
201
+ url="https://creativecommons.org/licenses/by-nd/4.0/",
202
+ ),
203
+
204
+ # Weak Copyleft
205
+ "LGPL-3.0": SPDXLicense(
206
+ id="LGPL-3.0",
207
+ name="GNU Lesser General Public License v3.0",
208
+ category=LicenseCategory.WEAK_COPYLEFT,
209
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
210
+ osi_approved=True,
211
+ fsf_libre=True,
212
+ url="https://www.gnu.org/licenses/lgpl-3.0.html",
213
+ ),
214
+ "MPL-2.0": SPDXLicense(
215
+ id="MPL-2.0",
216
+ name="Mozilla Public License 2.0",
217
+ category=LicenseCategory.WEAK_COPYLEFT,
218
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
219
+ osi_approved=True,
220
+ fsf_libre=True,
221
+ url="https://www.mozilla.org/en-US/MPL/2.0/",
222
+ ),
223
+
224
+ # Strong Copyleft
225
+ "GPL-3.0": SPDXLicense(
226
+ id="GPL-3.0",
227
+ name="GNU General Public License v3.0",
228
+ category=LicenseCategory.STRONG_COPYLEFT,
229
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
230
+ osi_approved=True,
231
+ fsf_libre=True,
232
+ url="https://www.gnu.org/licenses/gpl-3.0.html",
233
+ ),
234
+ "AGPL-3.0": SPDXLicense(
235
+ id="AGPL-3.0",
236
+ name="GNU Affero General Public License v3.0",
237
+ category=LicenseCategory.STRONG_COPYLEFT,
238
+ restrictions={LicenseRestriction.ATTRIBUTION, LicenseRestriction.COPYLEFT},
239
+ osi_approved=True,
240
+ fsf_libre=True,
241
+ url="https://www.gnu.org/licenses/agpl-3.0.html",
242
+ ),
243
+
244
+ # ML-Specific
245
+ "OpenRAIL": SPDXLicense(
246
+ id="OpenRAIL",
247
+ name="Open RAIL License",
248
+ category=LicenseCategory.PERMISSIVE,
249
+ restrictions={LicenseRestriction.ATTRIBUTION},
250
+ osi_approved=False,
251
+ fsf_libre=False,
252
+ url="https://huggingface.co/blog/open_rail",
253
+ ),
254
+ "OpenRAIL-M": SPDXLicense(
255
+ id="OpenRAIL-M",
256
+ name="Open RAIL-M License",
257
+ category=LicenseCategory.PERMISSIVE,
258
+ restrictions={LicenseRestriction.ATTRIBUTION},
259
+ osi_approved=False,
260
+ fsf_libre=False,
261
+ url="https://www.licenses.ai/blog/2022/8/26/bigscience-open-rail-m-license",
262
+ ),
263
+
264
+ # Special
265
+ "other": SPDXLicense(
266
+ id="other",
267
+ name="Other/Custom License",
268
+ category=LicenseCategory.UNKNOWN,
269
+ restrictions=set(),
270
+ osi_approved=False,
271
+ fsf_libre=False,
272
+ url=None,
273
+ ),
274
+ "unknown": SPDXLicense(
275
+ id="unknown",
276
+ name="Unknown License",
277
+ category=LicenseCategory.UNKNOWN,
278
+ restrictions=set(),
279
+ osi_approved=False,
280
+ fsf_libre=False,
281
+ url=None,
282
+ ),
283
+ }
284
+
285
+
286
+ def get_license(spdx_id: str) -> SPDXLicense:
287
+ """
288
+ Get license by SPDX identifier.
289
+
290
+ Args:
291
+ spdx_id: SPDX license identifier (case-insensitive)
292
+
293
+ Returns:
294
+ SPDXLicense object (unknown if not found)
295
+ """
296
+ # Normalize common variants
297
+ normalized = spdx_id.strip()
298
+
299
+ # Direct lookup
300
+ if normalized in SPDX_LICENSES:
301
+ return SPDX_LICENSES[normalized]
302
+
303
+ # Case-insensitive lookup
304
+ for key, lic in SPDX_LICENSES.items():
305
+ if key.lower() == normalized.lower():
306
+ return lic
307
+
308
+ # Common aliases
309
+ aliases = {
310
+ "mit": "MIT",
311
+ "apache": "Apache-2.0",
312
+ "apache2": "Apache-2.0",
313
+ "gpl": "GPL-3.0",
314
+ "gpl3": "GPL-3.0",
315
+ "lgpl": "LGPL-3.0",
316
+ "bsd": "BSD-3-Clause",
317
+ "cc0": "CC0-1.0",
318
+ "cc-by": "CC-BY-4.0",
319
+ "cc-by-sa": "CC-BY-SA-4.0",
320
+ "cc-by-nc": "CC-BY-NC-4.0",
321
+ "cc-by-nc-sa": "CC-BY-NC-SA-4.0",
322
+ "cc-by-nd": "CC-BY-ND-4.0",
323
+ "unlicense": "Unlicense",
324
+ "public domain": "CC0-1.0",
325
+ "openrail": "OpenRAIL",
326
+ }
327
+
328
+ lower_id = normalized.lower().replace("_", "-").replace(" ", "-")
329
+ if lower_id in aliases:
330
+ return SPDX_LICENSES[aliases[lower_id]]
331
+
332
+ # Return unknown
333
+ return SPDX_LICENSES["unknown"]
334
+
335
+
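
As an aside, a small sketch of how get_license resolves loose identifiers against the registry and alias table above; the import path is assumed from the diff layout.

from cascade.data.license import get_license   # module path assumed

print(get_license("apache2").id)                  # Apache-2.0 (via the alias table)
print(get_license("cc-by-sa").id)                 # CC-BY-SA-4.0
print(get_license("WTFPL").id)                    # unknown (not in the registry)
print(get_license("MIT").requires_attribution())  # True
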
336
+ @dataclass
337
+ class LicenseCompatibility:
338
+ """Result of license compatibility check."""
339
+ compatible: bool
340
+ derived_license: Optional[SPDXLicense] = None
341
+ issues: List[str] = field(default_factory=list)
342
+ warnings: List[str] = field(default_factory=list)
343
+ attribution_required: List[str] = field(default_factory=list) # Source IDs requiring attribution
344
+
345
+
346
+ class LicenseAnalyzer:
347
+ """
348
+ Analyze license compatibility for dataset derivation.
349
+
350
+ Rules:
351
+ 1. No-Derivatives: Cannot create derivatives
352
+ 2. Share-Alike: Must use same license
353
+ 3. Copyleft: Must use compatible copyleft license
354
+ 4. Non-Commercial: Restriction propagates
355
+ 5. Attribution: Must credit all sources
356
+ """
357
+
358
+ # License compatibility matrix (can this → derive into that?)
359
+ # Rows: source license category, Columns: derived license category
360
+ COMPATIBILITY_MATRIX = {
361
+ LicenseCategory.PUBLIC_DOMAIN: {
362
+ LicenseCategory.PUBLIC_DOMAIN: True,
363
+ LicenseCategory.PERMISSIVE: True,
364
+ LicenseCategory.CREATIVE_COMMONS: True,
365
+ LicenseCategory.WEAK_COPYLEFT: True,
366
+ LicenseCategory.STRONG_COPYLEFT: True,
367
+ LicenseCategory.PROPRIETARY: True,
368
+ },
369
+ LicenseCategory.PERMISSIVE: {
370
+ LicenseCategory.PUBLIC_DOMAIN: False,
371
+ LicenseCategory.PERMISSIVE: True,
372
+ LicenseCategory.CREATIVE_COMMONS: True,
373
+ LicenseCategory.WEAK_COPYLEFT: True,
374
+ LicenseCategory.STRONG_COPYLEFT: True,
375
+ LicenseCategory.PROPRIETARY: True,
376
+ },
377
+ LicenseCategory.CREATIVE_COMMONS: {
378
+ LicenseCategory.PUBLIC_DOMAIN: False,
379
+ LicenseCategory.PERMISSIVE: False, # Depends on specific CC
380
+ LicenseCategory.CREATIVE_COMMONS: True, # Depends on specific CC
381
+ LicenseCategory.WEAK_COPYLEFT: False,
382
+ LicenseCategory.STRONG_COPYLEFT: False,
383
+ LicenseCategory.PROPRIETARY: False,
384
+ },
385
+ LicenseCategory.WEAK_COPYLEFT: {
386
+ LicenseCategory.PUBLIC_DOMAIN: False,
387
+ LicenseCategory.PERMISSIVE: False,
388
+ LicenseCategory.CREATIVE_COMMONS: False,
389
+ LicenseCategory.WEAK_COPYLEFT: True,
390
+ LicenseCategory.STRONG_COPYLEFT: True,
391
+ LicenseCategory.PROPRIETARY: False,
392
+ },
393
+ LicenseCategory.STRONG_COPYLEFT: {
394
+ LicenseCategory.PUBLIC_DOMAIN: False,
395
+ LicenseCategory.PERMISSIVE: False,
396
+ LicenseCategory.CREATIVE_COMMONS: False,
397
+ LicenseCategory.WEAK_COPYLEFT: False,
398
+ LicenseCategory.STRONG_COPYLEFT: True,
399
+ LicenseCategory.PROPRIETARY: False,
400
+ },
401
+ }
402
+
403
+ def check_compatibility(
404
+ self,
405
+ source_licenses: List[Tuple[str, str]], # List of (entity_id, spdx_id)
406
+ target_license: Optional[str] = None,
407
+ ) -> LicenseCompatibility:
408
+ """
409
+ Check if source licenses allow derivation.
410
+
411
+ Args:
412
+ source_licenses: List of (entity_id, license_id) tuples
413
+ target_license: Intended license for derived work (optional)
414
+
415
+ Returns:
416
+ LicenseCompatibility result
417
+ """
418
+ if not source_licenses:
419
+ return LicenseCompatibility(
420
+ compatible=True,
421
+ derived_license=SPDX_LICENSES["unknown"],
422
+ )
423
+
424
+ issues = []
425
+ warnings = []
426
+ attribution_required = []
427
+
428
+ # Collect all restrictions
429
+ all_restrictions: Set[LicenseRestriction] = set()
430
+ licenses = []
431
+
432
+ for entity_id, spdx_id in source_licenses:
433
+ lic = get_license(spdx_id)
434
+ licenses.append((entity_id, lic))
435
+ all_restrictions.update(lic.restrictions)
436
+
437
+ # Track attribution requirements
438
+ if lic.requires_attribution():
439
+ attribution_required.append(entity_id)
440
+
441
+ # Check No-Derivatives
442
+ for entity_id, lic in licenses:
443
+ if LicenseRestriction.NO_DERIVATIVES in lic.restrictions:
444
+ issues.append(
445
+ f"Cannot derive from '{entity_id}': license '{lic.id}' prohibits derivatives"
446
+ )
447
+
448
+ if issues:
449
+ return LicenseCompatibility(
450
+ compatible=False,
451
+ issues=issues,
452
+ warnings=warnings,
453
+ attribution_required=attribution_required,
454
+ )
455
+
456
+ # Determine derived license
457
+ derived = self._compute_derived_license(licenses, all_restrictions)
458
+
459
+ # Check target license compatibility
460
+ if target_license:
461
+ target = get_license(target_license)
462
+ if not self._can_relicense(derived, target):
463
+ issues.append(
464
+ f"Cannot license derived work as '{target.id}': "
465
+ f"must use '{derived.id}' or compatible license"
466
+ )
467
+
468
+ # Add warnings
469
+ if LicenseRestriction.NON_COMMERCIAL in all_restrictions:
470
+ warnings.append("Derived work restricted to non-commercial use only")
471
+
472
+ if LicenseRestriction.SHARE_ALIKE in all_restrictions:
473
+ warnings.append(f"Derived work must use ShareAlike-compatible license: {derived.id}")
474
+
475
+ if LicenseRestriction.COPYLEFT in all_restrictions:
476
+ warnings.append(f"Derived work must use copyleft license: {derived.id}")
477
+
478
+ return LicenseCompatibility(
479
+ compatible=len(issues) == 0,
480
+ derived_license=derived,
481
+ issues=issues,
482
+ warnings=warnings,
483
+ attribution_required=attribution_required,
484
+ )
485
+
486
+ def _compute_derived_license(
487
+ self,
488
+ licenses: List[Tuple[str, SPDXLicense]],
489
+ all_restrictions: Set[LicenseRestriction],
490
+ ) -> SPDXLicense:
491
+ """
492
+ Compute the most restrictive license for derived work.
493
+
494
+ The derived license is the "lowest common denominator" that
495
+ satisfies all source license requirements.
496
+ """
497
+ # Priority: Strong Copyleft > Weak Copyleft > CC-SA > CC-NC > Permissive > Public Domain
498
+
499
+ has_strong_copyleft = any(
500
+ lic.category == LicenseCategory.STRONG_COPYLEFT
501
+ for _, lic in licenses
502
+ )
503
+ has_weak_copyleft = any(
504
+ lic.category == LicenseCategory.WEAK_COPYLEFT
505
+ for _, lic in licenses
506
+ )
507
+ has_share_alike = LicenseRestriction.SHARE_ALIKE in all_restrictions
508
+ has_non_commercial = LicenseRestriction.NON_COMMERCIAL in all_restrictions
509
+
510
+ # Strong copyleft dominates
511
+ if has_strong_copyleft:
512
+ for _, lic in licenses:
513
+ if lic.category == LicenseCategory.STRONG_COPYLEFT:
514
+ return lic
515
+
516
+ # Weak copyleft next
517
+ if has_weak_copyleft:
518
+ for _, lic in licenses:
519
+ if lic.category == LicenseCategory.WEAK_COPYLEFT:
520
+ return lic
521
+
522
+ # CC with restrictions
523
+ if has_share_alike and has_non_commercial:
524
+ return SPDX_LICENSES["CC-BY-NC-SA-4.0"]
525
+ elif has_share_alike:
526
+ return SPDX_LICENSES["CC-BY-SA-4.0"]
527
+ elif has_non_commercial:
528
+ return SPDX_LICENSES["CC-BY-NC-4.0"]
529
+
530
+ # Most permissive with attribution
531
+ if LicenseRestriction.ATTRIBUTION in all_restrictions:
532
+ # Check if any source requires specific license
533
+ for _, lic in licenses:
534
+ if lic.category == LicenseCategory.CREATIVE_COMMONS:
535
+ return lic
536
+ return SPDX_LICENSES["CC-BY-4.0"]
537
+
538
+ # Public domain
539
+ return SPDX_LICENSES["CC0-1.0"]
540
+
541
+ def _can_relicense(self, source: SPDXLicense, target: SPDXLicense) -> bool:
542
+ """Check if source license allows relicensing to target."""
543
+ # Same license is always OK
544
+ if source.id == target.id:
545
+ return True
546
+
547
+ # No relicensing from copyleft to non-copyleft
548
+ if LicenseRestriction.COPYLEFT in source.restrictions:
549
+ if LicenseRestriction.COPYLEFT not in target.restrictions:
550
+ return False
551
+
552
+ # No relicensing from share-alike to non-share-alike
553
+ if LicenseRestriction.SHARE_ALIKE in source.restrictions:
554
+ if LicenseRestriction.SHARE_ALIKE not in target.restrictions:
555
+ return False
556
+
557
+ # Non-commercial must propagate
558
+ if LicenseRestriction.NON_COMMERCIAL in source.restrictions:
559
+ if LicenseRestriction.NON_COMMERCIAL not in target.restrictions:
560
+ return False
561
+
562
+ return True
563
+
564
+ def generate_attribution(
565
+ self,
566
+ sources: List[Tuple[str, str, str]], # (entity_id, license_id, name)
567
+ ) -> str:
568
+ """
569
+ Generate attribution text for derived work.
570
+
571
+ Args:
572
+ sources: List of (entity_id, license_id, name) tuples
573
+
574
+ Returns:
575
+ Attribution text
576
+ """
577
+ lines = [
578
+ "## Attribution",
579
+ "",
580
+ "This dataset is derived from the following sources:",
581
+ "",
582
+ ]
583
+
584
+ for entity_id, license_id, name in sources:
585
+ lic = get_license(license_id)
586
+ if lic.requires_attribution():
587
+ line = f"- **{name}** (`{entity_id}`)"
588
+ if lic.url:
589
+ line += f" - Licensed under [{lic.id}]({lic.url})"
590
+ else:
591
+ line += f" - Licensed under {lic.id}"
592
+ lines.append(line)
593
+
594
+ if len(lines) == 4: # No attributions needed
595
+ return ""
596
+
597
+ lines.append("")
598
+ return "\n".join(lines)
599
+
600
+
601
+ # Singleton analyzer
602
+ _analyzer = LicenseAnalyzer()
603
+
604
+
605
+ def check_license_compatibility(
606
+ sources: List[Tuple[str, str]],
607
+ target: Optional[str] = None,
608
+ ) -> LicenseCompatibility:
609
+ """
610
+ Convenience function to check license compatibility.
611
+
612
+ Args:
613
+ sources: List of (entity_id, license_id) tuples
614
+ target: Intended license for derived work
615
+
616
+ Returns:
617
+ LicenseCompatibility result
618
+ """
619
+ return _analyzer.check_compatibility(sources, target)
620
+
621
+
622
+ def get_derived_license(sources: List[str]) -> SPDXLicense:
623
+ """
624
+ Get the appropriate license for a work derived from given sources.
625
+
626
+ Args:
627
+ sources: List of SPDX license identifiers
628
+
629
+ Returns:
630
+ SPDXLicense for the derived work
631
+ """
632
+ result = _analyzer.check_compatibility([
633
+ (f"source_{i}", lic) for i, lic in enumerate(sources)
634
+ ])
635
+ return result.derived_license or SPDX_LICENSES["unknown"]
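
A short compatibility-check sketch using the module-level helpers above; the entity ids are placeholders.

from cascade.data.license import check_license_compatibility, get_derived_license

result = check_license_compatibility(
    sources=[("dataset:a", "MIT"), ("dataset:b", "CC-BY-SA-4.0")],
    target="Apache-2.0",
)
print(result.compatible)            # False: ShareAlike blocks relicensing to Apache-2.0
print(result.derived_license.id)    # CC-BY-SA-4.0 (most restrictive common denominator)
print(result.attribution_required)  # ['dataset:a', 'dataset:b']

print(get_derived_license(["MIT", "CC0-1.0"]).id)  # CC-BY-4.0
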
cascade/data/live.py ADDED
@@ -0,0 +1,844 @@
1
+ """
2
+ Live Document Tracer
3
+
4
+ Real-time streaming of document-centric provenance events.
5
+ This is the LIVE version of what the export system freezes.
6
+
7
+ Instead of: Model runs → Process → Export frozen provenance
8
+ We do: Model runs → STREAM events → View live document highlights
9
+
10
+ Same data model as the observer/exporter, just streamed in real-time
11
+ with document snippet context attached.
12
+
13
+ Usage:
14
+ # Create observer with live streaming
15
+ observer = DatasetObserver("my_pipeline")
16
+ tracer = LiveDocumentTracer(observer)
17
+
18
+ # Subscribe to events
19
+ tracer.on_event(my_handler)
20
+
21
+ # Or stream to async consumer
22
+ async for event in tracer.stream():
23
+ render_highlight(event)
24
+ """
25
+
26
+ import asyncio
27
+ import json
28
+ import time
29
+ from dataclasses import dataclass, field
30
+ from enum import Enum
31
+ from typing import Any, AsyncGenerator, Callable, Dict, Generator, List, Optional, Set, Tuple
32
+ from queue import Queue
33
+ from threading import Lock
34
+ from pathlib import Path
35
+
36
+
37
+ class TraceEventType(Enum):
38
+ """Types of document trace events."""
39
+ # Data flow events
40
+ DOCUMENT_TOUCHED = "document_touched" # Model accessed this document/record
41
+ SPAN_HIGHLIGHTED = "span_highlighted" # Specific text span being processed
42
+ ASSOCIATION_CREATED = "association_created" # Link between two spans/documents
43
+
44
+ # Activity events
45
+ ACTIVITY_STARTED = "activity_started"
46
+ ACTIVITY_PROGRESS = "activity_progress"
47
+ ACTIVITY_COMPLETED = "activity_completed"
48
+
49
+ # Entity events
50
+ ENTITY_CREATED = "entity_created"
51
+ ENTITY_DERIVED = "entity_derived"
52
+
53
+ # Relationship events
54
+ LINK_CREATED = "link_created"
55
+
56
+
57
+ @dataclass
58
+ class DocumentSpan:
59
+ """
60
+ A span within a document being traced.
61
+
62
+ This is the atomic unit of live visualization -
63
+ the specific text/content the model is touching.
64
+ """
65
+ document_id: str # Entity or record ID
66
+ document_name: str # Human-readable name
67
+ field_name: str = "" # Column/field if applicable
68
+ row_index: int = -1 # Row if applicable
69
+
70
+ # The actual content span
71
+ text: str = "" # The snippet text
72
+ start_char: int = -1 # Start position in full text
73
+ end_char: int = -1 # End position in full text
74
+
75
+ # Visual hints
76
+ highlight_type: str = "default" # "source", "target", "match", "attention"
77
+ confidence: float = 1.0 # For attention/relevance visualization
78
+
79
+ # Metadata
80
+ metadata: Dict[str, Any] = field(default_factory=dict)
81
+
82
+ def to_dict(self) -> Dict[str, Any]:
83
+ return {
84
+ "document_id": self.document_id,
85
+ "document_name": self.document_name,
86
+ "field_name": self.field_name,
87
+ "row_index": self.row_index,
88
+ "text": self.text,
89
+ "start_char": self.start_char,
90
+ "end_char": self.end_char,
91
+ "highlight_type": self.highlight_type,
92
+ "confidence": self.confidence,
93
+ "metadata": self.metadata,
94
+ }
95
+
96
+
97
+ @dataclass
98
+ class DocumentAssociation:
99
+ """
100
+ An association between two document spans.
101
+
102
+ Represents the model saying "this connects to that".
103
+ """
104
+ source: DocumentSpan
105
+ target: DocumentSpan
106
+ association_type: str = "related" # "match", "derived", "similar", "references"
107
+ confidence: float = 1.0
108
+
109
+ # Why this association was made
110
+ reason: str = ""
111
+
112
+ def to_dict(self) -> Dict[str, Any]:
113
+ return {
114
+ "source": self.source.to_dict(),
115
+ "target": self.target.to_dict(),
116
+ "association_type": self.association_type,
117
+ "confidence": self.confidence,
118
+ "reason": self.reason,
119
+ }
120
+
121
+
122
+ @dataclass
123
+ class TraceEvent:
124
+ """
125
+ A single trace event for live document visualization.
126
+
127
+ This is what gets streamed to the UI in real-time.
128
+ """
129
+ event_type: TraceEventType
130
+ timestamp: float = field(default_factory=time.time)
131
+
132
+ # Activity context
133
+ activity_id: Optional[str] = None
134
+ activity_name: Optional[str] = None
135
+ activity_type: Optional[str] = None
136
+
137
+ # Document spans involved
138
+ spans: List[DocumentSpan] = field(default_factory=list)
139
+
140
+ # Association if this event creates one
141
+ association: Optional[DocumentAssociation] = None
142
+
143
+ # Progress for long operations
144
+ progress: Optional[float] = None # 0.0 to 1.0
145
+ progress_message: Optional[str] = None
146
+
147
+ # Raw provenance data (for export compatibility)
148
+ entity_id: Optional[str] = None
149
+ relationship_type: Optional[str] = None
150
+
151
+ # Metadata
152
+ metadata: Dict[str, Any] = field(default_factory=dict)
153
+
154
+ def to_dict(self) -> Dict[str, Any]:
155
+ return {
156
+ "event_type": self.event_type.value,
157
+ "timestamp": self.timestamp,
158
+ "activity_id": self.activity_id,
159
+ "activity_name": self.activity_name,
160
+ "activity_type": self.activity_type,
161
+ "spans": [s.to_dict() for s in self.spans],
162
+ "association": self.association.to_dict() if self.association else None,
163
+ "progress": self.progress,
164
+ "progress_message": self.progress_message,
165
+ "entity_id": self.entity_id,
166
+ "metadata": self.metadata,
167
+ }
168
+
169
+ def to_json(self) -> str:
170
+ return json.dumps(self.to_dict(), default=str)
171
+
172
+
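
A small illustrative sketch of the event payloads defined above, built by hand and serialized the way the tracer streams them; ids and text are placeholders.

# Sketch only: constructs the dataclasses defined above and inspects the JSON payload.
span_a = DocumentSpan(document_id="ent:a:0", document_name="dataset_a", field_name="text",
                      row_index=0, text="quarterly revenue rose", highlight_type="source")
span_b = DocumentSpan(document_id="ent:b:7", document_name="dataset_b", field_name="text",
                      row_index=7, text="Q3 revenue increased", highlight_type="match")

assoc = DocumentAssociation(source=span_a, target=span_b,
                            association_type="match", confidence=0.91,
                            reason="high embedding similarity")

event = TraceEvent(event_type=TraceEventType.ASSOCIATION_CREATED,
                   activity_name="dedupe", activity_type="compare",
                   spans=[span_a, span_b], association=assoc)
print(event.to_json())   # the same payload that emit() wraps into a tape record
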
173
+ class LiveDocumentTracer:
174
+ """
175
+ Real-time document tracing for live visualization.
176
+
177
+ Hooks into DatasetObserver to stream events as they happen,
178
+ enriched with document snippet context for visualization.
179
+
180
+ This is the LIVE version of what CroissantExporter freezes.
181
+
182
+ NEW: Now writes all events to a tape file (JSONL) for buffered playback!
183
+ """
184
+
185
+ def __init__(self, observer=None, buffer_size: int = 1000, log_dir: str = "./logs"):
186
+ """
187
+ Initialize tracer.
188
+
189
+ Args:
190
+ observer: DatasetObserver to hook into (optional)
191
+ buffer_size: Max events to buffer for replay
192
+ log_dir: Directory for tape files (JSONL logs)
193
+ """
194
+ self.observer = observer
195
+ self.buffer_size = buffer_size
196
+
197
+ # Event subscribers
198
+ self._handlers: List[Callable[[TraceEvent], None]] = []
199
+ self._async_handlers: List[Callable[[TraceEvent], Any]] = []
200
+
201
+ # Event buffer for replay/late subscribers
202
+ self._buffer: List[TraceEvent] = []
203
+ self._buffer_lock = Lock()
204
+
205
+ # Async queue for streaming
206
+ self._async_queue: Optional[asyncio.Queue] = None
207
+
208
+ # Current activity context
209
+ self._current_activity_id: Optional[str] = None
210
+ self._current_activity_name: Optional[str] = None
211
+ self._current_activity_type: Optional[str] = None
212
+
213
+ # Document context cache
214
+ self._document_cache: Dict[str, Dict[str, Any]] = {}
215
+
216
+ # === TAPE FILE FOR PLAYBACK ===
217
+ self._log_dir = Path(log_dir)
218
+ self._log_dir.mkdir(parents=True, exist_ok=True)
219
+ self._session_id = int(time.time())
220
+ self._tape_path = self._log_dir / f"unity_tape_{self._session_id}.jsonl"
221
+ self._tape_file = None
222
+ self._tape_lock = Lock()
223
+ self._event_count = 0
224
+
225
+ # ═══════════════════════════════════════════════════════════════════════════
226
+ # SUBSCRIPTION
227
+ # ═══════════════════════════════════════════════════════════════════════════
228
+
229
+ def on_event(self, handler: Callable[[TraceEvent], None]):
230
+ """Subscribe to trace events (sync handler)."""
231
+ self._handlers.append(handler)
232
+ return self # Allow chaining
233
+
234
+ def on_event_async(self, handler: Callable[[TraceEvent], Any]):
235
+ """Subscribe to trace events (async handler)."""
236
+ self._async_handlers.append(handler)
237
+ return self
238
+
239
+ def remove_handler(self, handler):
240
+ """Unsubscribe a handler."""
241
+ if handler in self._handlers:
242
+ self._handlers.remove(handler)
243
+ if handler in self._async_handlers:
244
+ self._async_handlers.remove(handler)
245
+
246
+ # ═══════════════════════════════════════════════════════════════════════════
247
+ # EVENT EMISSION
248
+ # ═══════════════════════════════════════════════════════════════════════════
249
+
250
+ def emit(self, event: TraceEvent):
251
+ """
252
+ Emit a trace event to all subscribers.
253
+
254
+ Called internally when provenance events occur.
255
+ Also writes to tape file for buffered playback!
256
+ """
257
+ self._event_count += 1
258
+
259
+ # Add to buffer
260
+ with self._buffer_lock:
261
+ self._buffer.append(event)
262
+ if len(self._buffer) > self.buffer_size:
263
+ self._buffer.pop(0)
264
+
265
+ # === WRITE TO TAPE (JSONL) ===
266
+ self._write_to_tape(event)
267
+
268
+ # Call sync handlers
269
+ for handler in self._handlers:
270
+ try:
271
+ handler(event)
272
+ except Exception as e:
273
+ print(f"Handler error: {e}")
274
+
275
+ # Queue for async handlers
276
+ if self._async_queue:
277
+ try:
278
+ self._async_queue.put_nowait(event)
279
+ except asyncio.QueueFull:
280
+ pass # Drop if queue full
281
+
282
+ def _write_to_tape(self, event: TraceEvent):
283
+ """Write event to tape file for later playback."""
284
+ try:
285
+ with self._tape_lock:
286
+ # Lazy open the file
287
+ if self._tape_file is None:
288
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
289
+ print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
290
+
291
+ # Build tape record with full context
292
+ record = {
293
+ "seq": self._event_count,
294
+ "event": event.to_dict(),
295
+ "session_id": self._session_id,
296
+ }
297
+
298
+ json_line = json.dumps(record, default=str) + "\n"
299
+ self._tape_file.write(json_line)
300
+ self._tape_file.flush()
301
+
302
+ # Debug: Log first few events
303
+ if self._event_count <= 3:
304
+ print(f"[CASCADE] 📝 Wrote event {self._event_count} to tape: {event.event_type}")
305
+ except Exception as e:
306
+ # Don't let tape errors break the main flow
307
+ print(f"[CASCADE] ⚠️ Tape write error: {e}")
308
+ pass
309
+
310
+ def _write_raw_to_tape(self, record: Dict[str, Any]):
311
+ """Write a raw record to tape file (for docspace events)."""
312
+ try:
313
+ with self._tape_lock:
314
+ # Lazy open the file
315
+ if self._tape_file is None:
316
+ self._tape_file = open(self._tape_path, "a", encoding="utf-8")
317
+ print(f"[CASCADE] 📼 Unity tape started: {self._tape_path}")
318
+
319
+ self._tape_file.write(json.dumps(record, default=str) + "\n")
320
+ self._tape_file.flush()
321
+ except Exception:
322
+ pass
323
+
324
+ # ═══════════════════════════════════════════════════════════════════════════
325
+ # DOCUMENT SPACE EVENTS (for polling iframe)
326
+ # ═══════════════════════════════════════════════════════════════════════════
327
+
328
+ def emit_entity(self, entity_id: str, source: str, text: str, index: int, side: str = "a"):
329
+ """
330
+ Emit an entity for Document Space visualization.
331
+
332
+ Args:
333
+ entity_id: Unique ID for the entity
334
+ source: Source dataset name
335
+ text: Preview text (truncated)
336
+ index: Row index in dataset
337
+ side: "a" or "b" to indicate which dataset
338
+ """
339
+ self._event_count += 1
340
+ record = {
341
+ "seq": self._event_count,
342
+ "type": "docspace_entity",
343
+ "side": side,
344
+ "data": {
345
+ "id": entity_id,
346
+ "source": source,
347
+ "text": text[:200],
348
+ "index": index,
349
+ },
350
+ "session_id": self._session_id,
351
+ }
352
+ self._write_raw_to_tape(record)
353
+
354
+ def emit_match(self, doc_a_id: str, doc_b_id: str, score: float):
355
+ """
356
+ Emit a match for Document Space visualization.
357
+
358
+ Args:
359
+ doc_a_id: ID of entity from dataset A
360
+ doc_b_id: ID of entity from dataset B
361
+ score: Similarity score (0-1)
362
+ """
363
+ self._event_count += 1
364
+ record = {
365
+ "seq": self._event_count,
366
+ "type": "docspace_match",
367
+ "data": {
368
+ "docA": doc_a_id,
369
+ "docB": doc_b_id,
370
+ "score": float(score),
371
+ },
372
+ "session_id": self._session_id,
373
+ }
374
+ self._write_raw_to_tape(record)
375
+
376
+ def emit_phase(self, phase: str, progress: float, message: str = ""):
377
+ """
378
+ Emit a phase update for Document Space.
379
+
380
+ Args:
381
+ phase: Current phase (embedding_a, embedding_b, comparing, complete)
382
+ progress: Progress 0-1
383
+ message: Status message
384
+ """
385
+ self._event_count += 1
386
+ record = {
387
+ "seq": self._event_count,
388
+ "type": "docspace_phase",
389
+ "data": {
390
+ "phase": phase,
391
+ "progress": float(progress),
392
+ "message": message,
393
+ },
394
+ "session_id": self._session_id,
395
+ }
396
+ self._write_raw_to_tape(record)
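A minimal sketch of driving the Document Space emitters above from a matching loop; the dataset names, entity IDs, and scores are invented for illustration, and the tracer comes from the module's own create_live_tracer helper defined further down in this file.

    # Illustrative driver for the docspace emitters (all values are made up).
    tracer = create_live_tracer()

    # Register one preview entity from each side of the comparison.
    tracer.emit_entity("a-0", source="patients", text="Jane Doe, 1984-03-02, Springfield", index=0, side="a")
    tracer.emit_entity("b-0", source="claims", text="J. Doe, claim #4411, Springfield", index=0, side="b")

    # Report progress while scoring, then record a confident match.
    tracer.emit_phase("comparing", progress=0.5, message="scoring candidate pairs")
    tracer.emit_match("a-0", "b-0", score=0.93)
    tracer.emit_phase("complete", progress=1.0)

    tracer.close_tape()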
397
+
398
+ def close_tape(self):
399
+ """Close the tape file (call when session ends)."""
400
+ with self._tape_lock:
401
+ if self._tape_file:
402
+ self._tape_file.close()
403
+ self._tape_file = None
404
+ print(f"[CASCADE] 📼 Unity tape closed: {self._event_count} events → {self._tape_path}")
405
+
406
+ def get_tape_path(self) -> Optional[Path]:
407
+ """Get the path to the current tape file (whether open or not)."""
408
+ return self._tape_path
409
+
410
+ @staticmethod
411
+ def load_tape(tape_path: str) -> List[Dict[str, Any]]:
412
+ """
413
+ Load events from a tape file for playback.
414
+
415
+ Args:
416
+ tape_path: Path to the .jsonl tape file
417
+
418
+ Returns:
419
+ List of event records in chronological order
420
+ """
421
+ events = []
422
+ with open(tape_path, "r", encoding="utf-8") as f:
423
+ for line in f:
424
+ line = line.strip()
425
+ if line:
426
+ try:
427
+ events.append(json.loads(line))
428
+ except json.JSONDecodeError:
429
+ pass # Skip malformed lines
430
+ return events
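A small sketch of reading a tape back for offline inspection; the file name is an example, and the record fields follow the seq/event/session_id and type/data shapes written by the methods above.

    # Hypothetical playback of a recorded tape.
    records = LiveDocumentTracer.load_tape("unity_session.jsonl")

    # Tally records by kind, falling back across the two record shapes.
    counts = {}
    for rec in records:
        kind = rec.get("type") or rec.get("event", {}).get("event_type", "unknown")
        counts[kind] = counts.get(kind, 0) + 1

    for kind, n in sorted(counts.items()):
        print(f"{kind}: {n}")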
431
+
432
+ async def stream(self) -> Generator[TraceEvent, None, None]:
433
+ """
434
+ Async generator for streaming events.
435
+
436
+ Usage:
437
+ async for event in tracer.stream():
438
+ await render(event)
439
+ """
440
+ self._async_queue = asyncio.Queue(maxsize=self.buffer_size)
441
+
442
+ # Replay buffer first
443
+ with self._buffer_lock:
444
+ for event in self._buffer:
445
+ yield event
446
+
447
+ # Then stream new events
448
+ while True:
449
+ event = await self._async_queue.get()
450
+ yield event
451
+
452
+ def get_buffer(self) -> List[TraceEvent]:
453
+ """Get buffered events for replay."""
454
+ with self._buffer_lock:
455
+ return list(self._buffer)
456
+
457
+ def clear_buffer(self):
458
+ """Clear the event buffer."""
459
+ with self._buffer_lock:
460
+ self._buffer.clear()
461
+
462
+ # ═══════════════════════════════════════════════════════════════════════════
463
+ # TRACING API - Call these to emit events
464
+ # ═══════════════════════════════════════════════════════════════════════════
465
+
466
+ def start_activity(
467
+ self,
468
+ activity_id: str,
469
+ activity_name: str,
470
+ activity_type: str = "transform",
471
+ ):
472
+ """Signal start of an activity (for context)."""
473
+ self._current_activity_id = activity_id
474
+ self._current_activity_name = activity_name
475
+ self._current_activity_type = activity_type
476
+
477
+ self.emit(TraceEvent(
478
+ event_type=TraceEventType.ACTIVITY_STARTED,
479
+ activity_id=activity_id,
480
+ activity_name=activity_name,
481
+ activity_type=activity_type,
482
+ ))
483
+
484
+ def end_activity(self, activity_id: str = None):
485
+ """Signal end of an activity."""
486
+ self.emit(TraceEvent(
487
+ event_type=TraceEventType.ACTIVITY_COMPLETED,
488
+ activity_id=activity_id or self._current_activity_id,
489
+ activity_name=self._current_activity_name,
490
+ activity_type=self._current_activity_type,
491
+ ))
492
+ self._current_activity_id = None
493
+ self._current_activity_name = None
494
+ self._current_activity_type = None
495
+
496
+ def report_progress(
497
+ self,
498
+ progress: float,
499
+ message: str = "",
500
+ activity_id: str = None,
501
+ ):
502
+ """Report progress on current activity."""
503
+ self.emit(TraceEvent(
504
+ event_type=TraceEventType.ACTIVITY_PROGRESS,
505
+ activity_id=activity_id or self._current_activity_id,
506
+ activity_name=self._current_activity_name,
507
+ progress=progress,
508
+ progress_message=message,
509
+ ))
510
+
511
+ def touch_document(
512
+ self,
513
+ document_id: str,
514
+ document_name: str,
515
+ snippet: str = "",
516
+ field_name: str = "",
517
+ row_index: int = -1,
518
+ highlight_type: str = "default",
519
+ confidence: float = 1.0,
520
+ **metadata,
521
+ ):
522
+ """
523
+ Signal that the model touched a document/record.
524
+
525
+ This creates a highlight in the live view.
526
+ """
527
+ span = DocumentSpan(
528
+ document_id=document_id,
529
+ document_name=document_name,
530
+ field_name=field_name,
531
+ row_index=row_index,
532
+ text=snippet,
533
+ highlight_type=highlight_type,
534
+ confidence=confidence,
535
+ metadata=metadata,
536
+ )
537
+
538
+ self.emit(TraceEvent(
539
+ event_type=TraceEventType.DOCUMENT_TOUCHED,
540
+ activity_id=self._current_activity_id,
541
+ activity_name=self._current_activity_name,
542
+ activity_type=self._current_activity_type,
543
+ spans=[span],
544
+ entity_id=document_id,
545
+ metadata=metadata,
546
+ ))
547
+
548
+ return span
549
+
550
+ def highlight_span(
551
+ self,
552
+ document_id: str,
553
+ document_name: str,
554
+ text: str,
555
+ start_char: int = -1,
556
+ end_char: int = -1,
557
+ field_name: str = "",
558
+ row_index: int = -1,
559
+ highlight_type: str = "attention",
560
+ confidence: float = 1.0,
561
+ **metadata,
562
+ ):
563
+ """
564
+ Highlight a specific span within a document.
565
+
566
+ For showing exactly where in the text the model is focusing.
567
+ """
568
+ span = DocumentSpan(
569
+ document_id=document_id,
570
+ document_name=document_name,
571
+ field_name=field_name,
572
+ row_index=row_index,
573
+ text=text,
574
+ start_char=start_char,
575
+ end_char=end_char,
576
+ highlight_type=highlight_type,
577
+ confidence=confidence,
578
+ metadata=metadata,
579
+ )
580
+
581
+ self.emit(TraceEvent(
582
+ event_type=TraceEventType.SPAN_HIGHLIGHTED,
583
+ activity_id=self._current_activity_id,
584
+ activity_name=self._current_activity_name,
585
+ activity_type=self._current_activity_type,
586
+ spans=[span],
587
+ metadata=metadata,
588
+ ))
589
+
590
+ return span
591
+
592
+ def create_association(
593
+ self,
594
+ source_doc_id: str,
595
+ source_doc_name: str,
596
+ source_text: str,
597
+ target_doc_id: str,
598
+ target_doc_name: str,
599
+ target_text: str,
600
+ association_type: str = "related",
601
+ confidence: float = 1.0,
602
+ reason: str = "",
603
+ **metadata,
604
+ ):
605
+ """
606
+ Create an association between two document spans.
607
+
608
+ This is the "A connects to B" visualization.
609
+ """
610
+ source = DocumentSpan(
611
+ document_id=source_doc_id,
612
+ document_name=source_doc_name,
613
+ text=source_text,
614
+ highlight_type="source",
615
+ confidence=confidence,
616
+ )
617
+
618
+ target = DocumentSpan(
619
+ document_id=target_doc_id,
620
+ document_name=target_doc_name,
621
+ text=target_text,
622
+ highlight_type="target",
623
+ confidence=confidence,
624
+ )
625
+
626
+ association = DocumentAssociation(
627
+ source=source,
628
+ target=target,
629
+ association_type=association_type,
630
+ confidence=confidence,
631
+ reason=reason,
632
+ )
633
+
634
+ self.emit(TraceEvent(
635
+ event_type=TraceEventType.ASSOCIATION_CREATED,
636
+ activity_id=self._current_activity_id,
637
+ activity_name=self._current_activity_name,
638
+ activity_type=self._current_activity_type,
639
+ spans=[source, target],
640
+ association=association,
641
+ metadata=metadata,
642
+ ))
643
+
644
+ return association
645
+
646
+ def entity_created(
647
+ self,
648
+ entity_id: str,
649
+ entity_name: str,
650
+ record_count: int = None,
651
+ **metadata,
652
+ ):
653
+ """Signal that a new entity was created in provenance."""
654
+ self.emit(TraceEvent(
655
+ event_type=TraceEventType.ENTITY_CREATED,
656
+ activity_id=self._current_activity_id,
657
+ activity_name=self._current_activity_name,
658
+ entity_id=entity_id,
659
+ metadata={"name": entity_name, "record_count": record_count, **metadata},
660
+ ))
661
+
662
+ def entity_derived(
663
+ self,
664
+ derived_id: str,
665
+ derived_name: str,
666
+ source_ids: List[str],
667
+ **metadata,
668
+ ):
669
+ """Signal that an entity was derived from others."""
670
+ self.emit(TraceEvent(
671
+ event_type=TraceEventType.ENTITY_DERIVED,
672
+ activity_id=self._current_activity_id,
673
+ activity_name=self._current_activity_name,
674
+ entity_id=derived_id,
675
+ metadata={"name": derived_name, "sources": source_ids, **metadata},
676
+ ))
677
+
678
+ def link_created(
679
+ self,
680
+ source_id: str,
681
+ target_id: str,
682
+ relationship_type: str,
683
+ **metadata,
684
+ ):
685
+ """Signal that a provenance link was created."""
686
+ self.emit(TraceEvent(
687
+ event_type=TraceEventType.LINK_CREATED,
688
+ activity_id=self._current_activity_id,
689
+ activity_name=self._current_activity_name,
690
+ relationship_type=relationship_type,
691
+ metadata={"source": source_id, "target": target_id, **metadata},
692
+ ))
693
+
694
+ # ═══════════════════════════════════════════════════════════════════════════
695
+ # EXPORT (Freeze the live state)
696
+ # ═══════════════════════════════════════════════════════════════════════════
697
+
698
+ def export_session(self) -> Dict[str, Any]:
699
+ """
700
+ Export the trace session as frozen data.
701
+
702
+ This is the bridge between live and export -
703
+ same data, just frozen at a point in time.
704
+ """
705
+ with self._buffer_lock:
706
+ return {
707
+ "events": [e.to_dict() for e in self._buffer],
708
+ "event_count": len(self._buffer),
709
+ "exported_at": time.time(),
710
+ }
711
+
712
+ def export_associations(self) -> List[Dict[str, Any]]:
713
+ """Export just the associations for visualization."""
714
+ associations = []
715
+ with self._buffer_lock:
716
+ for event in self._buffer:
717
+ if event.association:
718
+ associations.append(event.association.to_dict())
719
+ return associations
720
+
721
+ def export_timeline(self) -> List[Dict[str, Any]]:
722
+ """Export events as a timeline."""
723
+ timeline = []
724
+ with self._buffer_lock:
725
+ for event in self._buffer:
726
+ timeline.append({
727
+ "timestamp": event.timestamp,
728
+ "type": event.event_type.value,
729
+ "activity": event.activity_name,
730
+ "spans": len(event.spans),
731
+ "has_association": event.association is not None,
732
+ })
733
+ return timeline
734
+
735
+
736
+ # ═══════════════════════════════════════════════════════════════════════════════
737
+ # CONSOLE RENDERER - Simple text-based live view
738
+ # ═══════════════════════════════════════════════════════════════════════════════
739
+
740
+ class ConsoleTraceRenderer:
741
+ """
742
+ Simple console renderer for live document traces.
743
+
744
+ Good for debugging and terminal-based workflows.
745
+ """
746
+
747
+ def __init__(self, show_snippets: bool = True, max_snippet_len: int = 80):
748
+ self.show_snippets = show_snippets
749
+ self.max_snippet_len = max_snippet_len
750
+
751
+ def render(self, event: TraceEvent):
752
+ """Render event to console."""
753
+ timestamp = time.strftime("%H:%M:%S", time.localtime(event.timestamp))
754
+
755
+ if event.event_type == TraceEventType.ACTIVITY_STARTED:
756
+ print(f"\n[{timestamp}] ▶ {event.activity_name} ({event.activity_type})")
757
+ print("─" * 60)
758
+
759
+ elif event.event_type == TraceEventType.ACTIVITY_COMPLETED:
760
+ print("─" * 60)
761
+ print(f"[{timestamp}] ✓ {event.activity_name} completed")
762
+
763
+ elif event.event_type == TraceEventType.ACTIVITY_PROGRESS:
764
+ pct = int((event.progress or 0) * 100)
765
+ bar = "█" * (pct // 5) + "░" * (20 - pct // 5)
766
+ msg = event.progress_message or ""
767
+ print(f"\r[{timestamp}] [{bar}] {pct}% {msg}", end="", flush=True)
768
+ if pct >= 100:
769
+ print()
770
+
771
+ elif event.event_type == TraceEventType.DOCUMENT_TOUCHED:
772
+ for span in event.spans:
773
+ snippet = self._truncate(span.text)
774
+ print(f"[{timestamp}] 📄 {span.document_name}", end="")
775
+ if span.field_name:
776
+ print(f"[{span.field_name}]", end="")
777
+ if span.row_index >= 0:
778
+ print(f" row={span.row_index}", end="")
779
+ if self.show_snippets and snippet:
780
+ print(f"\n └─ \"{snippet}\"")
781
+ else:
782
+ print()
783
+
784
+ elif event.event_type == TraceEventType.SPAN_HIGHLIGHTED:
785
+ for span in event.spans:
786
+ snippet = self._truncate(span.text)
787
+ conf = f"{span.confidence:.0%}" if span.confidence < 1.0 else ""
788
+ print(f"[{timestamp}] 🔍 [{span.highlight_type}] {conf}")
789
+ if self.show_snippets and snippet:
790
+ print(f" └─ \"{snippet}\"")
791
+
792
+ elif event.event_type == TraceEventType.ASSOCIATION_CREATED:
793
+ assoc = event.association
794
+ if assoc:
795
+ src = self._truncate(assoc.source.text, 40)
796
+ tgt = self._truncate(assoc.target.text, 40)
797
+ print(f"[{timestamp}] 🔗 {assoc.association_type} ({assoc.confidence:.0%})")
798
+ print(f" ├─ \"{src}\"")
799
+ print(f" └─ \"{tgt}\"")
800
+ if assoc.reason:
801
+ print(f" ({assoc.reason})")
802
+
803
+ elif event.event_type == TraceEventType.ENTITY_CREATED:
804
+ name = event.metadata.get("name", event.entity_id)
805
+ count = event.metadata.get("record_count", "?")
806
+ print(f"[{timestamp}] ✦ Entity created: {name} ({count} records)")
807
+
808
+ elif event.event_type == TraceEventType.ENTITY_DERIVED:
809
+ name = event.metadata.get("name", event.entity_id)
810
+ sources = event.metadata.get("sources", [])
811
+ print(f"[{timestamp}] ⤵ Entity derived: {name} ← {len(sources)} sources")
812
+
813
+ def _truncate(self, text: str, max_len: int = None) -> str:
814
+ max_len = max_len or self.max_snippet_len
815
+ if not text:
816
+ return ""
817
+ text = text.replace("\n", " ").strip()
818
+ if len(text) > max_len:
819
+ return text[:max_len-3] + "..."
820
+ return text
821
+
822
+
823
+ # ═══════════════════════════════════════════════════════════════════════════════
824
+ # CONVENIENCE
825
+ # ═══════════════════════════════════════════════════════════════════════════════
826
+
827
+ def create_live_tracer(observer=None, console: bool = False) -> LiveDocumentTracer:
828
+ """
829
+ Create a live document tracer.
830
+
831
+ Args:
832
+ observer: DatasetObserver to hook into
833
+ console: If True, attach console renderer
834
+
835
+ Returns:
836
+ Configured LiveDocumentTracer
837
+ """
838
+ tracer = LiveDocumentTracer(observer)
839
+
840
+ if console:
841
+ renderer = ConsoleTraceRenderer()
842
+ tracer.on_event(renderer.render)
843
+
844
+ return tracer
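A compact end-to-end sketch of the tracing API above, assuming the classes are importable from this module (shown here as cascade.data.live; adjust the path if it differs); the activity, documents, and scores are placeholders.

    # Illustrative live-tracing session with the console renderer attached.
    from cascade.data.live import create_live_tracer  # assumed import path

    tracer = create_live_tracer(console=True)

    tracer.start_activity("act-1", "match_patients_claims", activity_type="entity_resolution")
    tracer.report_progress(0.25, "embedding records")

    tracer.touch_document("patients:0", "patients", snippet="Jane Doe, Springfield", row_index=0)
    tracer.create_association(
        source_doc_id="patients:0", source_doc_name="patients", source_text="Jane Doe",
        target_doc_id="claims:7", target_doc_name="claims", target_text="J. Doe",
        association_type="same_entity", confidence=0.93, reason="name and city overlap",
    )

    tracer.report_progress(1.0, "done")
    tracer.end_activity()

    snapshot = tracer.export_session()       # frozen copy of the buffered events
    links = tracer.export_associations()     # just the association records
    tracer.close_tape()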
cascade/data/observer.py ADDED
@@ -0,0 +1,666 @@
1
+ """
2
+ Dataset Observer
3
+
4
+ The main interface for observing datasets.
5
+ Provides context managers for tracking ingest, transform, and consume operations.
6
+ """
7
+
8
+ import hashlib
9
+ import time
10
+ from contextlib import contextmanager
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Callable, Dict, Generator, List, Optional, Union
13
+
14
+ from .entities import (
15
+ DatasetEntity, Activity, Agent, Relationship, RelationType,
16
+ ActivityType, AgentType, create_system_agent, create_model_agent, create_user_agent
17
+ )
18
+ from .provenance import ProvenanceGraph
19
+ from .schema import SchemaObserver, DatasetSchema, hash_content
20
+
21
+
22
+ @dataclass
23
+ class ObservationContext:
24
+ """
25
+ Context for an ongoing observation.
26
+
27
+ Used within context managers to track inputs/outputs.
28
+ """
29
+ activity: Activity
30
+ observer: "DatasetObserver"
31
+
32
+ _inputs: List[DatasetEntity] = field(default_factory=list)
33
+ _outputs: List[DatasetEntity] = field(default_factory=list)
34
+
35
+ def input(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
36
+ """
37
+ Register an input dataset.
38
+
39
+ Args:
40
+ dataset: HuggingFace Dataset, DatasetDict, or entity ID
41
+ name: Optional name override
42
+ **kwargs: Additional entity attributes
43
+
44
+ Returns:
45
+ The created or retrieved DatasetEntity
46
+ """
47
+ # If string, assume it's an existing entity ID
48
+ if isinstance(dataset, str):
49
+ entity = self.observer.graph.get_entity(dataset)
50
+ if entity:
51
+ self._inputs.append(entity)
52
+ self.activity.add_input(entity.id)
53
+ self.observer.graph.link_usage(self.activity.id, entity.id)
54
+ return entity
55
+ else:
56
+ raise ValueError(f"Entity not found: {dataset}")
57
+
58
+ # Otherwise, observe the dataset
59
+ entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
60
+ self._inputs.append(entity)
61
+ self.activity.add_input(entity.id)
62
+ self.observer.graph.link_usage(self.activity.id, entity.id)
63
+
64
+ return entity
65
+
66
+ def output(self, dataset, name: str = None, **kwargs) -> DatasetEntity:
67
+ """
68
+ Register an output dataset.
69
+
70
+ Args:
71
+ dataset: HuggingFace Dataset, DatasetDict, or dict
72
+ name: Optional name override
73
+ **kwargs: Additional entity attributes
74
+
75
+ Returns:
76
+ The created DatasetEntity
77
+ """
78
+ entity = self.observer.observe_dataset(dataset, name=name, **kwargs)
79
+ self._outputs.append(entity)
80
+ self.activity.add_output(entity.id)
81
+
82
+ # Link generation
83
+ self.observer.graph.link_generation(entity.id, self.activity.id)
84
+
85
+ # Link derivation from all inputs
86
+ for input_entity in self._inputs:
87
+ self.observer.graph.link_derivation(entity.id, input_entity.id)
88
+
89
+ return entity
90
+
91
+ @property
92
+ def inputs(self) -> List[DatasetEntity]:
93
+ return self._inputs
94
+
95
+ @property
96
+ def outputs(self) -> List[DatasetEntity]:
97
+ return self._outputs
98
+
99
+
100
+ class DatasetObserver:
101
+ """
102
+ Observer for dataset operations.
103
+
104
+ Tracks:
105
+ - Dataset loading (ingest)
106
+ - Transformations (filter, map, join, etc.)
107
+ - Consumption (training, inference)
108
+
109
+ Example:
110
+ observer = DatasetObserver()
111
+
112
+ with observer.observe_ingest("squad") as ctx:
113
+ ds = load_dataset("squad")
114
+ ctx.output(ds)
115
+
116
+ with observer.observe_transform("filter_english") as ctx:
117
+ ctx.input(ds)
118
+ filtered = ds.filter(lambda x: x["lang"] == "en")
119
+ ctx.output(filtered)
120
+
121
+ chain = observer.export_provenance()
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ name: str = "default",
127
+ agent: Agent = None,
128
+ ):
129
+ """
130
+ Initialize observer.
131
+
132
+ Args:
133
+ name: Name for the provenance graph
134
+ agent: Default agent for activities (defaults to graph's system agent)
135
+ """
136
+ self.graph = ProvenanceGraph(name=name)
137
+ self.schema_observer = SchemaObserver()
138
+
139
+ # Use provided agent or the graph's default system agent
140
+ if agent:
141
+ self._default_agent = agent
142
+ self.graph.add_agent(agent)
143
+ else:
144
+ # Use the graph's already-created system agent
145
+ self._default_agent = self.graph._system_agent
146
+
147
+ # Entity counter for unique IDs
148
+ self._counter = 0
149
+
150
+ def _next_id(self, prefix: str) -> str:
151
+ """Generate unique ID."""
152
+ self._counter += 1
153
+ return f"{prefix}:{int(time.time() * 1000)}:{self._counter:04d}"
154
+
155
+ # ═══════════════════════════════════════════════════════════════════════════
156
+ # DATASET OBSERVATION
157
+ # ═══════════════════════════════════════════════════════════════════════════
158
+
159
+ def observe_dataset(
160
+ self,
161
+ dataset,
162
+ name: str = None,
163
+ source_type: str = None,
164
+ source_uri: str = None,
165
+ version: str = None,
166
+ license_id: str = None,
167
+ license_url: str = None,
168
+ **kwargs,
169
+ ) -> DatasetEntity:
170
+ """
171
+ Observe a dataset and create an entity.
172
+
173
+ Args:
174
+ dataset: HuggingFace Dataset, DatasetDict, DataFrame, or dict
175
+ name: Name for the entity
176
+ source_type: Type of source (hf_hub, local, etc.)
177
+ source_uri: URI of the source
178
+ version: Version string
179
+ license_id: SPDX license identifier (e.g., "MIT", "CC-BY-4.0")
180
+ license_url: URL to the license text
181
+ **kwargs: Additional attributes
182
+
183
+ Returns:
184
+ DatasetEntity representing the dataset
185
+ """
186
+ # Infer name if not provided
187
+ if name is None:
188
+ if hasattr(dataset, 'info') and hasattr(dataset.info, 'dataset_name'):
189
+ name = dataset.info.dataset_name
190
+ elif hasattr(dataset, 'config_name'):
191
+ name = dataset.config_name
192
+ else:
193
+ name = f"dataset_{self._counter + 1}"
194
+
195
+ # Try to extract license from HuggingFace dataset info
196
+ if license_id is None and hasattr(dataset, 'info'):
197
+ info = dataset.info
198
+ if hasattr(info, 'license') and info.license:
199
+ license_id = info.license
200
+
201
+ # Observe schema
202
+ schema = self._observe_schema(dataset)
203
+
204
+ # Compute content hash
205
+ content_hash = self._compute_content_hash(dataset)
206
+
207
+ # Get record count and splits
208
+ record_count, splits = self._get_counts(dataset)
209
+
210
+ # Infer source
211
+ if source_type is None:
212
+ source_type = self._infer_source_type(dataset)
213
+
214
+ # Create entity
215
+ entity = DatasetEntity(
216
+ id=self._next_id("entity"),
217
+ name=name,
218
+ content_hash=content_hash,
219
+ schema_hash=schema.hash() if schema else None,
220
+ version=version,
221
+ source_type=source_type,
222
+ source_uri=source_uri,
223
+ license_id=license_id,
224
+ license_url=license_url,
225
+ record_count=record_count,
226
+ splits=splits,
227
+ attributes={
228
+ "schema": schema.to_dict() if schema else None,
229
+ **kwargs,
230
+ },
231
+ )
232
+
233
+ # Add to graph
234
+ self.graph.add_entity(entity)
235
+
236
+ return entity
237
+
238
+ def register_agent(self, name: str, agent_type: str = "software", version: str = None) -> Agent:
239
+ """
240
+ Register a new agent in the provenance graph.
241
+
242
+ Args:
243
+ name: Name of the agent
244
+ agent_type: Type of agent (software, model, person, etc.)
245
+ version: Optional version string
246
+
247
+ Returns:
248
+ The created Agent
249
+ """
250
+ if agent_type == "model":
251
+ agent = create_model_agent(name, version=version)
252
+ elif agent_type == "system":
253
+ agent = create_system_agent(name, version=version)
254
+ elif agent_type == "person":
255
+ agent = create_user_agent(name)
256
+ else:
257
+ # Default to software agent or generic
258
+ try:
259
+ type_enum = AgentType(agent_type)
260
+ except ValueError:
261
+ type_enum = AgentType.SOFTWARE
262
+
263
+ agent = Agent(
264
+ id=f"agent:{type_enum.value}:{name.replace(' ', '_').lower()}",
265
+ agent_type=type_enum,
266
+ name=name,
267
+ version=version
268
+ )
269
+
270
+ self.graph.add_agent(agent)
271
+ return agent
272
+
273
+ def _observe_schema(self, dataset) -> Optional[DatasetSchema]:
274
+ """Extract schema from dataset."""
275
+ try:
276
+ # HuggingFace Dataset
277
+ if hasattr(dataset, 'features'):
278
+ return self.schema_observer.observe_hf_dataset(dataset)
279
+
280
+ # Pandas DataFrame
281
+ if hasattr(dataset, 'dtypes') and hasattr(dataset, 'columns'):
282
+ return self.schema_observer.observe_pandas(dataset)
283
+
284
+ # Dict
285
+ if isinstance(dataset, dict):
286
+ # Check if it's columnar (dict of lists)
287
+ if all(isinstance(v, list) for v in dataset.values()):
288
+ return self.schema_observer.observe_dict(dataset)
289
+
290
+ return None
291
+ except Exception as e:
292
+ # Don't fail observation if schema extraction fails
293
+ print(f"Warning: Could not extract schema: {e}")
294
+ return None
295
+
296
+ def _compute_content_hash(self, dataset) -> str:
297
+ """Compute content hash of dataset."""
298
+ try:
299
+ return hash_content(dataset)
300
+ except Exception:
301
+ # Fallback to timestamp-based hash
302
+ return hashlib.sha256(str(time.time()).encode()).hexdigest()
303
+
304
+ def _get_counts(self, dataset) -> tuple:
305
+ """Get record count and split counts."""
306
+ record_count = None
307
+ splits = {}
308
+
309
+ try:
310
+ # HuggingFace DatasetDict
311
+ if hasattr(dataset, 'keys') and hasattr(dataset, '__getitem__'):
312
+ for split_name in dataset.keys():
313
+ split_ds = dataset[split_name]
314
+ if hasattr(split_ds, '__len__'):
315
+ splits[split_name] = len(split_ds)
316
+ record_count = sum(splits.values()) if splits else None
317
+
318
+ # Single dataset
319
+ elif hasattr(dataset, '__len__'):
320
+ record_count = len(dataset)
321
+
322
+ except Exception:
323
+ pass
324
+
325
+ return record_count, splits
326
+
327
+ def _infer_source_type(self, dataset) -> str:
328
+ """Infer source type from dataset."""
329
+ # HuggingFace Dataset
330
+ if hasattr(dataset, '_info'):
331
+ return "hf_dataset"
332
+
333
+ # Pandas
334
+ if hasattr(dataset, 'dtypes'):
335
+ return "pandas"
336
+
337
+ # Dict
338
+ if isinstance(dataset, dict):
339
+ return "dict"
340
+
341
+ return "unknown"
342
+
343
+ # ═══════════════════════════════════════════════════════════════════════════
344
+ # CONTEXT MANAGERS
345
+ # ═══════════════════════════════════════════════════════════════════════════
346
+
347
+ @contextmanager
348
+ def observe_ingest(
349
+ self,
350
+ name: str,
351
+ source_uri: str = None,
352
+ agent: Agent = None,
353
+ **kwargs,
354
+ ) -> Generator[ObservationContext, None, None]:
355
+ """
356
+ Observe a dataset ingest operation.
357
+
358
+ Args:
359
+ name: Name of the ingest operation
360
+ source_uri: URI of the data source
361
+ agent: Agent performing the ingest
362
+ **kwargs: Additional activity parameters
363
+
364
+ Yields:
365
+ ObservationContext for registering inputs/outputs
366
+
367
+ Example:
368
+ with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
369
+ ds = load_dataset("squad")
370
+ ctx.output(ds, name="squad")
371
+ """
372
+ activity = Activity(
373
+ id=self._next_id("activity"),
374
+ activity_type=ActivityType.INGEST,
375
+ name=name,
376
+ agent_id=(agent or self._default_agent).id,
377
+ parameters={"source_uri": source_uri, **kwargs},
378
+ )
379
+ activity.start()
380
+
381
+ ctx = ObservationContext(activity=activity, observer=self)
382
+
383
+ try:
384
+ yield ctx
385
+ finally:
386
+ activity.end()
387
+ self.graph.add_activity(activity)
388
+ self.graph.link_association(activity.id, activity.agent_id)
389
+
390
+ @contextmanager
391
+ def observe_transform(
392
+ self,
393
+ name: str,
394
+ transform_type: str = None,
395
+ agent: Agent = None,
396
+ **kwargs,
397
+ ) -> Generator[ObservationContext, None, None]:
398
+ """
399
+ Observe a dataset transformation.
400
+
401
+ Args:
402
+ name: Name of the transform
403
+ transform_type: Type of transform (filter, map, join, etc.)
404
+ agent: Agent performing the transform
405
+ **kwargs: Additional activity parameters
406
+
407
+ Yields:
408
+ ObservationContext for registering inputs/outputs
409
+
410
+ Example:
411
+ with observer.observe_transform("filter_english") as ctx:
412
+ ctx.input(ds)
413
+ filtered = ds.filter(lambda x: x["lang"] == "en")
414
+ ctx.output(filtered)
415
+ """
416
+ activity = Activity(
417
+ id=self._next_id("activity"),
418
+ activity_type=ActivityType.TRANSFORM,
419
+ name=name,
420
+ agent_id=(agent or self._default_agent).id,
421
+ parameters={"transform_type": transform_type, **kwargs},
422
+ )
423
+ activity.start()
424
+
425
+ ctx = ObservationContext(activity=activity, observer=self)
426
+
427
+ try:
428
+ yield ctx
429
+ finally:
430
+ activity.end()
431
+ self.graph.add_activity(activity)
432
+ self.graph.link_association(activity.id, activity.agent_id)
433
+
434
+ @contextmanager
435
+ def observe_consume(
436
+ self,
437
+ name: str,
438
+ model_id: str = None,
439
+ consume_type: str = "train",
440
+ agent: Agent = None,
441
+ **kwargs,
442
+ ) -> Generator[ObservationContext, None, None]:
443
+ """
444
+ Observe dataset consumption (training, inference).
445
+
446
+ Args:
447
+ name: Name of the consumption operation
448
+ model_id: ID of the model consuming the data
449
+ consume_type: Type of consumption (train, evaluate, inference)
450
+ agent: Agent performing the consumption
451
+ **kwargs: Additional activity parameters
452
+
453
+ Yields:
454
+ ObservationContext for registering inputs/outputs
455
+
456
+ Example:
457
+ with observer.observe_consume("train_qa_model", model_id="bert-base") as ctx:
458
+ ctx.input(train_ds)
459
+ model = train(train_ds)
460
+ # Model provenance now links to data provenance!
461
+ """
462
+ # Create model agent if model_id provided
463
+ if model_id and agent is None:
464
+ agent = create_model_agent(model_id)
465
+ self.graph.add_agent(agent)
466
+
467
+ activity_type = {
468
+ "train": ActivityType.TRAIN,
469
+ "evaluate": ActivityType.EVALUATE,
470
+ "inference": ActivityType.INFERENCE,
471
+ }.get(consume_type, ActivityType.TRAIN)
472
+
473
+ activity = Activity(
474
+ id=self._next_id("activity"),
475
+ activity_type=activity_type,
476
+ name=name,
477
+ agent_id=(agent or self._default_agent).id,
478
+ parameters={"model_id": model_id, "consume_type": consume_type, **kwargs},
479
+ )
480
+ activity.start()
481
+
482
+ ctx = ObservationContext(activity=activity, observer=self)
483
+
484
+ try:
485
+ yield ctx
486
+ finally:
487
+ activity.end()
488
+ self.graph.add_activity(activity)
489
+ self.graph.link_association(activity.id, activity.agent_id)
490
+
491
+ @contextmanager
492
+ def observe_entity_resolution(
493
+ self,
494
+ name: str,
495
+ model_id: str = None,
496
+ threshold: float = None,
497
+ agent: Agent = None,
498
+ **kwargs,
499
+ ) -> Generator[ObservationContext, None, None]:
500
+ """
501
+ Observe entity resolution / data unity operation.
502
+
503
+ Args:
504
+ name: Name of the operation
505
+ model_id: Embedding model used
506
+ threshold: Similarity threshold
507
+ agent: Agent performing the operation
508
+ **kwargs: Additional parameters
509
+
510
+ Example:
511
+ with observer.observe_entity_resolution("match_patients_claims") as ctx:
512
+ ctx.input(patients_ds)
513
+ ctx.input(claims_ds)
514
+ unified = run_unity(patients_ds, claims_ds)
515
+ ctx.output(unified)
516
+ """
517
+ if model_id and agent is None:
518
+ agent = create_model_agent(model_id)
519
+ self.graph.add_agent(agent)
520
+
521
+ activity = Activity(
522
+ id=self._next_id("activity"),
523
+ activity_type=ActivityType.ENTITY_RESOLUTION,
524
+ name=name,
525
+ agent_id=(agent or self._default_agent).id,
526
+ parameters={
527
+ "model_id": model_id,
528
+ "threshold": threshold,
529
+ **kwargs,
530
+ },
531
+ )
532
+ activity.start()
533
+
534
+ ctx = ObservationContext(activity=activity, observer=self)
535
+
536
+ try:
537
+ yield ctx
538
+ finally:
539
+ activity.end()
540
+ self.graph.add_activity(activity)
541
+ self.graph.link_association(activity.id, activity.agent_id)
542
+
543
+ # ═══════════════════════════════════════════════════════════════════════════
544
+ # EXPORT
545
+ # ═══════════════════════════════════════════════════════════════════════════
546
+
547
+ def export_provenance(self) -> ProvenanceGraph:
548
+ """Export the provenance graph."""
549
+ return self.graph
550
+
551
+ def to_dict(self) -> Dict[str, Any]:
552
+ """Export observation state to dictionary."""
553
+ return {
554
+ "graph": self.graph.to_dict(),
555
+ "counter": self._counter,
556
+ }
557
+
558
+ @classmethod
559
+ def from_dict(cls, data: Dict[str, Any]) -> "DatasetObserver":
560
+ """Load observer from dictionary."""
561
+ observer = cls()
562
+ observer.graph = ProvenanceGraph.from_dict(data["graph"])
563
+ observer._counter = data.get("counter", 0)
564
+ return observer
565
+
566
+ # ═══════════════════════════════════════════════════════════════════════════
567
+ # STATISTICS
568
+ # ═══════════════════════════════════════════════════════════════════════════
569
+
570
+ @property
571
+ def stats(self) -> Dict[str, Any]:
572
+ """Get observer statistics."""
573
+ return {
574
+ "graph": self.graph.stats,
575
+ "root_hash": self.graph.root_hash,
576
+ }
577
+
578
+ # ═══════════════════════════════════════════════════════════════════════════
579
+ # LICENSE TRACKING
580
+ # ═══════════════════════════════════════════════════════════════════════════
581
+
582
+ def check_license_compatibility(
583
+ self,
584
+ entity_ids: List[str],
585
+ target_license: str = None,
586
+ ):
587
+ """
588
+ Check license compatibility for deriving from entities.
589
+
590
+ Args:
591
+ entity_ids: List of source entity IDs
592
+ target_license: Intended SPDX license for derived work
593
+
594
+ Returns:
595
+ LicenseCompatibility result
596
+
597
+ Example:
598
+ result = observer.check_license_compatibility(
599
+ ["entity:123", "entity:456"],
600
+ target_license="MIT"
601
+ )
602
+ if not result.compatible:
603
+ print(f"Issues: {result.issues}")
604
+ """
605
+ from .license import check_license_compatibility
606
+
607
+ sources = []
608
+ for entity_id in entity_ids:
609
+ entity = self.graph.get_entity(entity_id)
610
+ if entity:
611
+ license_id = entity.license_id or "unknown"
612
+ sources.append((entity_id, license_id))
613
+
614
+ return check_license_compatibility(sources, target_license)
615
+
616
+ def get_derived_license(self, entity_ids: List[str]):
617
+ """
618
+ Get the appropriate license for a work derived from entities.
619
+
620
+ Args:
621
+ entity_ids: List of source entity IDs
622
+
623
+ Returns:
624
+ SPDXLicense for the derived work
625
+ """
626
+ from .license import get_derived_license
627
+
628
+ licenses = []
629
+ for entity_id in entity_ids:
630
+ entity = self.graph.get_entity(entity_id)
631
+ if entity and entity.license_id:
632
+ licenses.append(entity.license_id)
633
+
634
+ return get_derived_license(licenses) if licenses else None
635
+
636
+ def generate_attribution(self, entity_ids: List[str] = None) -> str:
637
+ """
638
+ Generate attribution text for entities.
639
+
640
+ Args:
641
+ entity_ids: List of entity IDs (defaults to all entities)
642
+
643
+ Returns:
644
+ Markdown attribution text
645
+ """
646
+ from .license import LicenseAnalyzer
647
+
648
+ analyzer = LicenseAnalyzer()
649
+
650
+ if entity_ids is None:
651
+ entities = self.graph.list_entities()
652
+ else:
653
+ entities = [
654
+ self.graph.get_entity(eid) for eid in entity_ids
655
+ if self.graph.get_entity(eid)
656
+ ]
657
+
658
+ sources = [
659
+ (e.id, e.license_id or "unknown", e.name)
660
+ for e in entities
661
+ ]
662
+
663
+ return analyzer.generate_attribution(sources)
664
+
665
+ def __repr__(self) -> str:
666
+ return f"DatasetObserver({self.graph})"
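A compact sketch tying the observer's context managers and license helpers together; load_dataset stands in for any loader, and the dataset fields and license IDs are placeholders rather than verified metadata.

    # Illustrative pipeline: ingest -> transform, then check source licenses.
    observer = DatasetObserver(name="demo")

    with observer.observe_ingest("load_squad", source_uri="hf://squad") as ctx:
        ds = load_dataset("squad")                       # placeholder loader
        squad = ctx.output(ds, name="squad", license_id="CC-BY-SA-4.0")

    with observer.observe_transform("keep_short_answers", transform_type="filter") as ctx:
        ctx.input(squad.id)                              # reuse the recorded entity by ID
        short = ds.filter(lambda x: len(x["answers"]["text"][0]) < 20)
        derived = ctx.output(short, name="squad_short")

    compat = observer.check_license_compatibility([squad.id], target_license="MIT")
    print(compat)                                        # see .compatible / .issues
    print(observer.generate_attribution([squad.id]))
    print(observer.stats)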
cascade/data/pii.py ADDED
@@ -0,0 +1,748 @@
1
+ """
2
+ PII Detection for CASCADE
3
+
4
+ Industry standard PII (Personally Identifiable Information) detection
5
+ based on Microsoft Presidio patterns and common PII taxonomies.
6
+
7
+ References:
8
+ - Microsoft Presidio: https://github.com/microsoft/presidio
9
+ - NIST PII Guide: https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-122.pdf
10
+ - GDPR Article 4 (personal data definition)
11
+
12
+ PII Categories:
13
+ 1. Direct Identifiers: Name, SSN, passport, driver's license
14
+ 2. Quasi-Identifiers: Age, ZIP code, gender, dates
15
+ 3. Sensitive Data: Health, financial, biometric
16
+
17
+ Detection Methods:
18
+ - Regex patterns (fast, high precision for structured PII)
19
+ - Context-aware detection (surrounding words improve accuracy)
20
+ - Checksum validation (SSN, credit cards, etc.)
21
+ """
22
+
23
+ import re
24
+ from dataclasses import dataclass, field
25
+ from enum import Enum
26
+ from typing import Any, Callable, Dict, List, Optional, Pattern, Set, Tuple
27
+
28
+
29
+ class PIIType(Enum):
30
+ """Types of PII that can be detected."""
31
+ # Direct Identifiers
32
+ PERSON_NAME = "PERSON_NAME"
33
+ EMAIL = "EMAIL"
34
+ PHONE_NUMBER = "PHONE_NUMBER"
35
+ SSN = "SSN" # Social Security Number
36
+ CREDIT_CARD = "CREDIT_CARD"
37
+ IBAN = "IBAN" # International Bank Account Number
38
+ IP_ADDRESS = "IP_ADDRESS"
39
+ MAC_ADDRESS = "MAC_ADDRESS"
40
+ PASSPORT = "PASSPORT"
41
+ DRIVERS_LICENSE = "DRIVERS_LICENSE"
42
+
43
+ # Quasi-Identifiers
44
+ DATE_OF_BIRTH = "DATE_OF_BIRTH"
45
+ AGE = "AGE"
46
+ ZIPCODE = "ZIPCODE"
47
+ ADDRESS = "ADDRESS"
48
+
49
+ # Sensitive Data
50
+ MEDICAL_RECORD = "MEDICAL_RECORD"
51
+ API_KEY = "API_KEY"
52
+ AWS_KEY = "AWS_KEY"
53
+ PASSWORD = "PASSWORD"
54
+ CRYPTO_WALLET = "CRYPTO_WALLET"
55
+
56
+ # Location
57
+ GPS_COORDINATES = "GPS_COORDINATES"
58
+
59
+ # URLs and IDs
60
+ URL = "URL"
61
+ USERNAME = "USERNAME"
62
+
63
+
64
+ class PIISeverity(Enum):
65
+ """Severity levels for PII findings."""
66
+ CRITICAL = "critical" # Direct identifier, immediate re-identification risk
67
+ HIGH = "high" # Sensitive data, significant privacy risk
68
+ MEDIUM = "medium" # Quasi-identifier, re-identification when combined
69
+ LOW = "low" # Minimal risk, contextual sensitivity
70
+
71
+
72
+ @dataclass
73
+ class PIIMatch:
74
+ """A detected PII instance."""
75
+ pii_type: PIIType
76
+ severity: PIISeverity
77
+ value: str # The matched text (may be redacted for display)
78
+ start: int # Start position in text
79
+ end: int # End position in text
80
+ confidence: float # 0.0 to 1.0
81
+ context: str = "" # Surrounding text for context
82
+ field_name: str = "" # Column/field where found
83
+ row_index: int = -1 # Row index if applicable
84
+
85
+ def to_dict(self) -> Dict[str, Any]:
86
+ return {
87
+ "type": self.pii_type.value,
88
+ "severity": self.severity.value,
89
+ "value_preview": self._redact(self.value),
90
+ "start": self.start,
91
+ "end": self.end,
92
+ "confidence": self.confidence,
93
+ "field_name": self.field_name,
94
+ "row_index": self.row_index,
95
+ }
96
+
97
+ def _redact(self, value: str, show_chars: int = 4) -> str:
98
+ """Partially redact the value for display."""
99
+ if len(value) <= show_chars:
100
+ return "*" * len(value)
101
+ return value[:show_chars] + "*" * (len(value) - show_chars)
102
+
103
+
104
+ @dataclass
105
+ class PIIPattern:
106
+ """A pattern for detecting PII."""
107
+ pii_type: PIIType
108
+ severity: PIISeverity
109
+ pattern: Pattern
110
+ confidence: float = 0.85
111
+ validator: Optional[Callable[[str], bool]] = None # Additional validation
112
+ context_patterns: List[str] = field(default_factory=list) # Boost confidence if context matches
113
+
114
+
115
+ @dataclass
116
+ class PIIScanResult:
117
+ """Result of scanning content for PII."""
118
+ total_matches: int = 0
119
+ matches_by_type: Dict[str, int] = field(default_factory=dict)
120
+ matches_by_severity: Dict[str, int] = field(default_factory=dict)
121
+ matches_by_field: Dict[str, int] = field(default_factory=dict)
122
+ sample_matches: List[PIIMatch] = field(default_factory=list) # First N matches
123
+ fields_with_pii: Set[str] = field(default_factory=set)
124
+ high_risk_fields: Set[str] = field(default_factory=set) # Fields with CRITICAL/HIGH PII
125
+
126
+ def to_dict(self) -> Dict[str, Any]:
127
+ return {
128
+ "total_matches": self.total_matches,
129
+ "matches_by_type": self.matches_by_type,
130
+ "matches_by_severity": self.matches_by_severity,
131
+ "matches_by_field": self.matches_by_field,
132
+ "fields_with_pii": list(self.fields_with_pii),
133
+ "high_risk_fields": list(self.high_risk_fields),
134
+ "sample_matches": [m.to_dict() for m in self.sample_matches[:10]],
135
+ }
136
+
137
+ def has_critical_pii(self) -> bool:
138
+ """Check if any critical PII was found."""
139
+ return self.matches_by_severity.get("critical", 0) > 0
140
+
141
+ def has_high_risk_pii(self) -> bool:
142
+ """Check if any high-risk PII was found."""
143
+ return (
144
+ self.matches_by_severity.get("critical", 0) > 0 or
145
+ self.matches_by_severity.get("high", 0) > 0
146
+ )
147
+
148
+ @property
149
+ def summary(self) -> str:
150
+ """Human-readable summary."""
151
+ if self.total_matches == 0:
152
+ return "No PII detected"
153
+
154
+ lines = [f"Found {self.total_matches} PII instance(s):"]
155
+ for sev in ["critical", "high", "medium", "low"]:
156
+ count = self.matches_by_severity.get(sev, 0)
157
+ if count > 0:
158
+ lines.append(f" • {sev.upper()}: {count}")
159
+
160
+ if self.high_risk_fields:
161
+ lines.append(f" ⚠ High-risk fields: {', '.join(self.high_risk_fields)}")
162
+
163
+ return "\n".join(lines)
164
+
165
+
166
+ # ═══════════════════════════════════════════════════════════════════════════════
167
+ # VALIDATION FUNCTIONS
168
+ # ═══════════════════════════════════════════════════════════════════════════════
169
+
170
+ def validate_luhn(card_number: str) -> bool:
171
+ """
172
+ Validate credit card using Luhn algorithm.
173
+
174
+ Used by Visa, MasterCard, American Express, etc.
175
+ """
176
+ digits = [int(d) for d in re.sub(r'\D', '', card_number)]
177
+ if len(digits) < 13 or len(digits) > 19:
178
+ return False
179
+
180
+ # Luhn checksum
181
+ checksum = 0
182
+ for i, digit in enumerate(reversed(digits)):
183
+ if i % 2 == 1:
184
+ digit *= 2
185
+ if digit > 9:
186
+ digit -= 9
187
+ checksum += digit
188
+
189
+ return checksum % 10 == 0
190
+
191
+
192
+ def validate_ssn(ssn: str) -> bool:
193
+ """
194
+ Validate US Social Security Number format.
195
+
196
+ SSN format: AAA-BB-CCCC
197
+ - AAA: Area number (001-899, excluding 666)
198
+ - BB: Group number (01-99)
199
+ - CCCC: Serial number (0001-9999)
200
+ """
201
+ clean = re.sub(r'\D', '', ssn)
202
+ if len(clean) != 9:
203
+ return False
204
+
205
+ area = int(clean[:3])
206
+ group = int(clean[3:5])
207
+ serial = int(clean[5:])
208
+
209
+ # Invalid patterns
210
+ if area == 0 or area == 666 or area >= 900:
211
+ return False
212
+ if group == 0:
213
+ return False
214
+ if serial == 0:
215
+ return False
216
+
217
+ # Known invalid SSNs (advertising, testing)
218
+ invalid_ssns = {
219
+ "078051120", # Woolworth promotional
220
+ "219099999", # Advertising
221
+ }
222
+ if clean in invalid_ssns:
223
+ return False
224
+
225
+ return True
226
+
227
+
228
+ def validate_iban(iban: str) -> bool:
229
+ """
230
+ Validate IBAN using MOD-97 checksum.
231
+ """
232
+ clean = re.sub(r'\s', '', iban).upper()
233
+ if len(clean) < 15 or len(clean) > 34:
234
+ return False
235
+
236
+ # Move country code and check digits to end
237
+ rearranged = clean[4:] + clean[:4]
238
+
239
+ # Convert letters to numbers (A=10, B=11, etc.)
240
+ numeric = ""
241
+ for char in rearranged:
242
+ if char.isdigit():
243
+ numeric += char
244
+ else:
245
+ numeric += str(ord(char) - ord('A') + 10)
246
+
247
+ # MOD 97 check
248
+ return int(numeric) % 97 == 1
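A few worked checks for the validators above; the values are well-known public test numbers (the Visa test card and the standard IBAN example), not real accounts.

    # Quick sanity checks for the checksum validators (test values only).
    assert validate_luhn("4111 1111 1111 1111") is True       # classic Visa test number
    assert validate_luhn("4111 1111 1111 1112") is False      # last digit breaks the checksum

    assert validate_ssn("078-05-1120") is False               # known promotional SSN, rejected
    assert validate_ssn("000-12-3456") is False               # area 000 is never issued

    assert validate_iban("GB82 WEST 1234 5698 7654 32") is True   # widely cited example IBAN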
249
+
250
+
251
+ # ═══════════════════════════════════════════════════════════════════════════════
252
+ # PII PATTERNS (Based on Microsoft Presidio)
253
+ # ═══════════════════════════════════════════════════════════════════════════════
254
+
255
+ PII_PATTERNS: List[PIIPattern] = [
256
+ # Email - RFC 5322 simplified
257
+ PIIPattern(
258
+ pii_type=PIIType.EMAIL,
259
+ severity=PIISeverity.HIGH,
260
+ pattern=re.compile(
261
+ r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
262
+ re.IGNORECASE
263
+ ),
264
+ confidence=0.95,
265
+ context_patterns=["email", "e-mail", "contact", "mail"],
266
+ ),
267
+
268
+ # Phone Number - International formats
269
+ PIIPattern(
270
+ pii_type=PIIType.PHONE_NUMBER,
271
+ severity=PIISeverity.MEDIUM,
272
+ pattern=re.compile(
273
+ r'''
274
+ (?:
275
+ \+?1?[-.\s]? # Country code
276
+ \(?[2-9]\d{2}\)?[-.\s]? # Area code
277
+ [2-9]\d{2}[-.\s]? # Exchange
278
+ \d{4} # Subscriber
279
+ |
280
+ \+?\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]? # International
281
+ \d{1,4}[-.\s]?\d{1,9}
282
+ )
283
+ ''',
284
+ re.VERBOSE
285
+ ),
286
+ confidence=0.75,
287
+ context_patterns=["phone", "tel", "mobile", "cell", "call", "fax"],
288
+ ),
289
+
290
+ # SSN - US Social Security Number
291
+ PIIPattern(
292
+ pii_type=PIIType.SSN,
293
+ severity=PIISeverity.CRITICAL,
294
+ pattern=re.compile(
295
+ r'\b(?!000|666|9\d{2})\d{3}[-\s]?(?!00)\d{2}[-\s]?(?!0000)\d{4}\b'
296
+ ),
297
+ confidence=0.85,
298
+ validator=validate_ssn,
299
+ context_patterns=["ssn", "social security", "tax id", "taxpayer"],
300
+ ),
301
+
302
+ # Credit Card - Major card formats
303
+ PIIPattern(
304
+ pii_type=PIIType.CREDIT_CARD,
305
+ severity=PIISeverity.CRITICAL,
306
+ pattern=re.compile(
307
+ r'''
308
+ \b(?:
309
+ 4[0-9]{12}(?:[0-9]{3})? # Visa
310
+ |
311
+ 5[1-5][0-9]{14} # MasterCard
312
+ |
313
+ 3[47][0-9]{13} # American Express
314
+ |
315
+ 6(?:011|5[0-9]{2})[0-9]{12} # Discover
316
+ |
317
+ (?:2131|1800|35\d{3})\d{11} # JCB
318
+ )\b
319
+ |
320
+ \b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b # Spaced format
321
+ ''',
322
+ re.VERBOSE
323
+ ),
324
+ confidence=0.90,
325
+ validator=validate_luhn,
326
+ context_patterns=["card", "credit", "visa", "mastercard", "amex", "payment"],
327
+ ),
328
+
329
+ # IP Address - IPv4
330
+ PIIPattern(
331
+ pii_type=PIIType.IP_ADDRESS,
332
+ severity=PIISeverity.MEDIUM,
333
+ pattern=re.compile(
334
+ r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
335
+ ),
336
+ confidence=0.90,
337
+ context_patterns=["ip", "address", "server", "host", "client"],
338
+ ),
339
+
340
+ # IP Address - IPv6
341
+ PIIPattern(
342
+ pii_type=PIIType.IP_ADDRESS,
343
+ severity=PIISeverity.MEDIUM,
344
+ pattern=re.compile(
345
+ r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b'
346
+ ),
347
+ confidence=0.90,
348
+ ),
349
+
350
+ # MAC Address
351
+ PIIPattern(
352
+ pii_type=PIIType.MAC_ADDRESS,
353
+ severity=PIISeverity.LOW,
354
+ pattern=re.compile(
355
+ r'\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b'
356
+ ),
357
+ confidence=0.95,
358
+ ),
359
+
360
+ # IBAN - International Bank Account Number
361
+ PIIPattern(
362
+ pii_type=PIIType.IBAN,
363
+ severity=PIISeverity.CRITICAL,
364
+ pattern=re.compile(
365
+ r'\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}(?:[A-Z0-9]?){0,16}\b',
366
+ re.IGNORECASE
367
+ ),
368
+ confidence=0.85,
369
+ validator=validate_iban,
370
+ context_patterns=["iban", "bank", "account", "transfer"],
371
+ ),
372
+
373
+ # API Key patterns
374
+ PIIPattern(
375
+ pii_type=PIIType.API_KEY,
376
+ severity=PIISeverity.CRITICAL,
377
+ pattern=re.compile(
378
+ r'''
379
+ (?:
380
+ sk[-_]live[-_][a-zA-Z0-9]{24,} # Stripe
381
+ |
382
+ sk[-_]test[-_][a-zA-Z0-9]{24,} # Stripe test
383
+ |
384
+ pk[-_]live[-_][a-zA-Z0-9]{24,} # Stripe public
385
+ |
386
+ ghp_[a-zA-Z0-9]{36} # GitHub PAT
387
+ |
388
+ gho_[a-zA-Z0-9]{36} # GitHub OAuth
389
+ |
390
+ github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59} # GitHub fine-grained
391
+ |
392
+ xox[baprs]-[a-zA-Z0-9-]{10,} # Slack
393
+ |
394
+ ya29\.[a-zA-Z0-9_-]+ # Google OAuth
395
+ )
396
+ ''',
397
+ re.VERBOSE
398
+ ),
399
+ confidence=0.95,
400
+ context_patterns=["api", "key", "token", "secret", "auth"],
401
+ ),
402
+
403
+ # AWS Access Key
404
+ PIIPattern(
405
+ pii_type=PIIType.AWS_KEY,
406
+ severity=PIISeverity.CRITICAL,
407
+ pattern=re.compile(
408
+ r'\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b'
409
+ ),
410
+ confidence=0.95,
411
+ context_patterns=["aws", "amazon", "key", "access"],
412
+ ),
413
+
414
+ # Crypto Wallet - Bitcoin
415
+ PIIPattern(
416
+ pii_type=PIIType.CRYPTO_WALLET,
417
+ severity=PIISeverity.HIGH,
418
+ pattern=re.compile(
419
+ r'\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b'
420
+ ),
421
+ confidence=0.80,
422
+ context_patterns=["bitcoin", "btc", "wallet", "crypto"],
423
+ ),
424
+
425
+ # Crypto Wallet - Ethereum
426
+ PIIPattern(
427
+ pii_type=PIIType.CRYPTO_WALLET,
428
+ severity=PIISeverity.HIGH,
429
+ pattern=re.compile(
430
+ r'\b0x[a-fA-F0-9]{40}\b'
431
+ ),
432
+ confidence=0.80,
433
+ context_patterns=["ethereum", "eth", "wallet", "crypto"],
434
+ ),
435
+
436
+ # GPS Coordinates
437
+ PIIPattern(
438
+ pii_type=PIIType.GPS_COORDINATES,
439
+ severity=PIISeverity.MEDIUM,
440
+ pattern=re.compile(
441
+ r'[-+]?(?:[1-8]?\d(?:\.\d+)?|90(?:\.0+)?)\s*,\s*[-+]?(?:180(?:\.0+)?|(?:(?:1[0-7]\d)|(?:[1-9]?\d))(?:\.\d+)?)'
442
+ ),
443
+ confidence=0.70,
444
+ context_patterns=["location", "coordinates", "lat", "lng", "gps"],
445
+ ),
446
+
447
+ # Date of Birth patterns
448
+ PIIPattern(
449
+ pii_type=PIIType.DATE_OF_BIRTH,
450
+ severity=PIISeverity.MEDIUM,
451
+ pattern=re.compile(
452
+ r'\b(?:0?[1-9]|1[0-2])[/\-.](?:0?[1-9]|[12]\d|3[01])[/\-.](?:19|20)\d{2}\b'
453
+ ),
454
+ confidence=0.60, # Low base - needs context
455
+ context_patterns=["birth", "dob", "born", "birthday", "date of birth"],
456
+ ),
457
+
458
+ # US ZIP Code
459
+ PIIPattern(
460
+ pii_type=PIIType.ZIPCODE,
461
+ severity=PIISeverity.LOW,
462
+ pattern=re.compile(
463
+ r'\b\d{5}(?:-\d{4})?\b'
464
+ ),
465
+ confidence=0.50, # Low - needs context
466
+ context_patterns=["zip", "postal", "address", "code"],
467
+ ),
468
+
469
+ # URL (can contain sensitive info in path/query)
470
+ PIIPattern(
471
+ pii_type=PIIType.URL,
472
+ severity=PIISeverity.LOW,
473
+ pattern=re.compile(
474
+ r'https?://[^\s<>"{}|\\^`\[\]]+',
475
+ re.IGNORECASE
476
+ ),
477
+ confidence=0.70,
478
+ ),
479
+ ]
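A sketch of extending the default pattern set with a custom recognizer; the employee-ID format is invented purely to show the PIIPattern fields in use, and the PIIScanner it feeds is defined just below.

    # Hypothetical custom pattern: internal employee IDs of the form EMP-123456.
    EMPLOYEE_ID_PATTERN = PIIPattern(
        pii_type=PIIType.USERNAME,              # reusing an existing type for the sketch
        severity=PIISeverity.MEDIUM,
        pattern=re.compile(r"\bEMP-\d{6}\b"),
        confidence=0.8,
        context_patterns=["employee", "staff", "badge"],
    )

    scanner = PIIScanner(patterns=PII_PATTERNS + [EMPLOYEE_ID_PATTERN])
    for m in scanner.scan_text("Badge holder EMP-204981 reported the incident."):
        print(m.pii_type.value, m.severity.value, round(m.confidence, 2))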
480
+
481
+
482
+ class PIIScanner:
483
+ """
484
+ Scanner for detecting PII in text and datasets.
485
+
486
+ Uses regex patterns with optional validation and context boosting.
487
+ """
488
+
489
+ def __init__(
490
+ self,
491
+ patterns: List[PIIPattern] = None,
492
+ min_confidence: float = 0.5,
493
+ context_boost: float = 0.1,
494
+ ):
495
+ """
496
+ Initialize scanner.
497
+
498
+ Args:
499
+ patterns: Custom patterns (defaults to PII_PATTERNS)
500
+ min_confidence: Minimum confidence to report (0.0-1.0)
501
+ context_boost: Confidence boost when context matches
502
+ """
503
+ self.patterns = patterns or PII_PATTERNS
504
+ self.min_confidence = min_confidence
505
+ self.context_boost = context_boost
506
+
507
+ def scan_text(
508
+ self,
509
+ text: str,
510
+ field_name: str = "",
511
+ row_index: int = -1,
512
+ ) -> List[PIIMatch]:
513
+ """
514
+ Scan text for PII.
515
+
516
+ Args:
517
+ text: Text to scan
518
+ field_name: Optional field name for tracking
519
+ row_index: Optional row index for tracking
520
+
521
+ Returns:
522
+ List of PIIMatch objects
523
+ """
524
+ if not text or not isinstance(text, str):
525
+ return []
526
+
527
+ matches = []
528
+ text_lower = text.lower()
529
+
530
+ for pattern in self.patterns:
531
+ for match in pattern.pattern.finditer(text):
532
+ value = match.group()
533
+ confidence = pattern.confidence
534
+
535
+ # Validate if validator provided
536
+ if pattern.validator:
537
+ if not pattern.validator(value):
538
+ continue
539
+
540
+ # Context boost
541
+ if pattern.context_patterns:
542
+ for ctx in pattern.context_patterns:
543
+ if ctx in text_lower:
544
+ confidence = min(1.0, confidence + self.context_boost)
545
+ break
546
+
547
+ # Apply minimum confidence filter
548
+ if confidence >= self.min_confidence:
549
+ # Get surrounding context (50 chars each side)
550
+ start = max(0, match.start() - 50)
551
+ end = min(len(text), match.end() + 50)
552
+ context = text[start:end]
553
+
554
+ matches.append(PIIMatch(
555
+ pii_type=pattern.pii_type,
556
+ severity=pattern.severity,
557
+ value=value,
558
+ start=match.start(),
559
+ end=match.end(),
560
+ confidence=confidence,
561
+ context=context,
562
+ field_name=field_name,
563
+ row_index=row_index,
564
+ ))
565
+
566
+ return matches
567
+
568
+ def scan_dict(
569
+ self,
570
+ data: Dict[str, List[Any]],
571
+ sample_size: int = 1000,
572
+ ) -> PIIScanResult:
573
+ """
574
+ Scan a columnar dict for PII.
575
+
576
+ Args:
577
+ data: Dict of column_name -> values
578
+ sample_size: Max rows to scan per column
579
+
580
+ Returns:
581
+ PIIScanResult with aggregated findings
582
+ """
583
+ result = PIIScanResult()
584
+
585
+ for field_name, values in data.items():
586
+ if not values:
587
+ continue
588
+
589
+ # Sample values
590
+ sample = values[:sample_size]
591
+
592
+ for row_idx, value in enumerate(sample):
593
+ if not isinstance(value, str):
594
+ value = str(value) if value is not None else ""
595
+
596
+ matches = self.scan_text(value, field_name, row_idx)
597
+
598
+ for match in matches:
599
+ result.total_matches += 1
600
+
601
+ # Count by type
602
+ type_name = match.pii_type.value
603
+ result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1
604
+
605
+ # Count by severity
606
+ sev = match.severity.value
607
+ result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1
608
+
609
+ # Count by field
610
+ result.matches_by_field[field_name] = result.matches_by_field.get(field_name, 0) + 1
611
+
612
+ # Track fields
613
+ result.fields_with_pii.add(field_name)
614
+ if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
615
+ result.high_risk_fields.add(field_name)
616
+
617
+ # Keep samples
618
+ if len(result.sample_matches) < 100:
619
+ result.sample_matches.append(match)
620
+
621
+ return result
622
+
623
+ def scan_dataset(
624
+ self,
625
+ dataset,
626
+ sample_size: int = 1000,
627
+ ) -> PIIScanResult:
628
+ """
629
+ Scan a HuggingFace Dataset or DatasetDict for PII.
630
+
631
+ Args:
632
+ dataset: HuggingFace Dataset or DatasetDict
633
+ sample_size: Max rows to scan
634
+
635
+ Returns:
636
+ PIIScanResult with aggregated findings
637
+ """
638
+ # Handle DatasetDict (multiple splits)
639
+ if hasattr(dataset, 'keys') and callable(dataset.keys):
640
+ combined = PIIScanResult()
641
+ for split_name in dataset.keys():
642
+ split_result = self.scan_dataset(dataset[split_name], sample_size)
643
+ # Merge results
644
+ combined.total_matches += split_result.total_matches
645
+ for k, v in split_result.matches_by_type.items():
646
+ combined.matches_by_type[k] = combined.matches_by_type.get(k, 0) + v
647
+ for k, v in split_result.matches_by_severity.items():
648
+ combined.matches_by_severity[k] = combined.matches_by_severity.get(k, 0) + v
649
+ for k, v in split_result.matches_by_field.items():
650
+ combined.matches_by_field[k] = combined.matches_by_field.get(k, 0) + v
651
+ combined.fields_with_pii.update(split_result.fields_with_pii)
652
+ combined.high_risk_fields.update(split_result.high_risk_fields)
653
+ combined.sample_matches.extend(split_result.sample_matches[:20])
654
+ return combined
655
+
656
+ # Single Dataset
657
+ result = PIIScanResult()
658
+
659
+ # Get column names
660
+ if hasattr(dataset, 'features'):
661
+ columns = list(dataset.features.keys())
662
+ elif hasattr(dataset, 'column_names'):
663
+ columns = dataset.column_names
664
+ else:
665
+ return result
666
+
667
+ # Sample rows
668
+ num_rows = len(dataset) if hasattr(dataset, '__len__') else sample_size
669
+ sample_indices = range(min(sample_size, num_rows))
670
+
671
+ for idx in sample_indices:
672
+ row = dataset[idx]
673
+ for col in columns:
674
+ value = row.get(col) if isinstance(row, dict) else getattr(row, col, None)
675
+ if not isinstance(value, str):
676
+ value = str(value) if value is not None else ""
677
+
678
+ matches = self.scan_text(value, col, idx)
679
+
680
+ for match in matches:
681
+ result.total_matches += 1
682
+
683
+ type_name = match.pii_type.value
684
+ result.matches_by_type[type_name] = result.matches_by_type.get(type_name, 0) + 1
685
+
686
+ sev = match.severity.value
687
+ result.matches_by_severity[sev] = result.matches_by_severity.get(sev, 0) + 1
688
+
689
+ result.matches_by_field[col] = result.matches_by_field.get(col, 0) + 1
690
+
691
+ result.fields_with_pii.add(col)
692
+ if match.severity in [PIISeverity.CRITICAL, PIISeverity.HIGH]:
693
+ result.high_risk_fields.add(col)
694
+
695
+ if len(result.sample_matches) < 100:
696
+ result.sample_matches.append(match)
697
+
698
+ return result
699
+
700
+
701
+ # Singleton scanner
702
+ _scanner = PIIScanner()
703
+
704
+
705
+ def scan_for_pii(
706
+ data,
707
+ sample_size: int = 1000,
708
+ min_confidence: float = 0.5,
709
+ ) -> PIIScanResult:
710
+ """
711
+ Convenience function to scan data for PII.
712
+
713
+ Args:
714
+ data: Text, dict, or HuggingFace Dataset
715
+ sample_size: Max rows to scan
716
+ min_confidence: Minimum confidence threshold
717
+
718
+ Returns:
719
+ PIIScanResult with findings
720
+ """
721
+ scanner = PIIScanner(min_confidence=min_confidence)
722
+
723
+ if isinstance(data, str):
724
+ matches = scanner.scan_text(data)
725
+ result = PIIScanResult(
726
+ total_matches=len(matches),
727
+ sample_matches=matches,
728
+ )
729
+ for m in matches:
730
+ result.matches_by_type[m.pii_type.value] = result.matches_by_type.get(m.pii_type.value, 0) + 1
731
+ result.matches_by_severity[m.severity.value] = result.matches_by_severity.get(m.severity.value, 0) + 1
732
+ return result
733
+
734
+ if isinstance(data, dict):
735
+ return scanner.scan_dict(data, sample_size)
736
+
737
+ # Assume HuggingFace Dataset
738
+ return scanner.scan_dataset(data, sample_size)
739
+
740
+
741
+ def quick_pii_check(data, sample_size: int = 100) -> bool:
742
+ """
743
+ Quick check if data contains any PII.
744
+
745
+ Returns True if PII is found, False otherwise.
746
+ """
747
+ result = scan_for_pii(data, sample_size=sample_size, min_confidence=0.7)
748
+ return result.total_matches > 0
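A minimal usage sketch of the scanner above (assuming the module is importable as cascade.data.pii; whether a given value actually matches depends on the PIIPattern regexes defined earlier in this file):

    from cascade.data.pii import scan_for_pii, quick_pii_check

    columns = {
        "name": ["Alice Smith", "Bob Jones"],
        "contact": ["alice@example.com", "555-867-5309"],
    }

    # Columnar dict -> aggregated PIIScanResult
    result = scan_for_pii(columns, sample_size=100, min_confidence=0.5)
    print(result.total_matches)             # total hits across sampled rows
    print(result.matches_by_field)          # per-column counts, e.g. {"contact": ...}
    print(sorted(result.high_risk_fields))  # columns with CRITICAL/HIGH hits

    # Fast boolean gate before publishing a dataset (stricter confidence)
    if quick_pii_check(columns, sample_size=100):
        print("PII detected - review before sharing")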
cascade/data/provenance.py ADDED
@@ -0,0 +1,503 @@
1
+ """
2
+ Provenance Graph
3
+
4
+ Tracks entities, activities, agents, and their relationships.
5
+ Supports Merkle tree hashing for tamper-evident lineage.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Dict, List, Optional, Set, Tuple, Iterator
13
+
14
+ from .entities import (
15
+ DatasetEntity, Activity, Agent, Relationship, RelationType,
16
+ ActivityType, AgentType, create_system_agent
17
+ )
18
+
19
+
20
+ @dataclass
21
+ class ProvenanceNode:
22
+ """A node in the provenance graph with hash chain."""
23
+ node_id: str
24
+ node_type: str # entity, activity, agent
25
+ data: Dict[str, Any]
26
+
27
+ # Hash chain
28
+ node_hash: str = ""
29
+ parent_hashes: List[str] = field(default_factory=list)
30
+
31
+ def __post_init__(self):
32
+ if not self.node_hash:
33
+ self.node_hash = self._compute_hash()
34
+
35
+ def _compute_hash(self) -> str:
36
+ """Compute hash including parent hashes (Merkle-style)."""
37
+ content = json.dumps({
38
+ "id": self.node_id,
39
+ "type": self.node_type,
40
+ "data": self.data,
41
+ "parents": sorted(self.parent_hashes),
42
+ }, sort_keys=True, default=str)
43
+ return hashlib.sha256(content.encode()).hexdigest()
44
+
45
+
46
+ class ProvenanceGraph:
47
+ """
48
+ A graph of provenance relationships.
49
+
50
+ Tracks:
51
+ - Entities (datasets, versions, splits)
52
+ - Activities (transforms, training, inference)
53
+ - Agents (users, models, pipelines)
54
+ - Relationships between them
55
+
56
+ Provides:
57
+ - Lineage queries (what produced this? what did this produce?)
58
+ - Hash chain for integrity verification
59
+ - Export to PROV-O and Croissant formats
60
+ """
61
+
62
+ def __init__(self, name: str = "default"):
63
+ self.name = name
64
+ self.created_at = time.time()
65
+
66
+ # Storage
67
+ self._entities: Dict[str, DatasetEntity] = {}
68
+ self._activities: Dict[str, Activity] = {}
69
+ self._agents: Dict[str, Agent] = {}
70
+ self._relationships: List[Relationship] = []
71
+
72
+ # Hash chain
73
+ self._nodes: Dict[str, ProvenanceNode] = {}
74
+ self._root_hash: Optional[str] = None
75
+
76
+ # Default system agent
77
+ self._system_agent = create_system_agent("cascade-data-observatory")
78
+ self.add_agent(self._system_agent)
79
+
80
+ # ═══════════════════════════════════════════════════════════════════════════
81
+ # ENTITY MANAGEMENT
82
+ # ═══════════════════════════════════════════════════════════════════════════
83
+
84
+ def add_entity(self, entity: DatasetEntity) -> str:
85
+ """Add a dataset entity to the graph."""
86
+ self._entities[entity.id] = entity
87
+
88
+ # Create provenance node
89
+ node = ProvenanceNode(
90
+ node_id=entity.id,
91
+ node_type="entity",
92
+ data=entity.to_dict(),
93
+ )
94
+ self._nodes[entity.id] = node
95
+ self._update_root_hash()
96
+
97
+ return entity.id
98
+
99
+ def get_entity(self, entity_id: str) -> Optional[DatasetEntity]:
100
+ """Get entity by ID."""
101
+ return self._entities.get(entity_id)
102
+
103
+ def list_entities(self) -> List[DatasetEntity]:
104
+ """List all entities."""
105
+ return list(self._entities.values())
106
+
107
+ # ═══════════════════════════════════════════════════════════════════════════
108
+ # ACTIVITY MANAGEMENT
109
+ # ═══════════════════════════════════════════════════════════════════════════
110
+
111
+ def add_activity(self, activity: Activity) -> str:
112
+ """Add an activity to the graph."""
113
+ self._activities[activity.id] = activity
114
+
115
+ # Link to agent
116
+ if not activity.agent_id:
117
+ activity.agent_id = self._system_agent.id
118
+
119
+ # Create provenance node with parent hashes from inputs
120
+ parent_hashes = []
121
+ for input_id in activity.inputs:
122
+ if input_id in self._nodes:
123
+ parent_hashes.append(self._nodes[input_id].node_hash)
124
+
125
+ node = ProvenanceNode(
126
+ node_id=activity.id,
127
+ node_type="activity",
128
+ data=activity.to_dict(),
129
+ parent_hashes=parent_hashes,
130
+ )
131
+ self._nodes[activity.id] = node
132
+ self._update_root_hash()
133
+
134
+ return activity.id
135
+
136
+ def get_activity(self, activity_id: str) -> Optional[Activity]:
137
+ """Get activity by ID."""
138
+ return self._activities.get(activity_id)
139
+
140
+ def list_activities(self) -> List[Activity]:
141
+ """List all activities."""
142
+ return list(self._activities.values())
143
+
144
+ # ═══════════════════════════════════════════════════════════════════════════
145
+ # AGENT MANAGEMENT
146
+ # ═══════════════════════════════════════════════════════════════════════════
147
+
148
+ def add_agent(self, agent: Agent) -> str:
149
+ """Add an agent to the graph."""
150
+ self._agents[agent.id] = agent
151
+
152
+ node = ProvenanceNode(
153
+ node_id=agent.id,
154
+ node_type="agent",
155
+ data=agent.to_dict(),
156
+ )
157
+ self._nodes[agent.id] = node
158
+
159
+ return agent.id
160
+
161
+ def get_agent(self, agent_id: str) -> Optional[Agent]:
162
+ """Get agent by ID."""
163
+ return self._agents.get(agent_id)
164
+
165
+ def list_agents(self) -> List[Agent]:
166
+ """List all agents."""
167
+ return list(self._agents.values())
168
+
169
+ def list_relationships(self) -> List[Relationship]:
170
+ """List all relationships."""
171
+ return list(self._relationships)
172
+
173
+ # ═══════════════════════════════════════════════════════════════════════════
174
+ # RELATIONSHIP MANAGEMENT
175
+ # ═══════════════════════════════════════════════════════════════════════════
176
+
177
+ def add_relationship(
178
+ self,
179
+ relation_type: RelationType,
180
+ source_id: str,
181
+ target_id: str,
182
+ attributes: Dict[str, Any] = None,
183
+ timestamp: float = None,
184
+ ) -> Relationship:
185
+ """Add a relationship between nodes."""
186
+ rel = Relationship(
187
+ relation_type=relation_type,
188
+ source_id=source_id,
189
+ target_id=target_id,
190
+ timestamp=timestamp if timestamp is not None else time.time(),
191
+ attributes=attributes or {},
192
+ )
193
+ self._relationships.append(rel)
194
+ return rel
195
+
196
+ def link_derivation(self, derived_id: str, source_id: str) -> Relationship:
197
+ """Record that derived entity came from source entity."""
198
+ return self.add_relationship(
199
+ RelationType.WAS_DERIVED_FROM,
200
+ source_id=derived_id,
201
+ target_id=source_id,
202
+ )
203
+
204
+ def link_generation(self, entity_id: str, activity_id: str) -> Relationship:
205
+ """Record that entity was generated by activity."""
206
+ return self.add_relationship(
207
+ RelationType.WAS_GENERATED_BY,
208
+ source_id=entity_id,
209
+ target_id=activity_id,
210
+ )
211
+
212
+ def link_usage(self, activity_id: str, entity_id: str) -> Relationship:
213
+ """Record that activity used entity as input."""
214
+ return self.add_relationship(
215
+ RelationType.USED,
216
+ source_id=activity_id,
217
+ target_id=entity_id,
218
+ )
219
+
220
+ def link_attribution(self, entity_id: str, agent_id: str) -> Relationship:
221
+ """Record that entity was attributed to agent."""
222
+ return self.add_relationship(
223
+ RelationType.WAS_ATTRIBUTED_TO,
224
+ source_id=entity_id,
225
+ target_id=agent_id,
226
+ )
227
+
228
+ def link_association(self, activity_id: str, agent_id: str) -> Relationship:
229
+ """Record that activity was associated with agent."""
230
+ return self.add_relationship(
231
+ RelationType.WAS_ASSOCIATED_WITH,
232
+ source_id=activity_id,
233
+ target_id=agent_id,
234
+ )
235
+
236
+ # ═══════════════════════════════════════════════════════════════════════════
237
+ # LINEAGE QUERIES
238
+ # ═══════════════════════════════════════════════════════════════════════════
239
+
240
+ def get_lineage(self, entity_id: str, direction: str = "upstream") -> List[str]:
241
+ """
242
+ Get lineage for an entity.
243
+
244
+ Args:
245
+ entity_id: The entity to trace
246
+ direction: "upstream" (what produced this) or "downstream" (what this produced)
247
+
248
+ Returns:
249
+ List of entity IDs in lineage order
250
+ """
251
+ visited: Set[str] = set()
252
+ lineage: List[str] = []
253
+
254
+ def trace(current_id: str):
255
+ if current_id in visited:
256
+ return
257
+ visited.add(current_id)
258
+
259
+ for rel in self._relationships:
260
+ if direction == "upstream":
261
+ # Follow wasDerivedFrom backwards
262
+ if rel.relation_type == RelationType.WAS_DERIVED_FROM:
263
+ if rel.source_id == current_id:
264
+ lineage.append(rel.target_id)
265
+ trace(rel.target_id)
266
+ else:
267
+ # Follow wasDerivedFrom forwards
268
+ if rel.relation_type == RelationType.WAS_DERIVED_FROM:
269
+ if rel.target_id == current_id:
270
+ lineage.append(rel.source_id)
271
+ trace(rel.source_id)
272
+
273
+ trace(entity_id)
274
+ return lineage
275
+
276
+ def get_activities_for_entity(self, entity_id: str) -> List[Activity]:
277
+ """Get activities that generated or used this entity."""
278
+ activity_ids = set()
279
+
280
+ for rel in self._relationships:
281
+ if rel.relation_type == RelationType.WAS_GENERATED_BY:
282
+ if rel.source_id == entity_id:
283
+ activity_ids.add(rel.target_id)
284
+ elif rel.relation_type == RelationType.USED:
285
+ if rel.target_id == entity_id:
286
+ activity_ids.add(rel.source_id)
287
+
288
+ return [self._activities[aid] for aid in activity_ids if aid in self._activities]
289
+
290
+ def get_inputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
291
+ """Get entities that were inputs to an activity."""
292
+ entity_ids = set()
293
+
294
+ for rel in self._relationships:
295
+ if rel.relation_type == RelationType.USED:
296
+ if rel.source_id == activity_id:
297
+ entity_ids.add(rel.target_id)
298
+
299
+ return [self._entities[eid] for eid in entity_ids if eid in self._entities]
300
+
301
+ def get_outputs_for_activity(self, activity_id: str) -> List[DatasetEntity]:
302
+ """Get entities that were outputs of an activity."""
303
+ entity_ids = set()
304
+
305
+ for rel in self._relationships:
306
+ if rel.relation_type == RelationType.WAS_GENERATED_BY:
307
+ if rel.target_id == activity_id:
308
+ entity_ids.add(rel.source_id)
309
+
310
+ return [self._entities[eid] for eid in entity_ids if eid in self._entities]
311
+
312
+ # ═══════════════════════════════════════════════════════════════════════════
313
+ # HASH CHAIN
314
+ # ═══════════════════════════════════════════════════════════════════════════
315
+
316
+ def _update_root_hash(self):
317
+ """Update the Merkle root hash."""
318
+ if not self._nodes:
319
+ self._root_hash = None
320
+ return
321
+
322
+ # Compute root from all node hashes
323
+ all_hashes = sorted([n.node_hash for n in self._nodes.values()])
324
+ combined = "".join(all_hashes)
325
+ self._root_hash = hashlib.sha256(combined.encode()).hexdigest()
326
+
327
+ @property
328
+ def root_hash(self) -> Optional[str]:
329
+ """Get the current Merkle root hash."""
330
+ return self._root_hash
331
+
332
+ def verify_integrity(self) -> Tuple[bool, List[str]]:
333
+ """
334
+ Verify integrity of the provenance graph.
335
+
336
+ Returns:
337
+ (is_valid, list of invalid node IDs)
338
+ """
339
+ invalid = []
340
+
341
+ for node_id, node in self._nodes.items():
342
+ expected_hash = node._compute_hash()
343
+ if expected_hash != node.node_hash:
344
+ invalid.append(node_id)
345
+
346
+ return len(invalid) == 0, invalid
347
+
348
+ # ═══════════════════════════════════════════════════════════════════════════
349
+ # EXPORT
350
+ # ═══════════════════════════════════════════════════════════════════════════
351
+
352
+ def to_dict(self) -> Dict[str, Any]:
353
+ """Export graph to dictionary."""
354
+ return {
355
+ "name": self.name,
356
+ "created_at": self.created_at,
357
+ "root_hash": self._root_hash,
358
+ "entities": {k: v.to_dict() for k, v in self._entities.items()},
359
+ "activities": {k: v.to_dict() for k, v in self._activities.items()},
360
+ "agents": {k: v.to_dict() for k, v in self._agents.items()},
361
+ "relationships": [r.to_dict() for r in self._relationships],
362
+ }
363
+
364
+ def to_prov_n(self) -> str:
365
+ """Export as PROV-N notation."""
366
+ lines = [
367
+ f"document",
368
+ f" prefix cascade <https://cascade.ai/ns/>",
369
+ f" prefix prov <http://www.w3.org/ns/prov#>",
370
+ f"",
371
+ ]
372
+
373
+ # Entities
374
+ for entity in self._entities.values():
375
+ lines.append(f" {entity.to_prov_n()}")
376
+
377
+ lines.append("")
378
+
379
+ # Activities
380
+ for activity in self._activities.values():
381
+ lines.append(f" {activity.to_prov_n()}")
382
+
383
+ lines.append("")
384
+
385
+ # Agents
386
+ for agent in self._agents.values():
387
+ lines.append(f" {agent.to_prov_n()}")
388
+
389
+ lines.append("")
390
+
391
+ # Relationships
392
+ for rel in self._relationships:
393
+ lines.append(f" {rel.to_prov_n()}")
394
+
395
+ lines.append("")
396
+ lines.append("endDocument")
397
+
398
+ return "\n".join(lines)
399
+
400
+ def to_prov_jsonld(self) -> Dict[str, Any]:
401
+ """Export as PROV-O JSON-LD."""
402
+ return {
403
+ "@context": {
404
+ "prov": "http://www.w3.org/ns/prov#",
405
+ "cascade": "https://cascade.ai/ns/",
406
+ "xsd": "http://www.w3.org/2001/XMLSchema#",
407
+ },
408
+ "@graph": [
409
+ *[e.to_dict() for e in self._entities.values()],
410
+ *[a.to_dict() for a in self._activities.values()],
411
+ *[a.to_dict() for a in self._agents.values()],
412
+ ],
413
+ }
414
+
415
+ @classmethod
416
+ def from_dict(cls, data: Dict[str, Any]) -> "ProvenanceGraph":
417
+ """Load graph from dictionary."""
418
+ graph = cls(name=data.get("name", "default"))
419
+ graph.created_at = data.get("created_at", time.time())
420
+
421
+ # Load entities
422
+ for entity_data in data.get("entities", {}).values():
423
+ entity = DatasetEntity(
424
+ id=entity_data["@id"],
425
+ name=entity_data["name"],
426
+ content_hash=entity_data.get("content_hash"),
427
+ schema_hash=entity_data.get("schema_hash"),
428
+ version=entity_data.get("version"),
429
+ previous_version=entity_data.get("previous_version"),
430
+ source_type=entity_data.get("source_type", "unknown"),
431
+ source_uri=entity_data.get("source_uri"),
432
+ record_count=entity_data.get("record_count"),
433
+ size_bytes=entity_data.get("size_bytes"),
434
+ splits=entity_data.get("splits", {}),
435
+ attributes=entity_data.get("attributes", {}),
436
+ created_at=entity_data.get("created_at", time.time()),
437
+ )
438
+ graph.add_entity(entity)
439
+
440
+ # Load activities
441
+ for activity_data in data.get("activities", {}).values():
442
+ activity = Activity(
443
+ id=activity_data["@id"],
444
+ activity_type=ActivityType(activity_data["activity_type"]),
445
+ name=activity_data["name"],
446
+ started_at=activity_data.get("started_at"),
447
+ ended_at=activity_data.get("ended_at"),
448
+ inputs=activity_data.get("inputs", []),
449
+ outputs=activity_data.get("outputs", []),
450
+ agent_id=activity_data.get("agent_id"),
451
+ parameters=activity_data.get("parameters", {}),
452
+ attributes=activity_data.get("attributes", {}),
453
+ )
454
+ graph.add_activity(activity)
455
+
456
+ # Load agents
457
+ for agent_data in data.get("agents", {}).values():
458
+ agent = Agent(
459
+ id=agent_data["@id"],
460
+ agent_type=AgentType(agent_data["agent_type"]),
461
+ name=agent_data["name"],
462
+ version=agent_data.get("version"),
463
+ parent_agent_id=agent_data.get("parent_agent_id"),
464
+ identifier=agent_data.get("identifier"),
465
+ attributes=agent_data.get("attributes", {}),
466
+ created_at=agent_data.get("created_at", time.time()),
467
+ )
468
+ graph.add_agent(agent)
469
+
470
+ # Load relationships
471
+ for rel_data in data.get("relationships", []):
472
+ graph.add_relationship(
473
+ relation_type=RelationType(rel_data["type"]),
474
+ source_id=rel_data["source"],
475
+ target_id=rel_data["target"],
476
+ attributes=rel_data.get("attributes", {}),
477
+ timestamp=rel_data.get("timestamp"),
478
+ )
479
+
480
+ return graph
481
+
482
+ # ═══════════════════════════════════════════════════════════════════════════
483
+ # STATISTICS
484
+ # ═══════════════════════════════════════════════════════════════════════════
485
+
486
+ @property
487
+ def stats(self) -> Dict[str, int]:
488
+ """Get graph statistics."""
489
+ return {
490
+ "entities": len(self._entities),
491
+ "activities": len(self._activities),
492
+ "agents": len(self._agents),
493
+ "relationships": len(self._relationships),
494
+ }
495
+
496
+ def __repr__(self) -> str:
497
+ stats = self.stats
498
+ return (
499
+ f"ProvenanceGraph(name='{self.name}', "
500
+ f"entities={stats['entities']}, "
501
+ f"activities={stats['activities']}, "
502
+ f"relationships={stats['relationships']})"
503
+ )
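An illustrative sketch of the ProvenanceGraph API above. The DatasetEntity/Activity constructors live in cascade/data/entities.py (not shown here), so the keyword arguments and the ActivityType.TRANSFORM member name are assumptions inferred from from_dict() above:

    from cascade.data.provenance import ProvenanceGraph
    from cascade.data.entities import DatasetEntity, Activity, ActivityType

    graph = ProvenanceGraph(name="imdb-pipeline")

    # Two dataset versions (remaining DatasetEntity fields assumed to default)
    raw = DatasetEntity(id="ds:imdb:raw", name="imdb-raw")
    clean = DatasetEntity(id="ds:imdb:clean", name="imdb-clean")
    graph.add_entity(raw)
    graph.add_entity(clean)

    # The activity that produced the clean version
    dedup = Activity(
        id="act:dedup-001",
        activity_type=ActivityType.TRANSFORM,  # assumed enum member name
        name="deduplicate",
        inputs=["ds:imdb:raw"],
        outputs=["ds:imdb:clean"],
    )
    graph.add_activity(dedup)

    # PROV-O style links
    graph.link_usage("act:dedup-001", "ds:imdb:raw")
    graph.link_generation("ds:imdb:clean", "act:dedup-001")
    graph.link_derivation("ds:imdb:clean", "ds:imdb:raw")

    print(graph.get_lineage("ds:imdb:clean"))   # -> ["ds:imdb:raw"]
    print(graph.root_hash)                      # Merkle root over all nodes
    is_valid, bad_nodes = graph.verify_integrity()
    print(graph.to_prov_n())                    # PROV-N document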
cascade/data/schema.py ADDED
@@ -0,0 +1,417 @@
1
+ """
2
+ Schema Observer
3
+
4
+ Observes and hashes dataset schemas/features.
5
+ Works with HuggingFace datasets Features, Pandas DataFrames, and raw dicts.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+
14
+ @dataclass
15
+ class FieldSchema:
16
+ """Schema for a single field/column."""
17
+ name: str
18
+ dtype: str # Normalized type name
19
+
20
+ # Type details
21
+ nullable: bool = True
22
+ is_list: bool = False
23
+ list_inner_type: Optional[str] = None
24
+
25
+ # For ClassLabel
26
+ is_categorical: bool = False
27
+ categories: Optional[List[str]] = None
28
+ num_categories: Optional[int] = None
29
+
30
+ # For nested structures
31
+ nested_fields: Optional[Dict[str, "FieldSchema"]] = None
32
+
33
+ # For arrays/tensors
34
+ shape: Optional[tuple] = None
35
+
36
+ # Constraints
37
+ min_value: Optional[float] = None
38
+ max_value: Optional[float] = None
39
+ pattern: Optional[str] = None # Regex for strings
40
+
41
+ # Metadata
42
+ description: Optional[str] = None
43
+
44
+ def to_dict(self) -> Dict[str, Any]:
45
+ result = {
46
+ "name": self.name,
47
+ "dtype": self.dtype,
48
+ "nullable": self.nullable,
49
+ }
50
+ if self.is_list:
51
+ result["is_list"] = True
52
+ result["list_inner_type"] = self.list_inner_type
53
+ if self.is_categorical:
54
+ result["is_categorical"] = True
55
+ result["categories"] = self.categories
56
+ result["num_categories"] = self.num_categories
57
+ if self.nested_fields:
58
+ result["nested_fields"] = {
59
+ k: v.to_dict() for k, v in self.nested_fields.items()
60
+ }
61
+ if self.shape:
62
+ result["shape"] = self.shape
63
+ if self.description:
64
+ result["description"] = self.description
65
+ return result
66
+
67
+ def hash(self) -> str:
68
+ """Hash this field's structure."""
69
+ content = json.dumps(self.to_dict(), sort_keys=True)
70
+ return hashlib.sha256(content.encode()).hexdigest()[:16]
71
+
72
+
73
+ @dataclass
74
+ class DatasetSchema:
75
+ """Complete schema for a dataset."""
76
+ fields: Dict[str, FieldSchema] = field(default_factory=dict)
77
+
78
+ # Dataset-level metadata
79
+ primary_key: Optional[List[str]] = None
80
+ foreign_keys: Dict[str, str] = field(default_factory=dict) # field → target
81
+
82
+ # Source info
83
+ source_format: Optional[str] = None # arrow, parquet, csv, etc.
84
+
85
+ def add_field(self, field_schema: FieldSchema):
86
+ """Add a field to the schema."""
87
+ self.fields[field_schema.name] = field_schema
88
+
89
+ @property
90
+ def field_names(self) -> List[str]:
91
+ return list(self.fields.keys())
92
+
93
+ @property
94
+ def num_fields(self) -> int:
95
+ return len(self.fields)
96
+
97
+ def to_dict(self) -> Dict[str, Any]:
98
+ return {
99
+ "fields": {k: v.to_dict() for k, v in self.fields.items()},
100
+ "primary_key": self.primary_key,
101
+ "foreign_keys": self.foreign_keys,
102
+ "source_format": self.source_format,
103
+ }
104
+
105
+ def hash(self) -> str:
106
+ """Compute schema hash - identifies structure regardless of content."""
107
+ # Sort fields for deterministic hashing
108
+ ordered_fields = sorted(self.fields.keys())
109
+ content = json.dumps({
110
+ "fields": [self.fields[k].to_dict() for k in ordered_fields],
111
+ "primary_key": self.primary_key,
112
+ }, sort_keys=True)
113
+ return hashlib.sha256(content.encode()).hexdigest()
114
+
115
+ def diff(self, other: "DatasetSchema") -> Dict[str, Any]:
116
+ """Compare two schemas and return differences."""
117
+ added = set(other.field_names) - set(self.field_names)
118
+ removed = set(self.field_names) - set(other.field_names)
119
+
120
+ modified = {}
121
+ for name in set(self.field_names) & set(other.field_names):
122
+ if self.fields[name].hash() != other.fields[name].hash():
123
+ modified[name] = {
124
+ "old": self.fields[name].to_dict(),
125
+ "new": other.fields[name].to_dict(),
126
+ }
127
+
128
+ return {
129
+ "added": list(added),
130
+ "removed": list(removed),
131
+ "modified": modified,
132
+ "compatible": len(removed) == 0 and len(modified) == 0,
133
+ }
134
+
135
+
136
+ class SchemaObserver:
137
+ """
138
+ Observes and extracts schemas from various data sources.
139
+ """
140
+
141
+ # Type mapping from various sources to normalized types
142
+ TYPE_MAP = {
143
+ # Python types
144
+ "str": "string",
145
+ "int": "int64",
146
+ "float": "float64",
147
+ "bool": "bool",
148
+ "bytes": "binary",
149
+
150
+ # NumPy types
151
+ "int8": "int8",
152
+ "int16": "int16",
153
+ "int32": "int32",
154
+ "int64": "int64",
155
+ "uint8": "uint8",
156
+ "uint16": "uint16",
157
+ "uint32": "uint32",
158
+ "uint64": "uint64",
159
+ "float16": "float16",
160
+ "float32": "float32",
161
+ "float64": "float64",
162
+
163
+ # Arrow types
164
+ "string": "string",
165
+ "large_string": "string",
166
+ "binary": "binary",
167
+ "large_binary": "binary",
168
+
169
+ # HuggingFace special types
170
+ "Image": "image",
171
+ "Audio": "audio",
172
+ "ClassLabel": "categorical",
173
+ }
174
+
175
+ def observe_hf_dataset(self, dataset) -> DatasetSchema:
176
+ """
177
+ Extract schema from HuggingFace Dataset.
178
+
179
+ Args:
180
+ dataset: A HuggingFace datasets.Dataset or DatasetDict
181
+
182
+ Returns:
183
+ DatasetSchema with all fields
184
+ """
185
+ schema = DatasetSchema(source_format="arrow")
186
+
187
+ # Get features (works for both Dataset and DatasetDict)
188
+ if hasattr(dataset, 'features'):
189
+ features = dataset.features
190
+ elif hasattr(dataset, '__iter__'):
191
+ # DatasetDict - get features from first split
192
+ first_split = next(iter(dataset.values()))
193
+ features = first_split.features
194
+ else:
195
+ raise ValueError(f"Cannot extract features from {type(dataset)}")
196
+
197
+ # Parse each feature
198
+ for name, feature in features.items():
199
+ field_schema = self._parse_hf_feature(name, feature)
200
+ schema.add_field(field_schema)
201
+
202
+ return schema
203
+
204
+ def _parse_hf_feature(self, name: str, feature) -> FieldSchema:
205
+ """Parse a HuggingFace Feature into FieldSchema."""
206
+ # Import here to avoid hard dependency
207
+ try:
208
+ from datasets import (
209
+ Value, ClassLabel, Sequence,
210
+ Array2D, Array3D, Array4D, Array5D,
211
+ Image, Audio
212
+ )
213
+ except ImportError:
214
+ # Fallback for when datasets not installed
215
+ return FieldSchema(name=name, dtype="unknown")
216
+
217
+ # Value type (primitives)
218
+ if isinstance(feature, Value):
219
+ return FieldSchema(
220
+ name=name,
221
+ dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
222
+ )
223
+
224
+ # ClassLabel (categorical)
225
+ if isinstance(feature, ClassLabel):
226
+ return FieldSchema(
227
+ name=name,
228
+ dtype="categorical",
229
+ is_categorical=True,
230
+ categories=feature.names,
231
+ num_categories=feature.num_classes,
232
+ )
233
+
234
+ # Sequence (list)
235
+ if isinstance(feature, Sequence):
236
+ inner = self._parse_hf_feature(f"{name}_inner", feature.feature)
237
+ return FieldSchema(
238
+ name=name,
239
+ dtype="list",
240
+ is_list=True,
241
+ list_inner_type=inner.dtype,
242
+ )
243
+
244
+ # Arrays
245
+ if isinstance(feature, (Array2D, Array3D, Array4D, Array5D)):
246
+ return FieldSchema(
247
+ name=name,
248
+ dtype=self.TYPE_MAP.get(feature.dtype, feature.dtype),
249
+ shape=feature.shape,
250
+ )
251
+
252
+ # Image
253
+ if isinstance(feature, Image):
254
+ return FieldSchema(
255
+ name=name,
256
+ dtype="image",
257
+ )
258
+
259
+ # Audio
260
+ if isinstance(feature, Audio):
261
+ return FieldSchema(
262
+ name=name,
263
+ dtype="audio",
264
+ )
265
+
266
+ # Dict/nested structure
267
+ if isinstance(feature, dict):
268
+ nested = {}
269
+ for k, v in feature.items():
270
+ nested[k] = self._parse_hf_feature(k, v)
271
+ return FieldSchema(
272
+ name=name,
273
+ dtype="struct",
274
+ nested_fields=nested,
275
+ )
276
+
277
+ # Fallback
278
+ return FieldSchema(
279
+ name=name,
280
+ dtype=str(type(feature).__name__),
281
+ )
282
+
283
+ def observe_pandas(self, df) -> DatasetSchema:
284
+ """
285
+ Extract schema from Pandas DataFrame.
286
+
287
+ Args:
288
+ df: A pandas DataFrame
289
+
290
+ Returns:
291
+ DatasetSchema with all fields
292
+ """
293
+ schema = DatasetSchema(source_format="pandas")
294
+
295
+ for col in df.columns:
296
+ dtype = str(df[col].dtype)
297
+ normalized = self.TYPE_MAP.get(dtype, dtype)
298
+
299
+ # Check for categorical
300
+ if dtype == "category":
301
+ schema.add_field(FieldSchema(
302
+ name=col,
303
+ dtype="categorical",
304
+ is_categorical=True,
305
+ categories=list(df[col].cat.categories),
306
+ num_categories=len(df[col].cat.categories),
307
+ ))
308
+ else:
309
+ schema.add_field(FieldSchema(
310
+ name=col,
311
+ dtype=normalized,
312
+ nullable=df[col].isna().any(),
313
+ ))
314
+
315
+ return schema
316
+
317
+ def observe_dict(self, data: Dict[str, Any], sample_size: int = 100) -> DatasetSchema:
318
+ """
319
+ Extract schema from a dict of lists (columnar format).
320
+
321
+ Args:
322
+ data: Dict mapping column names to lists of values
323
+ sample_size: Number of values to sample for type inference
324
+
325
+ Returns:
326
+ DatasetSchema with all fields
327
+ """
328
+ schema = DatasetSchema(source_format="dict")
329
+
330
+ for col, values in data.items():
331
+ if not values:
332
+ schema.add_field(FieldSchema(name=col, dtype="unknown"))
333
+ continue
334
+
335
+ # Sample values for type inference
336
+ sample = values[:sample_size]
337
+ types = set(type(v).__name__ for v in sample if v is not None)
338
+
339
+ # Determine type
340
+ if len(types) == 0:
341
+ dtype = "null"
342
+ elif len(types) == 1:
343
+ dtype = self.TYPE_MAP.get(types.pop(), "unknown")
344
+ else:
345
+ dtype = "mixed"
346
+
347
+ # Check for nulls
348
+ nullable = any(v is None for v in sample)
349
+
350
+ schema.add_field(FieldSchema(
351
+ name=col,
352
+ dtype=dtype,
353
+ nullable=nullable,
354
+ ))
355
+
356
+ return schema
357
+
358
+ def observe_arrow(self, table) -> DatasetSchema:
359
+ """
360
+ Extract schema from PyArrow Table.
361
+
362
+ Args:
363
+ table: A pyarrow.Table
364
+
365
+ Returns:
366
+ DatasetSchema with all fields
367
+ """
368
+ schema = DatasetSchema(source_format="arrow")
369
+
370
+ for arrow_field in table.schema:
371
+ dtype = str(arrow_field.type)
372
+ normalized = self.TYPE_MAP.get(dtype, dtype)
373
+
374
+ schema.add_field(FieldSchema(
375
+ name=arrow_field.name,
376
+ dtype=normalized,
377
+ nullable=arrow_field.nullable,
378
+ ))
379
+
380
+ return schema
381
+
382
+
383
+ def hash_content(data, sample_size: int = 10000) -> str:
384
+ """
385
+ Compute content hash of dataset.
386
+
387
+ For large datasets, samples rows for efficiency.
388
+ """
389
+ hasher = hashlib.sha256()
390
+
391
+ # Handle dict first (dict also has __iter__ and __len__)
392
+ if isinstance(data, dict):
393
+ content = json.dumps(data, sort_keys=True, default=str)
394
+ hasher.update(content.encode())
395
+
396
+ # Handle list
397
+ elif isinstance(data, list):
398
+ for item in data[:sample_size]:
399
+ item_str = json.dumps(item, sort_keys=True, default=str)
400
+ hasher.update(item_str.encode())
401
+
402
+ # Handle HuggingFace Dataset or other iterables with __len__
403
+ elif hasattr(data, '__iter__') and hasattr(data, '__len__'):
404
+ # Sample if large
405
+ n = len(data)
406
+ if n > sample_size:
407
+ import random
408
+ indices = sorted(random.sample(range(n), sample_size))
409
+ sample = [data[i] for i in indices]
410
+ else:
411
+ sample = list(data)
412
+
413
+ for row in sample:
414
+ row_str = json.dumps(row, sort_keys=True, default=str)
415
+ hasher.update(row_str.encode())
416
+
417
+ return hasher.hexdigest()
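A short sketch of the SchemaObserver on a plain columnar dict, the one path above with no optional dependencies (module path assumed to be cascade.data.schema):

    from cascade.data.schema import SchemaObserver, hash_content

    observer = SchemaObserver()

    v1 = observer.observe_dict({"text": ["a", "b"], "label": [0, 1]})
    v2 = observer.observe_dict({"text": ["a", "b"], "label": [0.0, 1.0], "lang": ["en", "en"]})

    print(v1.hash())            # structure-only fingerprint, independent of row content
    diff = v1.diff(v2)
    print(diff["added"])        # ["lang"]
    print(diff["modified"])     # "label" changed: int64 -> float64
    print(diff["compatible"])   # False, because an existing field changed type

    print(hash_content({"text": ["a", "b"]}))   # hash of the data itself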
cascade/demo.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ CASCADE-LATTICE Interactive Demo
3
+
4
+ Launch the LunarLander demo showcasing:
5
+ - cascade.hold: Human-in-the-loop intervention
6
+ - cascade.store: Provenance tracking
7
+ - Merkle-chained decision records
8
+
9
+ Usage:
10
+ cascade-demo # Run the demo
11
+ python -m cascade.demo # Alternative
12
+
13
+ Controls:
14
+ [H] HOLD-FREEZE - Pause time, inspect AI decision
15
+ [T] HOLD-TAKEOVER - Continue time, YOU control with WASD
16
+ [ESC] Release hold, return to AI sovereignty
17
+
18
+ In HOLD modes:
19
+ [W] Main Engine (thrust up)
20
+ [A] Left Engine (rotate)
21
+ [D] Right Engine (rotate)
22
+ [S] No-op / Accept AI decision
23
+ """
24
+
25
+ import sys
26
+ import subprocess
27
+ from pathlib import Path
28
+
29
+
30
+ def check_demo_dependencies():
31
+ """Check if demo dependencies are installed."""
32
+ missing = []
33
+
34
+ try:
35
+ import gymnasium
36
+ except ImportError:
37
+ missing.append("gymnasium")
38
+
39
+ try:
40
+ import pygame
41
+ except ImportError:
42
+ missing.append("pygame")
43
+
44
+ try:
45
+ import stable_baselines3
46
+ except ImportError:
47
+ missing.append("stable-baselines3")
48
+
49
+ try:
50
+ import Box2D  # box2d-py installs the module as "Box2D"
51
+ except ImportError:
52
+ missing.append("box2d-py")
53
+
54
+ return missing
55
+
56
+
57
+ def main():
58
+ """Launch the interactive CASCADE-LATTICE demo."""
59
+ print("""
60
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
61
+ ║ ║
62
+ ║ ██████╗ █████╗ ███████╗ ██████╗ █████╗ ██████╗ ███████╗ ║
63
+ ║ ██╔════╝██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██╔════╝ ║
64
+ ║ ██║ ███████║███████╗██║ ███████║██║ ██║█████╗ ║
65
+ ║ ██║ ██╔══██║╚════██║██║ ██╔══██║██║ ██║██╔══╝ ║
66
+ ║ ╚██████╗██║ ██║███████║╚██████╗██║ ██║██████╔╝███████╗ ║
67
+ ║ ╚═════╝╚═╝ ╚═╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚══════╝ ║
68
+ ║ ║
69
+ ║ LATTICE DEMO - Sovereign Neural Internetwork Control ║
70
+ ║ ║
71
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
72
+ """)
73
+
74
+ # Check dependencies
75
+ missing = check_demo_dependencies()
76
+ if missing:
77
+ print(f"[!] Missing demo dependencies: {', '.join(missing)}")
78
+ print()
79
+ print(" Install with:")
80
+ print(" pip install cascade-lattice[demo]")
81
+ print()
82
+ print(" Or manually:")
83
+ print(f" pip install {' '.join(missing)}")
84
+ sys.exit(1)
85
+
86
+ # Check for rl-zoo3 (needed for model download)
87
+ try:
88
+ import rl_zoo3
89
+ except ImportError:
90
+ print("[!] Missing rl-zoo3 (needed for pretrained model)")
91
+ print(" pip install rl-zoo3")
92
+ sys.exit(1)
93
+
94
+ print("[CASCADE] Starting LunarLander demo...")
95
+ print()
96
+ print("Controls:")
97
+ print(" [H] HOLD-FREEZE - Pause time, inspect AI decision")
98
+ print(" [T] HOLD-TAKEOVER - Continue time, YOU control with WASD")
99
+ print(" [ESC] Release hold / Quit")
100
+ print()
101
+ print("In HOLD modes:")
102
+ print(" [W] Main Engine [A] Left Engine [D] Right Engine")
103
+ print(" [S] Accept AI choice / No-op")
104
+ print()
105
+
106
+ # Run the demo
107
+ demo_path = Path(__file__).parent.parent / "examples" / "sovereign_lattice_eval.py"
108
+
109
+ if not demo_path.exists():
110
+ # Try installed package location
111
+ import cascade
112
+ package_dir = Path(cascade.__file__).parent
113
+ demo_path = package_dir.parent / "examples" / "sovereign_lattice_eval.py"
114
+
115
+ if not demo_path.exists():
116
+ # Fallback: run inline demo
117
+ print("[!] Demo file not found. Running inline version...")
118
+ _run_inline_demo()
119
+ return
120
+
121
+ # Run the demo script
122
+ subprocess.run([sys.executable, str(demo_path)])
123
+
124
+
125
+ def _run_inline_demo():
126
+ """Minimal inline demo if main file not found."""
127
+ import gymnasium as gym
128
+ import numpy as np
129
+
130
+ from cascade import init
131
+ from cascade.hold import Hold
132
+ from cascade.store import observe
133
+
134
+ init(project="cascade_demo")
135
+ hold = Hold.get()
136
+
137
+ print("[CASCADE] Running minimal demo (install full package for GUI)")
138
+ print()
139
+
140
+ env = gym.make("LunarLander-v3")
141
+ obs, _ = env.reset()
142
+
143
+ for step in range(100):
144
+ # Random policy for minimal demo
145
+ action_probs = np.array([0.25, 0.25, 0.25, 0.25])
146
+
147
+ resolution = hold.yield_point(
148
+ action_probs=action_probs,
149
+ value=0.0,
150
+ observation={"state": obs.tolist()[:4]},
151
+ brain_id="random_demo",
152
+ action_labels=["NOOP", "LEFT", "MAIN", "RIGHT"],
153
+ blocking=False
154
+ )
155
+
156
+ obs, reward, term, trunc, _ = env.step(resolution.action)
157
+
158
+ observe("demo", {
159
+ "step": step,
160
+ "action": int(resolution.action),
161
+ "reward": float(reward),
162
+ "merkle": resolution.merkle_root,
163
+ }, sync=False)
164
+
165
+ if term or trunc:
166
+ print(f"[CASCADE] Episode ended at step {step}")
167
+ break
168
+
169
+ env.close()
170
+ print("[CASCADE] Demo complete. Check ~/.cascade/lattice for provenance data.")
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
cascade/demo_sdk.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ CASCADE SDK Demo - Shows automatic observation of calls.
3
+
4
+ Run: python -m cascade.demo_sdk
5
+ """
6
+
7
+ import os
8
+ import sys
9
+
10
+ # Add cascade to path if needed
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+
14
+ def demo_manual_observation():
15
+ """Demo manual observation without any provider installed."""
16
+ print("=" * 60)
17
+ print("CASCADE SDK Demo - Manual Observation")
18
+ print("=" * 60)
19
+
20
+ import cascade
21
+ from cascade.sdk import CascadeSDK
22
+
23
+ # Initialize with verbose mode
24
+ sdk = CascadeSDK()
25
+ sdk.init(emit_async=False, verbose=True)
26
+
27
+ print("\n[1] Simulating an OpenAI call...")
28
+ sdk.observe(
29
+ model_id="openai/gpt-4",
30
+ input_data="What is the capital of France?",
31
+ output_data="The capital of France is Paris.",
32
+ metrics={"prompt_tokens": 10, "completion_tokens": 8, "total_tokens": 18},
33
+ context={"provider": "openai", "endpoint": "chat.completions"}
34
+ )
35
+
36
+ print("\n[2] Simulating an Anthropic call...")
37
+ sdk.observe(
38
+ model_id="anthropic/claude-3-opus-20240229",
39
+ input_data="Explain quantum entanglement simply.",
40
+ output_data="Quantum entanglement is when two particles become connected...",
41
+ metrics={"input_tokens": 6, "output_tokens": 45},
42
+ context={"provider": "anthropic", "endpoint": "messages"}
43
+ )
44
+
45
+ print("\n[3] Simulating an Ollama local call...")
46
+ sdk.observe(
47
+ model_id="ollama/llama2:7b",
48
+ input_data="Write a haiku about coding.",
49
+ output_data="Fingers on keyboard\nLogic flows like mountain stream\nBugs become features",
50
+ metrics={"eval_count": 20, "eval_duration": 1.5},
51
+ context={"provider": "ollama", "endpoint": "generate"}
52
+ )
53
+
54
+ print("\n" + "=" * 60)
55
+ print("Observations saved to lattice/observations/")
56
+ print("=" * 60)
57
+
58
+ # Show what was saved
59
+ from cascade.observation import ObservationManager
60
+ manager = ObservationManager()
61
+ stats = manager.get_stats()
62
+ print(f"\nTotal observations: {stats['total_observations']}")
63
+ print(f"Model observations: {stats['model_observations']}")
64
+ print(f"Unique models: {stats['unique_models']}")
65
+
66
+
67
+ def demo_auto_patch():
68
+ """Demo auto-patching (requires providers to be installed)."""
69
+ print("\n" + "=" * 60)
70
+ print("CASCADE Auto-Patch Demo")
71
+ print("=" * 60)
72
+
73
+ import cascade
74
+
75
+ # This patches all installed providers
76
+ cascade.init(verbose=True)
77
+
78
+ print("\nPatched providers. Now any call will emit receipts.")
79
+ print("Example usage:")
80
+ print("""
81
+ import cascade
82
+ cascade.init()
83
+
84
+ # OpenAI (if installed)
85
+ import openai
86
+ client = openai.OpenAI()
87
+ response = client.chat.completions.create(
88
+ model="gpt-4",
89
+ messages=[{"role": "user", "content": "Hello!"}]
90
+ )
91
+ # ^^^ Receipt automatically emitted to lattice
92
+
93
+ # Anthropic (if installed)
94
+ import anthropic
95
+ client = anthropic.Anthropic()
96
+ response = client.messages.create(
97
+ model="claude-3-opus-20240229",
98
+ max_tokens=100,
99
+ messages=[{"role": "user", "content": "Hello!"}]
100
+ )
101
+ # ^^^ Receipt automatically emitted to lattice
102
+
103
+ # Ollama (if installed)
104
+ import ollama
105
+ response = ollama.chat(model="llama2", messages=[
106
+ {"role": "user", "content": "Hello!"}
107
+ ])
108
+ # ^^^ Receipt automatically emitted to lattice
109
+ """)
110
+
111
+
112
+ if __name__ == "__main__":
113
+ demo_manual_observation()
114
+ demo_auto_patch()
cascade/export/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ CASCADE Export Module - Tableau and BI Integration
3
+ """
4
+
5
+ from .tableau_export import (
6
+ export_for_tableau,
7
+ export_events_csv,
8
+ export_chains_csv,
9
+ export_metrics_csv,
10
+ export_hold_events_csv,
11
+ export_causation_graph_csv,
12
+ TableauExporter,
13
+ )
14
+
15
+ __all__ = [
16
+ "export_for_tableau",
17
+ "export_events_csv",
18
+ "export_chains_csv",
19
+ "export_metrics_csv",
20
+ "export_hold_events_csv",
21
+ "export_causation_graph_csv",
22
+ "TableauExporter",
23
+ ]
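A hedged sketch of the re-exported API (TableauExporter itself is defined in tableau_export.py, shown next):

    import time
    from cascade.export import TableauExporter, export_for_tableau

    exporter = TableauExporter()
    exporter.add_metric(
        name="loss",
        value=0.42,
        timestamp=time.time(),
        category="TRAINING_DYNAMICS",
        component="trainer",
    )
    files = exporter.export("./tableau_data")   # writes metrics_timeseries.csv + manifest.json
    print(files)

    # Or the one-liner, which falls back to generated sample data
    # if the local Cascade store cannot be loaded:
    export_for_tableau("./tableau_data")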
cascade/export/tableau_export.py ADDED
@@ -0,0 +1,598 @@
1
+ """
2
+ CASCADE → Tableau Export Pipeline
3
+
4
+ Exports Cascade data in Tableau-friendly formats:
5
+ - CSV files (universal)
6
+ - Hyper files (native Tableau, optional)
7
+
8
+ Usage:
9
+ from cascade.export import export_for_tableau
10
+
11
+ # Export all data to a directory
12
+ export_for_tableau("./tableau_data")
13
+
14
+ # Then in Tableau: Connect → Text File → select CSVs
15
+ """
16
+
17
+ import csv
18
+ import json
19
+ import os
20
+ from pathlib import Path
21
+ from datetime import datetime
22
+ from typing import Dict, List, Any, Optional
23
+ from dataclasses import dataclass, asdict
24
+
25
+ # Try to import Hyper API (optional)
26
+ try:
27
+ from tableauhyperapi import (
28
+ HyperProcess, Telemetry, Connection, CreateMode,
29
+ TableDefinition, SqlType, TableName, Inserter
30
+ )
31
+ HAS_HYPER = True
32
+ except ImportError:
33
+ HAS_HYPER = False
34
+
35
+
36
+ @dataclass
37
+ class EventRow:
38
+ """Flattened event for Tableau."""
39
+ event_id: str
40
+ timestamp: float
41
+ timestamp_iso: str
42
+ component: str
43
+ event_type: str
44
+ data_json: str
45
+ # Extracted common fields
46
+ loss: Optional[float] = None
47
+ accuracy: Optional[float] = None
48
+ learning_rate: Optional[float] = None
49
+ epoch: Optional[int] = None
50
+ step: Optional[int] = None
51
+ tokens: Optional[int] = None
52
+ latency_ms: Optional[float] = None
53
+ error_message: Optional[str] = None
54
+
55
+
56
+ @dataclass
57
+ class ChainRow:
58
+ """Flattened provenance chain for Tableau."""
59
+ session_id: str
60
+ model_id: str
61
+ model_hash: str
62
+ input_hash: str
63
+ output_hash: Optional[str]
64
+ merkle_root: str
65
+ created_at: float
66
+ created_at_iso: str
67
+ record_count: int
68
+ external_links_count: int
69
+ is_verified: bool
70
+
71
+
72
+ @dataclass
73
+ class HoldEventRow:
74
+ """Flattened HOLD event for Tableau."""
75
+ hold_id: str
76
+ timestamp: float
77
+ timestamp_iso: str
78
+ brain_id: str
79
+ state: str # PENDING, ACCEPTED, OVERRIDDEN, TIMEOUT
80
+ ai_choice: int
81
+ ai_confidence: float
82
+ final_action: int
83
+ was_override: bool
84
+ hold_duration_sec: float
85
+ value_estimate: float
86
+ action_count: int
87
+ override_source: Optional[str] = None
88
+
89
+
90
+ @dataclass
91
+ class CausationEdgeRow:
92
+ """Flattened causation link for Tableau."""
93
+ link_id: str
94
+ from_event_id: str
95
+ to_event_id: str
96
+ causation_type: str # temporal, correlation, threshold, direct
97
+ strength: float
98
+ timestamp: float
99
+ timestamp_iso: str
100
+
101
+
102
+ @dataclass
103
+ class MetricRow:
104
+ """Time-series metric for Tableau."""
105
+ timestamp: float
106
+ timestamp_iso: str
107
+ metric_name: str
108
+ metric_value: float
109
+ category: str # TRAINING_DYNAMICS, GRADIENT_HEALTH, etc.
110
+ component: str
111
+ is_anomaly: bool
112
+ anomaly_severity: Optional[str] = None
113
+
114
+
115
+ def _ts_to_iso(ts: float) -> str:
116
+ """Convert Unix timestamp to ISO string."""
117
+ try:
118
+ return datetime.fromtimestamp(ts).isoformat()
119
+ except (OverflowError, OSError, ValueError):
120
+ return ""
121
+
122
+
123
+ def _extract_metric_fields(data: Dict) -> Dict[str, Any]:
124
+ """Extract common metric fields from event data."""
125
+ return {
126
+ "loss": data.get("loss"),
127
+ "accuracy": data.get("accuracy") or data.get("acc"),
128
+ "learning_rate": data.get("learning_rate") or data.get("lr"),
129
+ "epoch": data.get("epoch"),
130
+ "step": data.get("step") or data.get("iter"),
131
+ "tokens": data.get("tokens") or data.get("total_tokens"),
132
+ "latency_ms": data.get("latency_ms") or data.get("latency"),
133
+ "error_message": data.get("error") or data.get("message"),
134
+ }
135
+
136
+
137
+ class TableauExporter:
138
+ """
139
+ Export Cascade data for Tableau visualization.
140
+
141
+ Creates a directory with CSV files ready for Tableau import:
142
+ - events.csv: All observed events
143
+ - chains.csv: Provenance chains
144
+ - hold_events.csv: HOLD protocol events
145
+ - causation_edges.csv: Graph edges for relationship diagrams
146
+ - metrics_timeseries.csv: Metrics over time
147
+
148
+ Example:
149
+ exporter = TableauExporter()
150
+ exporter.add_events(events)
151
+ exporter.add_chains(chains)
152
+ exporter.export("./tableau_data")
153
+ """
154
+
155
+ def __init__(self):
156
+ self.events: List[EventRow] = []
157
+ self.chains: List[ChainRow] = []
158
+ self.hold_events: List[HoldEventRow] = []
159
+ self.causation_edges: List[CausationEdgeRow] = []
160
+ self.metrics: List[MetricRow] = []
161
+
162
+ def add_event(self, event) -> None:
163
+ """Add a Cascade Event."""
164
+ data = event.data if hasattr(event, 'data') else {}
165
+ extracted = _extract_metric_fields(data)
166
+
167
+ row = EventRow(
168
+ event_id=event.event_id,
169
+ timestamp=event.timestamp,
170
+ timestamp_iso=_ts_to_iso(event.timestamp),
171
+ component=event.component,
172
+ event_type=event.event_type,
173
+ data_json=json.dumps(data),
174
+ **extracted
175
+ )
176
+ self.events.append(row)
177
+
178
+ def add_events(self, events) -> None:
179
+ """Add multiple events."""
180
+ for e in events:
181
+ self.add_event(e)
182
+
183
+ def add_chain(self, chain, is_verified: bool = True) -> None:
184
+ """Add a ProvenanceChain."""
185
+ row = ChainRow(
186
+ session_id=chain.session_id,
187
+ model_id=chain.model_id,
188
+ model_hash=chain.model_hash,
189
+ input_hash=chain.input_hash,
190
+ output_hash=chain.output_hash,
191
+ merkle_root=chain.merkle_root or "",
192
+ created_at=chain.created_at,
193
+ created_at_iso=_ts_to_iso(chain.created_at),
194
+ record_count=len(chain.records),
195
+ external_links_count=len(chain.external_roots),
196
+ is_verified=is_verified,
197
+ )
198
+ self.chains.append(row)
199
+
200
+ def add_chains(self, chains) -> None:
201
+ """Add multiple chains."""
202
+ for c in chains:
203
+ self.add_chain(c)
204
+
205
+ def add_hold_event(self, hold_point, resolution) -> None:
206
+ """Add a HOLD event with its resolution."""
207
+ import numpy as np
208
+
209
+ probs = hold_point.action_probs
210
+ if isinstance(probs, np.ndarray):
211
+ ai_choice = int(np.argmax(probs))
212
+ ai_confidence = float(np.max(probs))
213
+ action_count = len(probs)
214
+ else:
215
+ ai_choice = 0
216
+ ai_confidence = 0.0
217
+ action_count = 0
218
+
219
+ row = HoldEventRow(
220
+ hold_id=getattr(hold_point, 'hold_id', f"hold_{hold_point.timestamp}"),
221
+ timestamp=hold_point.timestamp if hasattr(hold_point, 'timestamp') else 0,
222
+ timestamp_iso=_ts_to_iso(hold_point.timestamp) if hasattr(hold_point, 'timestamp') else "",
223
+ brain_id=hold_point.brain_id,
224
+ state=resolution.state.value if hasattr(resolution.state, 'value') else str(resolution.state),
225
+ ai_choice=ai_choice,
226
+ ai_confidence=ai_confidence,
227
+ final_action=resolution.action,
228
+ was_override=resolution.was_override,
229
+ hold_duration_sec=resolution.hold_duration if hasattr(resolution, 'hold_duration') else 0,
230
+ value_estimate=hold_point.value,
231
+ action_count=action_count,
232
+ override_source=resolution.override_source if hasattr(resolution, 'override_source') else None,
233
+ )
234
+ self.hold_events.append(row)
235
+
236
+ def add_causation_link(self, link) -> None:
237
+ """Add a causation graph edge."""
238
+ row = CausationEdgeRow(
239
+ link_id=link.link_id if hasattr(link, 'link_id') else f"{link.from_event}_{link.to_event}",
240
+ from_event_id=link.from_event,
241
+ to_event_id=link.to_event,
242
+ causation_type=link.causation_type,
243
+ strength=link.strength,
244
+ timestamp=link.timestamp if hasattr(link, 'timestamp') else 0,
245
+ timestamp_iso=_ts_to_iso(link.timestamp) if hasattr(link, 'timestamp') else "",
246
+ )
247
+ self.causation_edges.append(row)
248
+
249
+ def add_causation_links(self, links) -> None:
250
+ """Add multiple causation links."""
251
+ for link in links:
252
+ self.add_causation_link(link)
253
+
254
+ def add_metric(self, name: str, value: float, timestamp: float,
255
+ category: str = "OTHER", component: str = "default",
256
+ is_anomaly: bool = False, anomaly_severity: str = None) -> None:
257
+ """Add a time-series metric point."""
258
+ row = MetricRow(
259
+ timestamp=timestamp,
260
+ timestamp_iso=_ts_to_iso(timestamp),
261
+ metric_name=name,
262
+ metric_value=value,
263
+ category=category,
264
+ component=component,
265
+ is_anomaly=is_anomaly,
266
+ anomaly_severity=anomaly_severity,
267
+ )
268
+ self.metrics.append(row)
269
+
270
+ def add_metrics_from_event(self, event, category_map: Dict[str, str] = None) -> None:
271
+ """Extract and add all metrics from an event."""
272
+ if category_map is None:
273
+ category_map = {
274
+ "loss": "TRAINING_DYNAMICS",
275
+ "accuracy": "TRAINING_DYNAMICS",
276
+ "lr": "TRAINING_DYNAMICS",
277
+ "learning_rate": "TRAINING_DYNAMICS",
278
+ "grad_norm": "GRADIENT_HEALTH",
279
+ "weight_norm": "WEIGHT_DYNAMICS",
280
+ "tokens": "MEMORY_COMPUTE",
281
+ "latency": "MEMORY_COMPUTE",
282
+ }
283
+
284
+ data = event.data if hasattr(event, 'data') else {}
285
+ for key, value in data.items():
286
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
287
+ self.add_metric(
288
+ name=key,
289
+ value=float(value),
290
+ timestamp=event.timestamp,
291
+ category=category_map.get(key, "OTHER"),
292
+ component=event.component,
293
+ )
294
+
295
+ def _write_csv(self, path: Path, rows: List, fieldnames: List[str]) -> None:
296
+ """Write rows to CSV."""
297
+ with open(path, 'w', newline='', encoding='utf-8') as f:
298
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
299
+ writer.writeheader()
300
+ for row in rows:
301
+ writer.writerow(asdict(row) if hasattr(row, '__dataclass_fields__') else row)
302
+
303
+ def export(self, output_dir: str) -> Dict[str, str]:
304
+ """
305
+ Export all data to CSV files.
306
+
307
+ Args:
308
+ output_dir: Directory to write CSV files
309
+
310
+ Returns:
311
+ Dict mapping data type to file path
312
+ """
313
+ output_path = Path(output_dir)
314
+ output_path.mkdir(parents=True, exist_ok=True)
315
+
316
+ files = {}
317
+
318
+ # Events
319
+ if self.events:
320
+ events_path = output_path / "events.csv"
321
+ self._write_csv(events_path, self.events, list(EventRow.__dataclass_fields__.keys()))
322
+ files["events"] = str(events_path)
323
+ print(f"✓ Exported {len(self.events)} events to {events_path}")
324
+
325
+ # Chains
326
+ if self.chains:
327
+ chains_path = output_path / "chains.csv"
328
+ self._write_csv(chains_path, self.chains, list(ChainRow.__dataclass_fields__.keys()))
329
+ files["chains"] = str(chains_path)
330
+ print(f"✓ Exported {len(self.chains)} chains to {chains_path}")
331
+
332
+ # HOLD events
333
+ if self.hold_events:
334
+ hold_path = output_path / "hold_events.csv"
335
+ self._write_csv(hold_path, self.hold_events, list(HoldEventRow.__dataclass_fields__.keys()))
336
+ files["hold_events"] = str(hold_path)
337
+ print(f"✓ Exported {len(self.hold_events)} HOLD events to {hold_path}")
338
+
339
+ # Causation edges
340
+ if self.causation_edges:
341
+ edges_path = output_path / "causation_edges.csv"
342
+ self._write_csv(edges_path, self.causation_edges, list(CausationEdgeRow.__dataclass_fields__.keys()))
343
+ files["causation_edges"] = str(edges_path)
344
+ print(f"✓ Exported {len(self.causation_edges)} causation edges to {edges_path}")
345
+
346
+ # Metrics time series
347
+ if self.metrics:
348
+ metrics_path = output_path / "metrics_timeseries.csv"
349
+ self._write_csv(metrics_path, self.metrics, list(MetricRow.__dataclass_fields__.keys()))
350
+ files["metrics"] = str(metrics_path)
351
+ print(f"✓ Exported {len(self.metrics)} metric points to {metrics_path}")
352
+
353
+ # Write a manifest
354
+ manifest_path = output_path / "manifest.json"
355
+ manifest = {
356
+ "exported_at": datetime.now().isoformat(),
357
+ "files": files,
358
+ "counts": {
359
+ "events": len(self.events),
360
+ "chains": len(self.chains),
361
+ "hold_events": len(self.hold_events),
362
+ "causation_edges": len(self.causation_edges),
363
+ "metrics": len(self.metrics),
364
+ }
365
+ }
366
+ with open(manifest_path, 'w') as f:
367
+ json.dump(manifest, f, indent=2)
368
+
369
+ print(f"\n📊 Tableau export complete: {output_path}")
370
+ print(f" Open Tableau → Connect → Text File → Select CSVs")
371
+
372
+ return files
373
+
374
+ def export_hyper(self, output_path: str) -> Optional[str]:
375
+ """
376
+ Export to Tableau Hyper format (native, fastest).
377
+
378
+ Requires: pip install tableauhyperapi
379
+ """
380
+ if not HAS_HYPER:
381
+ print("⚠️ Hyper API not installed. Run: pip install tableauhyperapi")
382
+ return None
383
+
384
+ hyper_path = Path(output_path)
385
+
386
+ with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
387
+ with Connection(hyper.endpoint, str(hyper_path), CreateMode.CREATE_AND_REPLACE) as conn:
388
+
389
+ # Create events table
390
+ if self.events:
391
+ events_table = TableDefinition(
392
+ TableName("events"),
393
+ [
394
+ ("event_id", SqlType.text()),
395
+ ("timestamp", SqlType.double()),
396
+ ("timestamp_iso", SqlType.text()),
397
+ ("component", SqlType.text()),
398
+ ("event_type", SqlType.text()),
399
+ ("loss", SqlType.double()),
400
+ ("accuracy", SqlType.double()),
401
+ ("tokens", SqlType.int()),
402
+ ]
403
+ )
404
+ conn.catalog.create_table(events_table)
405
+
406
+ with Inserter(conn, events_table) as inserter:
407
+ for e in self.events:
408
+ inserter.add_row([
409
+ e.event_id, e.timestamp, e.timestamp_iso,
410
+ e.component, e.event_type,
411
+ e.loss, e.accuracy, e.tokens
412
+ ])
413
+ inserter.execute()
414
+
415
+ print(f"✓ Exported Hyper file: {hyper_path}")
416
+ return str(hyper_path)
417
+
418
+
419
+ # =============================================================================
420
+ # Convenience Functions
421
+ # =============================================================================
422
+
423
+ def export_for_tableau(output_dir: str = "./tableau_export",
424
+ include_sample_data: bool = True) -> Dict[str, str]:
425
+ """
426
+ One-line export of all Cascade data for Tableau.
427
+
428
+ Args:
429
+ output_dir: Where to write CSV files
430
+ include_sample_data: Generate sample data if no real data
431
+
432
+ Returns:
433
+ Dict of exported file paths
434
+ """
435
+ exporter = TableauExporter()
436
+
437
+ # Try to load real data from Cascade store
438
+ try:
439
+ from cascade.store import query, stats
440
+ from cascade.observation import ObservationManager
441
+
442
+ # Get observations
443
+ manager = ObservationManager()
444
+ observations = manager.get_recent(limit=1000)
445
+
446
+ for obs in observations:
447
+ # Create mock event from observation
448
+ class MockEvent:
449
+ def __init__(self, o):
450
+ self.event_id = o.get('cid', '')
451
+ self.timestamp = o.get('timestamp', 0)
452
+ self.component = o.get('model_id', 'unknown')
453
+ self.event_type = 'inference'
454
+ self.data = o.get('data', {})
455
+
456
+ exporter.add_event(MockEvent(obs))
457
+ exporter.add_metrics_from_event(MockEvent(obs))
458
+
459
+ print(f"Loaded {len(observations)} observations from Cascade store")
460
+
461
+ except Exception as e:
462
+ print(f"Note: Could not load Cascade store ({e})")
463
+ if include_sample_data:
464
+ print("Generating sample data for demo...")
465
+ _add_sample_data(exporter)
466
+
467
+ return exporter.export(output_dir)
468
+
469
+
470
+ def _add_sample_data(exporter: TableauExporter) -> None:
471
+ """Add sample data for demonstration."""
472
+ import time
473
+ import random
474
+
475
+ base_time = time.time() - 3600 # 1 hour ago
476
+
477
+ # Sample events
478
+ models = ["gpt-4", "claude-3-opus", "llama-3-8b", "mistral-7b"]
479
+ event_types = ["inference", "training_step", "error", "checkpoint"]
480
+
481
+ for i in range(200):
482
+ class SampleEvent:
483
+ def __init__(self, idx):
484
+ self.event_id = f"evt_{idx:06d}"
485
+ self.timestamp = base_time + (idx * 18) # 18 sec apart
486
+ self.component = random.choice(models)
487
+ self.event_type = random.choice(event_types)
488
+ self.data = {
489
+ "loss": 2.5 - (idx * 0.01) + random.uniform(-0.1, 0.1),
490
+ "accuracy": min(0.95, 0.5 + (idx * 0.002) + random.uniform(-0.02, 0.02)),
491
+ "tokens": random.randint(100, 2000),
492
+ "latency_ms": random.uniform(50, 500),
493
+ "step": idx,
494
+ }
495
+
496
+ event = SampleEvent(i)
497
+ exporter.add_event(event)
498
+ exporter.add_metrics_from_event(event)
499
+
500
+ # Sample HOLD events
501
+ for i in range(20):
502
+ class SampleHoldPoint:
503
+ def __init__(self, idx):
504
+ import numpy as np
505
+ self.hold_id = f"hold_{idx:04d}"
506
+ self.timestamp = base_time + (idx * 180)
507
+ self.brain_id = random.choice(models)
508
+ self.action_probs = np.random.dirichlet([1, 1, 1, 1])
509
+ self.value = random.uniform(0.3, 0.9)
510
+
511
+ class SampleResolution:
512
+ def __init__(self, override=False):
513
+ self.state = type('State', (), {'value': 'OVERRIDDEN' if override else 'ACCEPTED'})()
514
+ self.action = random.randint(0, 3)
515
+ self.was_override = override
516
+ self.hold_duration = random.uniform(0.5, 10.0)
517
+ self.override_source = "human" if override else None
518
+
519
+ hold = SampleHoldPoint(i)
520
+ resolution = SampleResolution(override=random.random() < 0.25)
521
+ exporter.add_hold_event(hold, resolution)
522
+
523
+ # Sample causation edges
524
+ for i in range(50):
525
+ class SampleLink:
526
+ def __init__(self, idx):
527
+ self.link_id = f"link_{idx:04d}"
528
+ self.from_event = f"evt_{idx:06d}"
529
+ self.to_event = f"evt_{idx+1:06d}"
530
+ self.causation_type = random.choice(["temporal", "correlation", "threshold", "direct"])
531
+ self.strength = random.uniform(0.5, 1.0)
532
+ self.timestamp = base_time + (idx * 18)
533
+
534
+ exporter.add_causation_link(SampleLink(i))
535
+
536
+ # Sample chains
537
+ for i in range(10):
538
+ class SampleChain:
539
+ def __init__(self, idx):
540
+ self.session_id = f"session_{idx:04d}"
541
+ self.model_id = random.choice(models)
542
+ self.model_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
543
+ self.input_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
544
+ self.output_hash = f"{random.randint(0, 0xFFFFFFFF):08x}"
545
+ self.merkle_root = f"{random.randint(0, 0xFFFFFFFFFFFFFFFF):016x}"
546
+ self.created_at = base_time + (idx * 360)
547
+ self.records = [None] * random.randint(5, 50)
548
+ self.external_roots = [f"root_{j}" for j in range(random.randint(0, 3))]
549
+
550
+ exporter.add_chain(SampleChain(i))
551
+
552
+
553
+ def export_events_csv(events, output_path: str) -> str:
554
+ """Export events to CSV."""
555
+ exporter = TableauExporter()
556
+ exporter.add_events(events)
557
+ files = exporter.export(str(Path(output_path).parent))
558
+ return files.get("events", "")
559
+
560
+
561
+ def export_chains_csv(chains, output_path: str) -> str:
562
+ """Export chains to CSV."""
563
+ exporter = TableauExporter()
564
+ exporter.add_chains(chains)
565
+ files = exporter.export(str(Path(output_path).parent))
566
+ return files.get("chains", "")
567
+
568
+
569
+ def export_metrics_csv(events, output_path: str) -> str:
570
+ """Export metrics time series to CSV."""
571
+ exporter = TableauExporter()
572
+ for e in events:
573
+ exporter.add_metrics_from_event(e)
574
+ files = exporter.export(str(Path(output_path).parent))
575
+ return files.get("metrics", "")
576
+
577
+
578
+ def export_hold_events_csv(hold_pairs, output_path: str) -> str:
579
+ """Export HOLD events to CSV. hold_pairs = [(hold_point, resolution), ...]"""
580
+ exporter = TableauExporter()
581
+ for hold, res in hold_pairs:
582
+ exporter.add_hold_event(hold, res)
583
+ files = exporter.export(str(Path(output_path).parent))
584
+ return files.get("hold_events", "")
585
+
586
+
587
+ def export_causation_graph_csv(links, output_path: str) -> str:
588
+ """Export causation edges to CSV."""
589
+ exporter = TableauExporter()
590
+ exporter.add_causation_links(links)
591
+ files = exporter.export(str(Path(output_path).parent))
592
+ return files.get("causation_edges", "")
593
+
594
+
595
+ if __name__ == "__main__":
596
+ # Quick test
597
+ print("Exporting sample data for Tableau...")
598
+ export_for_tableau("./tableau_export", include_sample_data=True)
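For orientation, a minimal usage sketch of the export helper defined above. The import path cascade.export.tableau_export is assumed from the package layout, and the output directory name is arbitrary; export_for_tableau falls back to generated sample data when no Cascade store can be loaded.

    # Assumed import path; adjust if the package exposes the helper elsewhere.
    from cascade.export.tableau_export import export_for_tableau

    # Writes events.csv, chains.csv, hold_events.csv, causation_edges.csv and
    # metrics_timeseries.csv (whichever have rows) plus manifest.json.
    files = export_for_tableau("./tableau_export", include_sample_data=True)
    print(files)  # e.g. {"events": "tableau_export/events.csv", ...}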
cascade/forensics/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Read the Ghost in the Data
3
+
4
+ Every dataset is a confession. It remembers what happened to it.
5
+ This module reads those memories.
6
+
7
+ GHOST LOG: Inferred processing history from data artifacts
8
+ SKELETON: Probable system architecture
9
+ DNA: Technology fingerprints
10
+ SOUL: Behavioral predictions
11
+
12
+ Usage:
13
+ from cascade.forensics import DataForensics
14
+
15
+ forensics = DataForensics()
16
+ report = forensics.analyze(dataframe)
17
+
18
+ print(report.ghost_log) # Inferred operations
19
+ print(report.skeleton) # System architecture
20
+ print(report.fingerprints) # Technology hints
21
+ """
22
+
23
+ from cascade.forensics.analyzer import (
24
+ DataForensics,
25
+ ForensicsReport,
26
+ GhostLog,
27
+ InferredOperation,
28
+ )
29
+
30
+ from cascade.forensics.artifacts import (
31
+ ArtifactDetector,
32
+ TimestampArtifacts,
33
+ IDPatternArtifacts,
34
+ TextArtifacts,
35
+ NumericArtifacts,
36
+ NullPatternArtifacts,
37
+ SchemaArtifacts,
38
+ )
39
+
40
+ from cascade.forensics.fingerprints import (
41
+ TechFingerprinter,
42
+ Fingerprint,
43
+ )
44
+
45
+ __all__ = [
46
+ "DataForensics",
47
+ "ForensicsReport",
48
+ "GhostLog",
49
+ "InferredOperation",
50
+ "ArtifactDetector",
51
+ "TechFingerprinter",
52
+ "Fingerprint",
53
+ ]
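A short, self-contained sketch of the entry point re-exported above, assuming pandas is installed; the toy DataFrame is purely illustrative.

    import pandas as pd
    from cascade.forensics import DataForensics

    # Gapped IDs, a timestamp column and missing emails: the kinds of
    # patterns the detectors defined later in this diff look for.
    df = pd.DataFrame({
        "user_id": [1, 2, 4, 7, 9],
        "created_at": pd.date_range("2024-01-01", periods=5, freq="h"),
        "email": ["a@x.com", "b@x.com", None, "d@x.com", None],
    })

    report = DataForensics().analyze(df)
    print(report.summary())                 # artifact/fingerprint counts + hashes
    print(report.ghost_log.to_narrative())  # markdown-style list of inferred operations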
cascade/forensics/analyzer.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Main Analyzer
3
+
4
+ The data remembers. This module reads those memories.
5
+
6
+ Generates:
7
+ - GHOST LOG: Inferred sequence of operations
8
+ - SKELETON: Probable system architecture
9
+ - DNA: Technology fingerprints
10
+ - SOUL: Behavioral predictions
11
+ """
12
+
13
+ import hashlib
14
+ import json
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from typing import List, Dict, Any, Optional
18
+ from collections import OrderedDict
19
+
20
+ from cascade.forensics.artifacts import (
21
+ Artifact, ArtifactDetector,
22
+ TimestampArtifacts, IDPatternArtifacts, TextArtifacts,
23
+ NumericArtifacts, NullPatternArtifacts, SchemaArtifacts,
24
+ )
25
+ from cascade.forensics.fingerprints import TechFingerprinter, Fingerprint
26
+
27
+
28
+ @dataclass
29
+ class InferredOperation:
30
+ """A single inferred operation from the ghost log."""
31
+ sequence: int
32
+ operation: str
33
+ description: str
34
+ confidence: float
35
+ evidence: List[str] = field(default_factory=list)
36
+
37
+ def to_dict(self) -> Dict[str, Any]:
38
+ return {
39
+ "seq": self.sequence,
40
+ "op": self.operation,
41
+ "desc": self.description,
42
+ "confidence": self.confidence,
43
+ "evidence": self.evidence,
44
+ }
45
+
46
+
47
+ @dataclass
48
+ class GhostLog:
49
+ """
50
+ Inferred processing history - the ghost of the system.
51
+
52
+ This is a reconstruction of what PROBABLY happened
53
+ based on artifacts left in the data.
54
+ """
55
+ operations: List[InferredOperation] = field(default_factory=list)
56
+
57
+ # Provenance
58
+ analysis_timestamp: float = field(default_factory=time.time)
59
+ data_hash: str = ""
60
+ ghost_hash: str = ""
61
+
62
+ def add_operation(self, op: str, desc: str, confidence: float, evidence: List[str] = None):
63
+ """Add an inferred operation to the ghost log."""
64
+ self.operations.append(InferredOperation(
65
+ sequence=len(self.operations) + 1,
66
+ operation=op,
67
+ description=desc,
68
+ confidence=confidence,
69
+ evidence=evidence or [],
70
+ ))
71
+
72
+ def finalize(self) -> str:
73
+ """Compute hash of the ghost log for provenance."""
74
+ content = json.dumps([op.to_dict() for op in self.operations], sort_keys=True)
75
+ self.ghost_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
76
+ return self.ghost_hash
77
+
78
+ def to_dict(self) -> Dict[str, Any]:
79
+ return {
80
+ "operations": [op.to_dict() for op in self.operations],
81
+ "analysis_timestamp": self.analysis_timestamp,
82
+ "data_hash": self.data_hash,
83
+ "ghost_hash": self.ghost_hash,
84
+ }
85
+
86
+ def to_narrative(self) -> str:
87
+ """Generate human-readable narrative of inferred processing."""
88
+ if not self.operations:
89
+ return "No processing artifacts detected."
90
+
91
+ lines = ["## Ghost Log - Inferred Processing History\n"]
92
+ lines.append("*Based on artifacts left in the data, this is what probably happened:*\n")
93
+
94
+ for op in self.operations:
95
+ conf_str = "●" * int(op.confidence * 5) + "○" * (5 - int(op.confidence * 5))
96
+ lines.append(f"**{op.sequence}. {op.operation}** [{conf_str}]")
97
+ lines.append(f" {op.description}")
98
+ if op.evidence:
99
+ lines.append(f" *Evidence: {', '.join(op.evidence[:3])}*")
100
+ lines.append("")
101
+
102
+ return "\n".join(lines)
103
+
104
+
105
+ @dataclass
106
+ class ForensicsReport:
107
+ """Complete forensics analysis report."""
108
+
109
+ # Artifacts detected
110
+ artifacts: List[Artifact] = field(default_factory=list)
111
+
112
+ # Inferred processing
113
+ ghost_log: GhostLog = field(default_factory=GhostLog)
114
+
115
+ # Technology fingerprints
116
+ fingerprints: List[Fingerprint] = field(default_factory=list)
117
+
118
+ # Synthesized architecture
119
+ likely_stack: Dict[str, Any] = field(default_factory=dict)
120
+
121
+ # Security concerns
122
+ security_concerns: List[Dict[str, Any]] = field(default_factory=list)
123
+
124
+ # Metadata
125
+ analysis_timestamp: float = field(default_factory=time.time)
126
+ row_count: int = 0
127
+ column_count: int = 0
128
+ data_hash: str = ""
129
+
130
+ def to_dict(self) -> Dict[str, Any]:
131
+ return {
132
+ "artifacts": [a.to_dict() for a in self.artifacts],
133
+ "ghost_log": self.ghost_log.to_dict(),
134
+ "fingerprints": [f.to_dict() for f in self.fingerprints],
135
+ "likely_stack": self.likely_stack,
136
+ "security_concerns": self.security_concerns,
137
+ "metadata": {
138
+ "timestamp": self.analysis_timestamp,
139
+ "rows": self.row_count,
140
+ "columns": self.column_count,
141
+ "data_hash": self.data_hash,
142
+ }
143
+ }
144
+
145
+ def summary(self) -> Dict[str, Any]:
146
+ """Generate summary for display."""
147
+ return {
148
+ "artifacts_found": len(self.artifacts),
149
+ "operations_inferred": len(self.ghost_log.operations),
150
+ "technologies_identified": len(self.fingerprints),
151
+ "security_concerns": len(self.security_concerns),
152
+ "top_fingerprints": [f.technology for f in self.fingerprints[:5]],
153
+ "data_hash": self.data_hash,
154
+ "ghost_hash": self.ghost_log.ghost_hash,
155
+ }
156
+
157
+
158
+ class DataForensics:
159
+ """
160
+ Main forensics analyzer.
161
+
162
+ Usage:
163
+ forensics = DataForensics()
164
+ report = forensics.analyze(df)
165
+
166
+ print(report.ghost_log.to_narrative())
167
+ print(report.likely_stack)
168
+ """
169
+
170
+ def __init__(self):
171
+ self.detectors = [
172
+ TimestampArtifacts(),
173
+ IDPatternArtifacts(),
174
+ TextArtifacts(),
175
+ NumericArtifacts(),
176
+ NullPatternArtifacts(),
177
+ SchemaArtifacts(),
178
+ ]
179
+ self.fingerprinter = TechFingerprinter()
180
+
181
+ def analyze(self, df) -> ForensicsReport:
182
+ """
183
+ Analyze a dataframe for processing artifacts.
184
+
185
+ Args:
186
+ df: Pandas DataFrame to analyze
187
+
188
+ Returns:
189
+ ForensicsReport with all findings
190
+ """
191
+ report = ForensicsReport()
192
+ report.row_count = len(df)
193
+ report.column_count = len(df.columns)
194
+
195
+ # Compute data hash
196
+ try:
197
+ # Sample hash for large datasets
198
+ if len(df) > 10000:
199
+ sample = df.sample(10000, random_state=42)
200
+ else:
201
+ sample = df
202
+ content = sample.to_json()
203
+ report.data_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
204
+ except Exception:
205
+ report.data_hash = "unknown"
206
+
207
+ # Run all detectors
208
+ all_artifacts = []
209
+
210
+ for detector in self.detectors:
211
+ try:
212
+ # Run each detector exactly once through detect_all().
213
+ # Whole-frame detectors (null patterns, schema) override it,
214
+ # and for the per-column detectors the base-class detect_all
215
+ # already walks every column, so a second explicit per-column
216
+ # pass here would double-count their artifacts.
217
+ artifacts = detector.detect_all(df)
218
+ all_artifacts.extend(artifacts)
219
+
220
+
221
+ except Exception:
222
+ # Don't let one detector crash the whole analysis
223
+ pass
224
+
225
+ report.artifacts = all_artifacts
226
+
227
+ # Build ghost log from artifacts
228
+ report.ghost_log = self._build_ghost_log(all_artifacts, df)
229
+ report.ghost_log.data_hash = report.data_hash
230
+ report.ghost_log.finalize()
231
+
232
+ # Generate technology fingerprints
233
+ report.fingerprints = self.fingerprinter.analyze(all_artifacts)
234
+ report.likely_stack = self.fingerprinter.get_likely_stack()
235
+ report.security_concerns = self.fingerprinter.get_security_concerns()
236
+
237
+ return report
238
+
239
+ def _build_ghost_log(self, artifacts: List[Artifact], df) -> GhostLog:
240
+ """
241
+ Build inferred processing history from artifacts.
242
+
243
+ This is where we reconstruct the sequence of operations
244
+ that probably created this data.
245
+ """
246
+ ghost = GhostLog()
247
+
248
+ # Group artifacts by type for logical ordering
249
+ by_type = {}
250
+ for a in artifacts:
251
+ if a.artifact_type not in by_type:
252
+ by_type[a.artifact_type] = []
253
+ by_type[a.artifact_type].append(a)
254
+
255
+ # Infer operations in logical order
256
+
257
+ # 1. Data sourcing (schema artifacts come first)
258
+ if "framework_fingerprint" in by_type:
259
+ for a in by_type["framework_fingerprint"]:
260
+ ghost.add_operation(
261
+ "DATA_SOURCE",
262
+ f"Data originated from {a.details.get('framework', 'database')}: {a.evidence}",
263
+ a.confidence,
264
+ [a.evidence]
265
+ )
266
+
267
+ if "naming_convention" in by_type:
268
+ for a in by_type["naming_convention"]:
269
+ ghost.add_operation(
270
+ "SCHEMA_ORIGIN",
271
+ f"Schema follows {a.details.get('convention', 'unknown')} convention",
272
+ a.confidence,
273
+ [a.evidence]
274
+ )
275
+
276
+ # 2. Merging (if multiple sources detected)
277
+ if "mixed_conventions" in by_type or "id_prefix" in by_type:
278
+ ghost.add_operation(
279
+ "DATA_MERGE",
280
+ "Multiple data sources were merged together",
281
+ 0.75,
282
+ [a.evidence for a in by_type.get("mixed_conventions", []) + by_type.get("id_prefix", [])]
283
+ )
284
+
285
+ # 3. ID generation
286
+ if "uuid_version" in by_type:
287
+ for a in by_type["uuid_version"]:
288
+ ghost.add_operation(
289
+ "ID_GENERATION",
290
+ f"IDs generated using {a.details.get('meaning', 'UUID')}",
291
+ a.confidence,
292
+ [a.evidence]
293
+ )
294
+
295
+ if "hash_id" in by_type:
296
+ for a in by_type["hash_id"]:
297
+ ghost.add_operation(
298
+ "ID_GENERATION",
299
+ f"IDs are {a.details.get('probable_algorithm', 'hash')}-based (content-addressed)",
300
+ a.confidence,
301
+ [a.evidence]
302
+ )
303
+
304
+ # 4. Processing / Transformation
305
+ if "case_normalization" in by_type:
306
+ for a in by_type["case_normalization"]:
307
+ ghost.add_operation(
308
+ "TEXT_NORMALIZATION",
309
+ f"Text converted to {a.details.get('case', 'normalized')} case",
310
+ a.confidence,
311
+ [a.evidence]
312
+ )
313
+
314
+ if "whitespace_trimming" in by_type:
315
+ ghost.add_operation(
316
+ "TEXT_CLEANING",
317
+ "Whitespace trimmed from text fields",
318
+ 0.70,
319
+ [a.evidence for a in by_type["whitespace_trimming"]]
320
+ )
321
+
322
+ if "truncation" in by_type:
323
+ for a in by_type["truncation"]:
324
+ ghost.add_operation(
325
+ "FIELD_TRUNCATION",
326
+ f"Text truncated at {a.details.get('max_length', '?')} characters",
327
+ a.confidence,
328
+ [a.evidence]
329
+ )
330
+
331
+ if "numeric_rounding" in by_type:
332
+ for a in by_type["numeric_rounding"]:
333
+ ghost.add_operation(
334
+ "NUMERIC_ROUNDING",
335
+ f"Numbers rounded: {a.evidence}",
336
+ a.confidence,
337
+ [a.evidence]
338
+ )
339
+
340
+ # 5. Filtering / Deletion
341
+ if "sequential_id_gaps" in by_type:
342
+ for a in by_type["sequential_id_gaps"]:
343
+ gap_ratio = a.details.get('gap_ratio', 0)
344
+ ghost.add_operation(
345
+ "RECORD_FILTERING",
346
+ f"~{gap_ratio*100:.0f}% of records were filtered or deleted",
347
+ a.confidence,
348
+ [a.evidence]
349
+ )
350
+
351
+ if "hard_cutoff" in by_type:
352
+ for a in by_type["hard_cutoff"]:
353
+ ghost.add_operation(
354
+ "VALUE_CAPPING",
355
+ f"Values capped at {a.details.get('cutoff', '?')}",
356
+ a.confidence,
357
+ [a.evidence]
358
+ )
359
+
360
+ # 6. Batch processing patterns
361
+ if "timestamp_rounding" in by_type:
362
+ for a in by_type["timestamp_rounding"]:
363
+ ghost.add_operation(
364
+ "BATCH_PROCESSING",
365
+ f"Data processed in batches: {a.evidence}",
366
+ a.confidence,
367
+ [a.evidence]
368
+ )
369
+
370
+ if "regular_intervals" in by_type:
371
+ for a in by_type["regular_intervals"]:
372
+ ghost.add_operation(
373
+ "SCHEDULED_JOB",
374
+ f"Regular processing schedule detected: {a.details.get('interval_desc', 'unknown')}",
375
+ a.confidence,
376
+ [a.evidence]
377
+ )
378
+
379
+ if "temporal_clustering" in by_type:
380
+ ghost.add_operation(
381
+ "BURST_PROCESSING",
382
+ "Event-driven or burst batch processing detected",
383
+ 0.75,
384
+ [a.evidence for a in by_type["temporal_clustering"]]
385
+ )
386
+
387
+ # 7. Data quality issues
388
+ if "encoding_artifact" in by_type:
389
+ for a in by_type["encoding_artifact"]:
390
+ ghost.add_operation(
391
+ "ENCODING_ERROR",
392
+ f"Character encoding conversion failed: {a.evidence}",
393
+ a.confidence,
394
+ [a.evidence]
395
+ )
396
+
397
+ if "sentinel_value" in by_type:
398
+ for a in by_type["sentinel_value"]:
399
+ ghost.add_operation(
400
+ "NULL_HANDLING",
401
+ f"NULLs represented as sentinel value {a.details.get('sentinel', '?')}",
402
+ a.confidence,
403
+ [a.evidence]
404
+ )
405
+
406
+ if "high_null_rate" in by_type:
407
+ for a in by_type["high_null_rate"]:
408
+ ghost.add_operation(
409
+ "OPTIONAL_FIELD",
410
+ f"Column {a.column} is optional or had ETL issues ({a.details.get('null_rate', 0)*100:.0f}% null)",
411
+ a.confidence,
412
+ [a.evidence]
413
+ )
414
+
415
+ # 8. Export (often the last step)
416
+ if any("PANDAS" in a.inferred_operation for a in artifacts):
417
+ ghost.add_operation(
418
+ "DATA_EXPORT",
419
+ "Data exported via Pandas to CSV",
420
+ 0.90,
421
+ ["Unnamed column artifact"]
422
+ )
423
+
424
+ return ghost
425
+
426
+ def analyze_file(self, filepath: str) -> ForensicsReport:
427
+ """
428
+ Analyze a data file.
429
+
430
+ Supports: CSV, JSON, JSONL, Parquet, Excel
431
+ """
432
+ import pandas as pd
433
+ from pathlib import Path
434
+
435
+ path = Path(filepath)
436
+ suffix = path.suffix.lower()
437
+
438
+ if suffix == '.csv':
439
+ df = pd.read_csv(filepath)
440
+ elif suffix == '.json':
441
+ df = pd.read_json(filepath)
442
+ elif suffix == '.jsonl':
443
+ df = pd.read_json(filepath, lines=True)
444
+ elif suffix == '.parquet':
445
+ df = pd.read_parquet(filepath)
446
+ elif suffix in ['.xlsx', '.xls']:
447
+ df = pd.read_excel(filepath)
448
+ else:
449
+ # Try CSV as default
450
+ df = pd.read_csv(filepath)
451
+
452
+ return self.analyze(df)
453
+
454
+
455
+ def analyze_dataframe(df) -> ForensicsReport:
456
+ """Convenience function to analyze a dataframe."""
457
+ forensics = DataForensics()
458
+ return forensics.analyze(df)
459
+
460
+
461
+ def analyze_file(filepath: str) -> ForensicsReport:
462
+ """Convenience function to analyze a file."""
463
+ forensics = DataForensics()
464
+ return forensics.analyze_file(filepath)
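A sketch of the module-level analyze_file convenience defined above; the input file name is hypothetical, and the JSON dump simply shows that the report serializes through to_dict().

    import json
    from cascade.forensics.analyzer import analyze_file

    # Hypothetical input file; CSV, JSON, JSONL, Parquet and Excel are handled.
    report = analyze_file("exported_users.csv")

    print(report.ghost_log.to_narrative())
    print(report.likely_stack)  # synthesized from the technology fingerprints
    with open("forensics_report.json", "w") as f:
        json.dump(report.to_dict(), f, indent=2, default=str)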
cascade/forensics/artifacts.py ADDED
@@ -0,0 +1,1063 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CASCADE Forensics - Artifact Detectors
3
+
4
+ Each detector looks for specific patterns in data that reveal
5
+ how it was processed. The data remembers. We read.
6
+ """
7
+
8
+ import re
9
+ import hashlib
10
+ from dataclasses import dataclass, field
11
+ from typing import List, Dict, Any, Optional, Set, Tuple
12
+ from datetime import datetime
13
+ from collections import Counter
14
+ import statistics
15
+
16
+
17
+ @dataclass
18
+ class Artifact:
19
+ """A single detected artifact - evidence of processing."""
20
+ artifact_type: str
21
+ column: str
22
+ evidence: str
23
+ confidence: float # 0.0 to 1.0
24
+ inferred_operation: str
25
+ details: Dict[str, Any] = field(default_factory=dict)
26
+
27
+ def to_dict(self) -> Dict[str, Any]:
28
+ return {
29
+ "type": self.artifact_type,
30
+ "column": self.column,
31
+ "evidence": self.evidence,
32
+ "confidence": self.confidence,
33
+ "inferred_op": self.inferred_operation,
34
+ "details": self.details,
35
+ }
36
+
37
+
38
+ class ArtifactDetector:
39
+ """Base class for artifact detection."""
40
+
41
+ name: str = "base"
42
+
43
+ def detect(self, df, column: str) -> List[Artifact]:
44
+ """Detect artifacts in a column. Override in subclasses."""
45
+ return []
46
+
47
+ def detect_all(self, df) -> List[Artifact]:
48
+ """Detect artifacts across all applicable columns."""
49
+ artifacts = []
50
+ for col in df.columns:
51
+ artifacts.extend(self.detect(df, col))
52
+ return artifacts
53
+
54
+
55
+ class TimestampArtifacts(ArtifactDetector):
56
+ """
57
+ Detect timestamp patterns that reveal processing behavior.
58
+
59
+ Artifacts detected:
60
+ - Rounding to minute/hour/day (batch processing intervals)
61
+ - Regular intervals (scheduled jobs)
62
+ - Temporal clustering (burst processing)
63
+ - Timezone artifacts
64
+ - Future/past anomalies
65
+ """
66
+
67
+ name = "timestamp"
68
+
69
+ def detect(self, df, column: str) -> List[Artifact]:
70
+ artifacts = []
71
+
72
+ # Check if column looks like timestamps
73
+ if not self._is_timestamp_column(df, column):
74
+ return artifacts
75
+
76
+ try:
77
+ timestamps = self._parse_timestamps(df, column)
78
+ if len(timestamps) < 2:
79
+ return artifacts
80
+
81
+ # Check for rounding patterns
82
+ rounding = self._detect_rounding(timestamps)
83
+ if rounding:
84
+ artifacts.append(rounding)
85
+
86
+ # Check for regular intervals
87
+ intervals = self._detect_intervals(timestamps)
88
+ if intervals:
89
+ artifacts.append(intervals)
90
+
91
+ # Check for clustering
92
+ clustering = self._detect_clustering(timestamps)
93
+ if clustering:
94
+ artifacts.append(clustering)
95
+
96
+ # Check for timezone issues
97
+ tz_artifacts = self._detect_timezone_artifacts(timestamps)
98
+ artifacts.extend(tz_artifacts)
99
+
100
+ except Exception:
101
+ pass
102
+
103
+ return artifacts
104
+
105
+ def _is_timestamp_column(self, df, column: str) -> bool:
106
+ """Heuristic to detect timestamp columns."""
107
+ col_lower = column.lower()
108
+ timestamp_hints = ['time', 'date', 'created', 'updated', 'modified', 'timestamp', '_at', '_on']
109
+ if any(hint in col_lower for hint in timestamp_hints):
110
+ return True
111
+
112
+ # Check data type
113
+ dtype = str(df[column].dtype)
114
+ if 'datetime' in dtype or 'time' in dtype:
115
+ return True
116
+
117
+ # Sample and check format
118
+ sample = df[column].dropna().head(5).astype(str).tolist()
119
+ date_patterns = [
120
+ r'\d{4}-\d{2}-\d{2}',
121
+ r'\d{2}/\d{2}/\d{4}',
122
+ r'\d{10,13}', # Unix timestamp
123
+ ]
124
+ for val in sample:
125
+ for pattern in date_patterns:
126
+ if re.search(pattern, val):
127
+ return True
128
+
129
+ return False
130
+
131
+ def _parse_timestamps(self, df, column: str) -> List[datetime]:
132
+ """Parse column to datetime objects."""
133
+ import pandas as pd
134
+
135
+ try:
136
+ # Try pandas datetime conversion
137
+ parsed = pd.to_datetime(df[column], errors='coerce')
138
+ return [ts.to_pydatetime() for ts in parsed.dropna()]
139
+ except Exception:
140
+ return []
141
+
142
+ def _detect_rounding(self, timestamps: List[datetime]) -> Optional[Artifact]:
143
+ """Detect if timestamps are rounded to specific intervals."""
144
+ if len(timestamps) < 10:
145
+ return None
146
+
147
+ # Check seconds
148
+ seconds = [ts.second for ts in timestamps]
149
+ unique_seconds = set(seconds)
150
+
151
+ # All zeros = minute rounding
152
+ if unique_seconds == {0}:
153
+ # Check minutes
154
+ minutes = [ts.minute for ts in timestamps]
155
+ unique_minutes = set(minutes)
156
+
157
+ if unique_minutes == {0}:
158
+ return Artifact(
159
+ artifact_type="timestamp_rounding",
160
+ column="timestamps",
161
+ evidence=f"All timestamps rounded to hour (0 minutes, 0 seconds)",
162
+ confidence=0.95,
163
+ inferred_operation="BATCH_HOURLY",
164
+ details={"interval": "hour", "sample_size": len(timestamps)}
165
+ )
166
+ elif all(m % 15 == 0 for m in minutes):
167
+ return Artifact(
168
+ artifact_type="timestamp_rounding",
169
+ column="timestamps",
170
+ evidence=f"Timestamps rounded to 15-minute intervals",
171
+ confidence=0.90,
172
+ inferred_operation="BATCH_15MIN",
173
+ details={"interval": "15min", "unique_minutes": list(unique_minutes)}
174
+ )
175
+ elif all(m % 5 == 0 for m in minutes):
176
+ return Artifact(
177
+ artifact_type="timestamp_rounding",
178
+ column="timestamps",
179
+ evidence=f"Timestamps rounded to 5-minute intervals",
180
+ confidence=0.85,
181
+ inferred_operation="BATCH_5MIN",
182
+ details={"interval": "5min"}
183
+ )
184
+ else:
185
+ return Artifact(
186
+ artifact_type="timestamp_rounding",
187
+ column="timestamps",
188
+ evidence=f"Timestamps rounded to minute (0 seconds)",
189
+ confidence=0.85,
190
+ inferred_operation="BATCH_MINUTE",
191
+ details={"interval": "minute"}
192
+ )
193
+
194
+ # Check if seconds cluster on specific values
195
+ second_counts = Counter(seconds)
196
+ most_common = second_counts.most_common(1)[0]
197
+ if most_common[1] > len(timestamps) * 0.8:
198
+ return Artifact(
199
+ artifact_type="timestamp_rounding",
200
+ column="timestamps",
201
+ evidence=f"{most_common[1]/len(timestamps)*100:.0f}% of timestamps have second={most_common[0]}",
202
+ confidence=0.70,
203
+ inferred_operation="SYSTEMATIC_TIMESTAMP_ASSIGNMENT",
204
+ details={"dominant_second": most_common[0], "percentage": most_common[1]/len(timestamps)}
205
+ )
206
+
207
+ return None
208
+
209
+ def _detect_intervals(self, timestamps: List[datetime]) -> Optional[Artifact]:
210
+ """Detect regular time intervals suggesting scheduled jobs."""
211
+ if len(timestamps) < 10:
212
+ return None
213
+
214
+ sorted_ts = sorted(timestamps)
215
+ deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]
216
+
217
+ if not deltas:
218
+ return None
219
+
220
+ # Check for consistent intervals
221
+ median_delta = statistics.median(deltas)
222
+ if median_delta == 0:
223
+ return None
224
+
225
+ # Count how many deltas are close to median
226
+ tolerance = median_delta * 0.1 # 10% tolerance
227
+ consistent = sum(1 for d in deltas if abs(d - median_delta) < tolerance)
228
+ consistency_ratio = consistent / len(deltas)
229
+
230
+ if consistency_ratio > 0.7:
231
+ # Describe the interval
232
+ interval_desc = self._describe_interval(median_delta)
233
+ return Artifact(
234
+ artifact_type="regular_intervals",
235
+ column="timestamps",
236
+ evidence=f"{consistency_ratio*100:.0f}% of records have ~{interval_desc} intervals",
237
+ confidence=min(0.95, consistency_ratio),
238
+ inferred_operation=f"SCHEDULED_JOB_{interval_desc.upper().replace(' ', '_')}",
239
+ details={
240
+ "median_seconds": median_delta,
241
+ "interval_desc": interval_desc,
242
+ "consistency": consistency_ratio
243
+ }
244
+ )
245
+
246
+ return None
247
+
248
+ def _describe_interval(self, seconds: float) -> str:
249
+ """Human-readable interval description."""
250
+ if seconds < 60:
251
+ return f"{seconds:.0f}s"
252
+ elif seconds < 3600:
253
+ return f"{seconds/60:.0f}min"
254
+ elif seconds < 86400:
255
+ return f"{seconds/3600:.1f}hr"
256
+ else:
257
+ return f"{seconds/86400:.1f}day"
258
+
259
+ def _detect_clustering(self, timestamps: List[datetime]) -> Optional[Artifact]:
260
+ """Detect temporal clustering (burst processing)."""
261
+ if len(timestamps) < 20:
262
+ return None
263
+
264
+ sorted_ts = sorted(timestamps)
265
+
266
+ # Look for bursts: many records in short time, then gaps
267
+ deltas = [(sorted_ts[i+1] - sorted_ts[i]).total_seconds() for i in range(len(sorted_ts)-1)]
268
+
269
+ if not deltas:
270
+ return None
271
+
272
+ median_delta = statistics.median(deltas)
273
+ if median_delta == 0:
274
+ return None
275
+
276
+ # Count "burst" deltas (much smaller than median) vs "gap" deltas (much larger)
277
+ bursts = sum(1 for d in deltas if d < median_delta * 0.1)
278
+ gaps = sum(1 for d in deltas if d > median_delta * 5)
279
+
280
+ if bursts > len(deltas) * 0.3 and gaps > len(deltas) * 0.05:
281
+ return Artifact(
282
+ artifact_type="temporal_clustering",
283
+ column="timestamps",
284
+ evidence=f"Burst pattern: {bursts} rapid records, {gaps} long gaps",
285
+ confidence=0.75,
286
+ inferred_operation="BATCH_BURST_PROCESSING",
287
+ details={
288
+ "burst_count": bursts,
289
+ "gap_count": gaps,
290
+ "median_delta_seconds": median_delta
291
+ }
292
+ )
293
+
294
+ return None
295
+
296
+ def _detect_timezone_artifacts(self, timestamps: List[datetime]) -> List[Artifact]:
297
+ """Detect timezone-related artifacts."""
298
+ artifacts = []
299
+
300
+ # Check for hour distribution anomalies (e.g., no records 0-7 UTC = US business hours)
301
+ hours = [ts.hour for ts in timestamps]
302
+ hour_counts = Counter(hours)
303
+
304
+ # Check for gaps suggesting business hours in a specific timezone
305
+ zero_hours = [h for h in range(24) if hour_counts.get(h, 0) == 0]
306
+
307
+ if len(zero_hours) >= 6 and len(zero_hours) <= 12:
308
+ # Contiguous gap?
309
+ zero_hours_sorted = sorted(zero_hours)
310
+ if zero_hours_sorted[-1] - zero_hours_sorted[0] == len(zero_hours) - 1:
311
+ artifacts.append(Artifact(
312
+ artifact_type="business_hours",
313
+ column="timestamps",
314
+ evidence=f"No records during hours {min(zero_hours)}-{max(zero_hours)} UTC",
315
+ confidence=0.70,
316
+ inferred_operation="BUSINESS_HOURS_ONLY",
317
+ details={"quiet_hours": zero_hours}
318
+ ))
319
+
320
+ return artifacts
321
+
322
+
323
+ class IDPatternArtifacts(ArtifactDetector):
324
+ """
325
+ Detect ID patterns that reveal data lineage.
326
+
327
+ Artifacts detected:
328
+ - Sequential IDs with gaps (deletions/filtering)
329
+ - UUID versions (generation method)
330
+ - Prefixes (source identification)
331
+ - Hash patterns (deterministic generation)
332
+ """
333
+
334
+ name = "id_patterns"
335
+
336
+ def detect(self, df, column: str) -> List[Artifact]:
337
+ artifacts = []
338
+
339
+ if not self._is_id_column(df, column):
340
+ return artifacts
341
+
342
+ try:
343
+ values = df[column].dropna().astype(str).tolist()
344
+ if len(values) < 5:
345
+ return artifacts
346
+
347
+ # Check for sequential integers with gaps
348
+ gaps = self._detect_sequential_gaps(values)
349
+ if gaps:
350
+ artifacts.append(gaps)
351
+
352
+ # Check for UUID patterns
353
+ uuid_artifact = self._detect_uuid_patterns(values)
354
+ if uuid_artifact:
355
+ artifacts.append(uuid_artifact)
356
+
357
+ # Check for prefixes
358
+ prefix = self._detect_prefixes(values)
359
+ if prefix:
360
+ artifacts.append(prefix)
361
+
362
+ # Check for hash patterns
363
+ hash_artifact = self._detect_hash_patterns(values)
364
+ if hash_artifact:
365
+ artifacts.append(hash_artifact)
366
+
367
+ except Exception:
368
+ pass
369
+
370
+ return artifacts
371
+
372
+ def _is_id_column(self, df, column: str) -> bool:
373
+ """Heuristic to detect ID columns."""
374
+ col_lower = column.lower()
375
+ id_hints = ['id', 'key', 'uuid', 'guid', 'pk', '_id', 'identifier']
376
+ return any(hint in col_lower for hint in id_hints)
377
+
378
+ def _detect_sequential_gaps(self, values: List[str]) -> Optional[Artifact]:
379
+ """Detect sequential IDs with gaps indicating deletions."""
380
+ # Try to parse as integers
381
+ try:
382
+ ints = sorted([int(v) for v in values if v.isdigit()])
383
+ if len(ints) < 10:
384
+ return None
385
+
386
+ # Check for gaps
387
+ expected_count = ints[-1] - ints[0] + 1
388
+ actual_count = len(set(ints))
389
+ gap_count = expected_count - actual_count
390
+ gap_ratio = gap_count / expected_count if expected_count > 0 else 0
391
+
392
+ if gap_ratio > 0.05: # More than 5% missing
393
+ return Artifact(
394
+ artifact_type="sequential_id_gaps",
395
+ column=values[0] if values else "id",
396
+ evidence=f"Sequential IDs with {gap_ratio*100:.1f}% gaps ({gap_count} missing)",
397
+ confidence=0.85,
398
+ inferred_operation="FILTERING_OR_DELETION",
399
+ details={
400
+ "min_id": ints[0],
401
+ "max_id": ints[-1],
402
+ "expected": expected_count,
403
+ "actual": actual_count,
404
+ "gap_ratio": gap_ratio
405
+ }
406
+ )
407
+ except Exception:
408
+ pass
409
+
410
+ return None
411
+
412
+ def _detect_uuid_patterns(self, values: List[str]) -> Optional[Artifact]:
413
+ """Detect UUID version from patterns."""
414
+ uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-([0-9a-f])[0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
415
+
416
+ versions = []
417
+ for v in values[:100]: # Sample
418
+ match = uuid_pattern.match(v)
419
+ if match:
420
+ versions.append(match.group(1))
421
+
422
+ if len(versions) < len(values[:100]) * 0.5:
423
+ return None
424
+
425
+ version_counts = Counter(versions)
426
+ dominant = version_counts.most_common(1)[0]
427
+
428
+ version_meanings = {
429
+ '1': 'TIME_BASED_MAC', # Reveals generation time + machine
430
+ '2': 'DCE_SECURITY',
431
+ '3': 'MD5_HASH', # Deterministic from input
432
+ '4': 'RANDOM', # Crypto random
433
+ '5': 'SHA1_HASH', # Deterministic from input
434
+ '6': 'SORTABLE_TIME', # Modern time-sortable
435
+ '7': 'UNIX_TIME_RANDOM', # Time-ordered with randomness
436
+ }
437
+
438
+ return Artifact(
439
+ artifact_type="uuid_version",
440
+ column="id",
441
+ evidence=f"UUIDs are version {dominant[0]} ({version_meanings.get(dominant[0], 'UNKNOWN')})",
442
+ confidence=0.90,
443
+ inferred_operation=f"UUID_GENERATION_V{dominant[0]}",
444
+ details={
445
+ "version": dominant[0],
446
+ "meaning": version_meanings.get(dominant[0], 'unknown'),
447
+ "sample_count": len(versions)
448
+ }
449
+ )
450
+
451
+ def _detect_prefixes(self, values: List[str]) -> Optional[Artifact]:
452
+ """Detect common prefixes indicating source systems."""
453
+ if len(values) < 10:
454
+ return None
455
+
456
+ # Find common prefix
457
+ prefix_len = 0
458
+ for i in range(1, min(20, min(len(v) for v in values[:100]))):
459
+ prefixes = set(v[:i] for v in values[:100])
460
+ if len(prefixes) <= 3: # Allow up to 3 different prefixes
461
+ prefix_len = i
462
+ else:
463
+ break
464
+
465
+ if prefix_len >= 2:
466
+ prefixes = Counter(v[:prefix_len] for v in values)
467
+ top_prefixes = prefixes.most_common(3)
468
+
469
+ return Artifact(
470
+ artifact_type="id_prefix",
471
+ column="id",
472
+ evidence=f"IDs have systematic prefix: {top_prefixes}",
473
+ confidence=0.80,
474
+ inferred_operation="MULTI_SOURCE_MERGE" if len(top_prefixes) > 1 else "SOURCE_IDENTIFICATION",
475
+ details={
476
+ "prefixes": dict(top_prefixes),
477
+ "prefix_length": prefix_len
478
+ }
479
+ )
480
+
481
+ return None
482
+
483
+ def _detect_hash_patterns(self, values: List[str]) -> Optional[Artifact]:
484
+ """Detect if IDs look like hashes."""
485
+ hex_pattern = re.compile(r'^[0-9a-f]+$', re.I)
486
+
487
+ hex_lengths = []
488
+ for v in values[:100]:
489
+ if hex_pattern.match(v):
490
+ hex_lengths.append(len(v))
491
+
492
+ if len(hex_lengths) < len(values[:100]) * 0.8:
493
+ return None
494
+
495
+ # Check for consistent hash lengths
496
+ length_counts = Counter(hex_lengths)
497
+ dominant = length_counts.most_common(1)[0]
498
+
499
+ hash_types = {
500
+ 32: 'MD5',
501
+ 40: 'SHA1',
502
+ 64: 'SHA256',
503
+ 128: 'SHA512',
504
+ 16: 'SHORT_HASH',
505
+ }
506
+
507
+ if dominant[1] > len(hex_lengths) * 0.9:
508
+ hash_type = hash_types.get(dominant[0], f'{dominant[0]}-char hash')
509
+ return Artifact(
510
+ artifact_type="hash_id",
511
+ column="id",
512
+ evidence=f"IDs are {hash_type} hashes ({dominant[0]} hex chars)",
513
+ confidence=0.85,
514
+ inferred_operation=f"DETERMINISTIC_ID_GENERATION_{hash_type}",
515
+ details={
516
+ "hash_length": dominant[0],
517
+ "probable_algorithm": hash_type
518
+ }
519
+ )
520
+
521
+ return None
522
+
523
+
524
+ class TextArtifacts(ArtifactDetector):
525
+ """
526
+ Detect text processing artifacts.
527
+
528
+ Artifacts detected:
529
+ - Truncation (field length limits)
530
+ - Encoding issues (charset conversion)
531
+ - Case normalization
532
+ - Whitespace patterns
533
+ - Sanitization patterns
534
+ """
535
+
536
+ name = "text"
537
+
538
+ def detect(self, df, column: str) -> List[Artifact]:
539
+ artifacts = []
540
+
541
+ dtype = str(df[column].dtype)
542
+ if 'object' not in dtype and 'str' not in dtype:
543
+ return artifacts
544
+
545
+ try:
546
+ values = df[column].dropna().astype(str).tolist()
547
+ if len(values) < 5:
548
+ return artifacts
549
+
550
+ # Truncation
551
+ trunc = self._detect_truncation(values)
552
+ if trunc:
553
+ artifacts.append(trunc)
554
+
555
+ # Encoding issues
556
+ encoding = self._detect_encoding_artifacts(values)
557
+ if encoding:
558
+ artifacts.append(encoding)
559
+
560
+ # Case patterns
561
+ case = self._detect_case_patterns(values, column)
562
+ if case:
563
+ artifacts.append(case)
564
+
565
+ # Whitespace
566
+ ws = self._detect_whitespace_patterns(values)
567
+ if ws:
568
+ artifacts.append(ws)
569
+
570
+ except Exception:
571
+ pass
572
+
573
+ return artifacts
574
+
575
+ def _detect_truncation(self, values: List[str]) -> Optional[Artifact]:
576
+ """Detect truncation at specific lengths."""
577
+ lengths = [len(v) for v in values]
578
+ max_len = max(lengths)
579
+
580
+ # Count values at max length
581
+ at_max = sum(1 for l in lengths if l == max_len)
582
+
583
+ # If many values hit the max, likely truncation
584
+ if at_max > len(values) * 0.1 and max_len > 10:
585
+ # Check if values at max look truncated (end mid-word, etc.)
586
+ max_values = [v for v in values if len(v) == max_len]
587
+ truncated_looking = sum(1 for v in max_values if not v.endswith(('.', '!', '?', ' ')))
588
+
589
+ if truncated_looking > len(max_values) * 0.5:
590
+ return Artifact(
591
+ artifact_type="truncation",
592
+ column=str(values[0])[:20] if values else "text",
593
+ evidence=f"{at_max} values ({at_max/len(values)*100:.1f}%) truncated at {max_len} chars",
594
+ confidence=0.80,
595
+ inferred_operation=f"FIELD_LENGTH_LIMIT_{max_len}",
596
+ details={
597
+ "max_length": max_len,
598
+ "truncated_count": at_max,
599
+ "truncated_ratio": at_max / len(values)
600
+ }
601
+ )
602
+
603
+ return None
604
+
605
+ def _detect_encoding_artifacts(self, values: List[str]) -> Optional[Artifact]:
606
+ """Detect encoding/charset conversion issues."""
607
+ # Common mojibake patterns
608
+ mojibake_patterns = [
609
+ r'Ã©', # é misencoded
610
+ r'Ã¨', # è
611
+ r'Ã ', # à
612
+ r'â€™', # ' smart quote
613
+ r'â€\u201d', # em dash
614
+ r'Ã¶', # ö
615
+ r'Ã¼', # ü
616
+ r'ï»¿', # UTF-8 BOM leaked into text
617
+ r'\\x[0-9a-f]{2}', # Raw hex escapes
618
+ r'&amp;|&lt;|&gt;', # HTML entities
619
+ ]
620
+
621
+ issue_count = 0
622
+ patterns_found = set()
623
+
624
+ for v in values[:500]: # Sample
625
+ for pattern in mojibake_patterns:
626
+ if re.search(pattern, v):
627
+ issue_count += 1
628
+ patterns_found.add(pattern)
629
+ break
630
+
631
+ if issue_count > 5:
632
+ return Artifact(
633
+ artifact_type="encoding_artifact",
634
+ column="text",
635
+ evidence=f"{issue_count} values have encoding issues (patterns: {patterns_found})",
636
+ confidence=0.85,
637
+ inferred_operation="CHARSET_CONVERSION_ERROR",
638
+ details={
639
+ "issue_count": issue_count,
640
+ "patterns": list(patterns_found)
641
+ }
642
+ )
643
+
644
+ return None
645
+
646
+ def _detect_case_patterns(self, values: List[str], column: str) -> Optional[Artifact]:
647
+ """Detect case normalization."""
648
+ # Skip obviously non-text columns
649
+ sample = values[:100]
650
+
651
+ all_lower = all(v == v.lower() for v in sample if v.strip())
652
+ all_upper = all(v == v.upper() for v in sample if v.strip())
653
+
654
+ if all_lower:
655
+ return Artifact(
656
+ artifact_type="case_normalization",
657
+ column=column,
658
+ evidence="All values are lowercase",
659
+ confidence=0.90,
660
+ inferred_operation="LOWERCASE_NORMALIZATION",
661
+ details={"case": "lower"}
662
+ )
663
+ elif all_upper:
664
+ return Artifact(
665
+ artifact_type="case_normalization",
666
+ column=column,
667
+ evidence="All values are UPPERCASE",
668
+ confidence=0.90,
669
+ inferred_operation="UPPERCASE_NORMALIZATION",
670
+ details={"case": "upper"}
671
+ )
672
+
673
+ return None
674
+
675
+ def _detect_whitespace_patterns(self, values: List[str]) -> Optional[Artifact]:
676
+ """Detect whitespace handling patterns."""
677
+ # Check for leading/trailing whitespace
678
+ has_leading = sum(1 for v in values if v and v[0] == ' ')
679
+ has_trailing = sum(1 for v in values if v and v[-1] == ' ')
680
+
681
+ # No whitespace at all = trimmed
682
+ if has_leading == 0 and has_trailing == 0:
683
+ # Verify there's text that COULD have whitespace
684
+ has_spaces = sum(1 for v in values if ' ' in v.strip())
685
+ if has_spaces > len(values) * 0.3:
686
+ return Artifact(
687
+ artifact_type="whitespace_trimming",
688
+ column="text",
689
+ evidence="No leading/trailing whitespace (data was trimmed)",
690
+ confidence=0.70,
691
+ inferred_operation="WHITESPACE_TRIM",
692
+ details={"trimmed": True}
693
+ )
694
+
695
+ return None
696
+
697
+
698
+ class NumericArtifacts(ArtifactDetector):
699
+ """
700
+ Detect numeric processing artifacts.
701
+
702
+ Artifacts detected:
703
+ - Rounding patterns (precision limits)
704
+ - Outlier presence/absence (filtering)
705
+ - Distribution anomalies (sampling)
706
+ - Sentinel values (nulls represented as -1, 0, 9999)
707
+ """
708
+
709
+ name = "numeric"
710
+
711
+ def detect(self, df, column: str) -> List[Artifact]:
712
+ artifacts = []
713
+
714
+ # Check if numeric
715
+ try:
716
+ values = df[column].dropna()
717
+ if len(values) < 10:
718
+ return artifacts
719
+
720
+ # Try to get numeric values
721
+ numeric_values = values.astype(float).tolist()
722
+
723
+ # Rounding
724
+ rounding = self._detect_rounding(numeric_values, column)
725
+ if rounding:
726
+ artifacts.append(rounding)
727
+
728
+ # Sentinel values
729
+ sentinel = self._detect_sentinel_values(numeric_values, column)
730
+ if sentinel:
731
+ artifacts.append(sentinel)
732
+
733
+ # Distribution
734
+ dist = self._detect_distribution_artifacts(numeric_values, column)
735
+ if dist:
736
+ artifacts.append(dist)
737
+
738
+ except (ValueError, TypeError):
739
+ pass
740
+
741
+ return artifacts
742
+
743
+ def _detect_rounding(self, values: List[float], column: str) -> Optional[Artifact]:
744
+ """Detect systematic rounding."""
745
+ # Check decimal places
746
+ decimal_places = []
747
+ for v in values[:500]:
748
+ if v != int(v):
749
+ str_v = f"{v:.10f}".rstrip('0')
750
+ if '.' in str_v:
751
+ decimal_places.append(len(str_v.split('.')[1]))
752
+
753
+ if not decimal_places:
754
+ # All integers - check for rounding to 10, 100, etc.
755
+ int_values = [int(v) for v in values]
756
+
757
+ divisible_by_100 = sum(1 for v in int_values if v % 100 == 0)
758
+ divisible_by_10 = sum(1 for v in int_values if v % 10 == 0)
759
+
760
+ if divisible_by_100 > len(int_values) * 0.9:
761
+ return Artifact(
762
+ artifact_type="numeric_rounding",
763
+ column=column,
764
+ evidence="Values rounded to nearest 100",
765
+ confidence=0.85,
766
+ inferred_operation="ROUND_TO_100",
767
+ details={"rounding": 100}
768
+ )
769
+ elif divisible_by_10 > len(int_values) * 0.9:
770
+ return Artifact(
771
+ artifact_type="numeric_rounding",
772
+ column=column,
773
+ evidence="Values rounded to nearest 10",
774
+ confidence=0.80,
775
+ inferred_operation="ROUND_TO_10",
776
+ details={"rounding": 10}
777
+ )
778
+ else:
779
+ # Check for consistent decimal places
780
+ max_decimals = max(decimal_places)
781
+ at_max = sum(1 for d in decimal_places if d == max_decimals)
782
+
783
+ if at_max < len(decimal_places) * 0.3 and max_decimals <= 2:
784
+ return Artifact(
785
+ artifact_type="numeric_rounding",
786
+ column=column,
787
+ evidence=f"Values appear rounded to {max_decimals} decimal places",
788
+ confidence=0.75,
789
+ inferred_operation=f"ROUND_TO_{max_decimals}_DECIMALS",
790
+ details={"decimal_places": max_decimals}
791
+ )
792
+
793
+ return None
794
+
795
+ def _detect_sentinel_values(self, values: List[float], column: str) -> Optional[Artifact]:
796
+ """Detect sentinel values representing nulls."""
797
+ sentinels = [-1, -999, -9999, 0, 9999, 99999]
798
+
799
+ value_counts = Counter(values)
800
+
801
+ for sentinel in sentinels:
802
+ if sentinel in value_counts:
803
+ count = value_counts[sentinel]
804
+ if count > len(values) * 0.01: # More than 1%
805
+ return Artifact(
806
+ artifact_type="sentinel_value",
807
+ column=column,
808
+ evidence=f"{count} occurrences of {sentinel} (likely NULL sentinel)",
809
+ confidence=0.70,
810
+ inferred_operation=f"NULL_AS_{int(sentinel)}",
811
+ details={
812
+ "sentinel": sentinel,
813
+ "count": count,
814
+ "percentage": count / len(values) * 100
815
+ }
816
+ )
817
+
818
+ return None
819
+
820
+ def _detect_distribution_artifacts(self, values: List[float], column: str) -> Optional[Artifact]:
821
+ """Detect distribution anomalies suggesting filtering/sampling."""
822
+ if len(values) < 100:
823
+ return None
824
+
825
+ # Check for hard cutoffs
826
+ sorted_vals = sorted(values)
827
+ min_val, max_val = sorted_vals[0], sorted_vals[-1]
828
+
829
+ # Round number cutoffs suggest filtering
830
+ if max_val == int(max_val) and max_val % 10 == 0:
831
+ # Check if there's a cluster at the max
832
+ at_max = sum(1 for v in values if v == max_val)
833
+ if at_max > len(values) * 0.05:
834
+ return Artifact(
835
+ artifact_type="hard_cutoff",
836
+ column=column,
837
+ evidence=f"Hard cutoff at {max_val} ({at_max} values at limit)",
838
+ confidence=0.75,
839
+ inferred_operation=f"CAP_AT_{int(max_val)}",
840
+ details={
841
+ "cutoff": max_val,
842
+ "count_at_cutoff": at_max
843
+ }
844
+ )
845
+
846
+ return None
847
+
848
+
849
+ class NullPatternArtifacts(ArtifactDetector):
850
+ """
851
+ Detect null/missing value patterns.
852
+
853
+ Artifacts detected:
854
+ - Systematic nulls (default handling)
855
+ - Null correlations (conditional logic)
856
+ - Null rate anomalies (ETL errors)
857
+ """
858
+
859
+ name = "null_patterns"
860
+
861
+ def detect_all(self, df) -> List[Artifact]:
862
+ """Analyze null patterns across all columns."""
863
+ artifacts = []
864
+
865
+ # Overall null rates per column
866
+ null_rates = {}
867
+ for col in df.columns:
868
+ null_rate = df[col].isna().mean()
869
+ null_rates[col] = null_rate
870
+
871
+ # Detect anomalous null rates
872
+ rates = list(null_rates.values())
873
+ if len(rates) > 3:
874
+ mean_rate = statistics.mean(rates)
875
+
876
+ for col, rate in null_rates.items():
877
+ if rate > 0.5 and rate > mean_rate * 3:
878
+ artifacts.append(Artifact(
879
+ artifact_type="high_null_rate",
880
+ column=col,
881
+ evidence=f"{rate*100:.1f}% null (vs {mean_rate*100:.1f}% average)",
882
+ confidence=0.70,
883
+ inferred_operation="OPTIONAL_FIELD_OR_ETL_ERROR",
884
+ details={
885
+ "null_rate": rate,
886
+ "avg_null_rate": mean_rate
887
+ }
888
+ ))
889
+
890
+ # Detect columns that are null together (conditional logic)
891
+ # This is expensive so we sample
892
+ if len(df) > 100:
893
+ sample = df.sample(min(1000, len(df)))
894
+ else:
895
+ sample = df
896
+
897
+ correlated_nulls = []
898
+ cols = list(df.columns)
899
+ for i, col1 in enumerate(cols):
900
+ for col2 in cols[i+1:]:
901
+ both_null = (sample[col1].isna() & sample[col2].isna()).mean()
902
+ either_null = (sample[col1].isna() | sample[col2].isna()).mean()
903
+
904
+ if either_null > 0.1 and both_null / either_null > 0.8:
905
+ correlated_nulls.append((col1, col2, both_null))
906
+
907
+ if correlated_nulls:
908
+ artifacts.append(Artifact(
909
+ artifact_type="correlated_nulls",
910
+ column="multiple",
911
+ evidence=f"{len(correlated_nulls)} column pairs have correlated nulls",
912
+ confidence=0.75,
913
+ inferred_operation="CONDITIONAL_FIELD_POPULATION",
914
+ details={
915
+ "pairs": [(c1, c2) for c1, c2, _ in correlated_nulls[:5]]
916
+ }
917
+ ))
918
+
919
+ return artifacts
920
+
921
+ def detect(self, df, column: str) -> List[Artifact]:
922
+ """Null patterns are analyzed globally, not per-column."""
923
+ return []
924
+
925
+
926
+ class SchemaArtifacts(ArtifactDetector):
927
+ """
928
+ Detect schema-level artifacts.
929
+
930
+ Artifacts detected:
931
+ - Column naming conventions (framework hints)
932
+ - Data type patterns (database origin)
933
+ - Schema inconsistencies (merged sources)
934
+ """
935
+
936
+ name = "schema"
937
+
938
+ def detect_all(self, df) -> List[Artifact]:
939
+ """Analyze schema patterns."""
940
+ artifacts = []
941
+
942
+ columns = list(df.columns)
943
+
944
+ # Naming convention detection
945
+ conventions = self._detect_naming_conventions(columns)
946
+ if conventions:
947
+ artifacts.append(conventions)
948
+
949
+ # Framework fingerprints
950
+ framework = self._detect_framework_fingerprints(columns)
951
+ if framework:
952
+ artifacts.append(framework)
953
+
954
+ # Mixed conventions (merged sources)
955
+ mixed = self._detect_mixed_conventions(columns)
956
+ if mixed:
957
+ artifacts.append(mixed)
958
+
959
+ return artifacts
960
+
961
+ def detect(self, df, column: str) -> List[Artifact]:
962
+ """Schema patterns are analyzed globally."""
963
+ return []
964
+
965
+ def _detect_naming_conventions(self, columns: List[str]) -> Optional[Artifact]:
966
+ """Detect column naming convention."""
967
+ snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
968
+ camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
969
+ pascal_case = sum(1 for c in columns if re.match(r'^([A-Z][a-z]+)+$', c))
970
+
971
+ total = len(columns)
972
+
973
+ if snake_case > total * 0.7:
974
+ return Artifact(
975
+ artifact_type="naming_convention",
976
+ column="schema",
977
+ evidence=f"snake_case naming ({snake_case}/{total} columns)",
978
+ confidence=0.80,
979
+ inferred_operation="PYTHON_OR_SQL_ORIGIN",
980
+ details={"convention": "snake_case", "ratio": snake_case/total}
981
+ )
982
+ elif camel_case > total * 0.5:
983
+ return Artifact(
984
+ artifact_type="naming_convention",
985
+ column="schema",
986
+ evidence=f"camelCase naming ({camel_case}/{total} columns)",
987
+ confidence=0.80,
988
+ inferred_operation="JAVASCRIPT_OR_JAVA_ORIGIN",
989
+ details={"convention": "camelCase", "ratio": camel_case/total}
990
+ )
991
+ elif pascal_case > total * 0.5:
992
+ return Artifact(
993
+ artifact_type="naming_convention",
994
+ column="schema",
995
+ evidence=f"PascalCase naming ({pascal_case}/{total} columns)",
996
+ confidence=0.80,
997
+ inferred_operation="DOTNET_OR_JAVA_ORIGIN",
998
+ details={"convention": "PascalCase", "ratio": pascal_case/total}
999
+ )
1000
+
1001
+ return None
1002
+
1003
+ def _detect_framework_fingerprints(self, columns: List[str]) -> Optional[Artifact]:
1004
+ """Detect framework-specific column patterns."""
1005
+ col_lower = [c.lower() for c in columns]
1006
+
1007
+ # Django fingerprints
1008
+ if 'id' in col_lower and 'created_at' in col_lower:
1009
+ return Artifact(
1010
+ artifact_type="framework_fingerprint",
1011
+ column="schema",
1012
+ evidence="Django/Rails-style auto columns (id, created_at)",
1013
+ confidence=0.65,
1014
+ inferred_operation="ORM_GENERATED_SCHEMA",
1015
+ details={"framework_hints": ["django", "rails", "sqlalchemy"]}
1016
+ )
1017
+
1018
+ # Pandas export fingerprints
1019
+ if 'unnamed: 0' in col_lower or any('unnamed:' in c for c in col_lower):
1020
+ return Artifact(
1021
+ artifact_type="framework_fingerprint",
1022
+ column="schema",
1023
+ evidence="Pandas index column artifact (Unnamed: 0)",
1024
+ confidence=0.90,
1025
+ inferred_operation="PANDAS_CSV_EXPORT",
1026
+ details={"framework": "pandas"}
1027
+ )
1028
+
1029
+ # MongoDB fingerprints
1030
+ if '_id' in col_lower:
1031
+ return Artifact(
1032
+ artifact_type="framework_fingerprint",
1033
+ column="schema",
1034
+ evidence="MongoDB _id column present",
1035
+ confidence=0.85,
1036
+ inferred_operation="MONGODB_EXPORT",
1037
+ details={"framework": "mongodb"}
1038
+ )
1039
+
1040
+ return None
1041
+
1042
+ def _detect_mixed_conventions(self, columns: List[str]) -> Optional[Artifact]:
1043
+ """Detect mixed naming conventions suggesting merged sources."""
1044
+ snake_case = sum(1 for c in columns if '_' in c and c == c.lower())
1045
+ camel_case = sum(1 for c in columns if re.match(r'^[a-z]+([A-Z][a-z]+)+$', c))
1046
+
1047
+ total = len(columns)
1048
+
1049
+ # Both conventions present significantly
1050
+ if snake_case > total * 0.2 and camel_case > total * 0.2:
1051
+ return Artifact(
1052
+ artifact_type="mixed_conventions",
1053
+ column="schema",
1054
+ evidence=f"Mixed naming: {snake_case} snake_case, {camel_case} camelCase",
1055
+ confidence=0.75,
1056
+ inferred_operation="MERGED_SOURCES",
1057
+ details={
1058
+ "snake_case_count": snake_case,
1059
+ "camel_case_count": camel_case
1060
+ }
1061
+ )
1062
+
1063
+ return None
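The detectors above share one contract: detect_all(df) for table-wide analysis and detect(df, column) for per-column checks, both returning Artifact records. A minimal sketch of driving them from pandas follows; it assumes the detectors can be instantiated without arguments and that Artifact exposes the attributes passed to its constructor above, and the sample frame is made up:

    import pandas as pd
    from cascade.forensics.artifacts import NullPatterns, SchemaArtifacts

    # Hypothetical frame mixing snake_case and camelCase names plus sparse columns
    df = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "created_at": ["2024-01-01", "2024-01-02", None, None],
        "userName": ["a", None, None, None],
    })

    artifacts = []
    for detector in (NullPatterns(), SchemaArtifacts()):
        artifacts.extend(detector.detect_all(df))

    for a in artifacts:
        print(a.artifact_type, a.column, a.confidence, a.evidence)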
cascade/forensics/fingerprints.py ADDED
@@ -0,0 +1,328 @@
1
+ """
2
+ CASCADE Forensics - Technology Fingerprinting
3
+
4
+ Map detected artifacts to likely technologies and tools.
5
+ The artifacts are evidence. This module is the detective.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import List, Dict, Any, Set
10
+ from collections import defaultdict
11
+
12
+
13
+ @dataclass
14
+ class Fingerprint:
15
+ """A technology fingerprint - evidence pointing to specific tools."""
16
+ technology: str
17
+ category: str # database, framework, language, tool
18
+ confidence: float
19
+ evidence: List[str] = field(default_factory=list)
20
+
21
+ def to_dict(self) -> Dict[str, Any]:
22
+ return {
23
+ "technology": self.technology,
24
+ "category": self.category,
25
+ "confidence": self.confidence,
26
+ "evidence": self.evidence,
27
+ }
28
+
29
+
30
+ class TechFingerprinter:
31
+ """
32
+ Map artifact patterns to likely technologies.
33
+
34
+ This is pattern matching - certain artifact combinations
35
+ are strong indicators of specific tools.
36
+ """
37
+
38
+ # Artifact patterns -> technology mappings
39
+ PATTERNS = {
40
+ # Databases
41
+ "MONGODB_EXPORT": {
42
+ "technology": "MongoDB",
43
+ "category": "database",
44
+ "weight": 0.9,
45
+ },
46
+ "ORM_GENERATED_SCHEMA": {
47
+ "technology": "ORM (Django/Rails/SQLAlchemy)",
48
+ "category": "framework",
49
+ "weight": 0.7,
50
+ },
51
+ "PANDAS_CSV_EXPORT": {
52
+ "technology": "Pandas",
53
+ "category": "tool",
54
+ "weight": 0.95,
55
+ },
56
+
57
+ # Processing tools
58
+ "LOWERCASE_NORMALIZATION": {
59
+ "technology": "Text Preprocessing",
60
+ "category": "processing",
61
+ "weight": 0.6,
62
+ },
63
+ "WHITESPACE_TRIM": {
64
+ "technology": "String Cleaning",
65
+ "category": "processing",
66
+ "weight": 0.5,
67
+ },
68
+
69
+ # Batch processing
70
+ "BATCH_HOURLY": {
71
+ "technology": "Scheduled Batch Job (hourly)",
72
+ "category": "infrastructure",
73
+ "weight": 0.8,
74
+ },
75
+ "BATCH_15MIN": {
76
+ "technology": "Scheduled Batch Job (15min)",
77
+ "category": "infrastructure",
78
+ "weight": 0.8,
79
+ },
80
+ "BATCH_BURST_PROCESSING": {
81
+ "technology": "Event-Driven Batch Processing",
82
+ "category": "infrastructure",
83
+ "weight": 0.7,
84
+ },
85
+ "SCHEDULED_JOB": {
86
+ "technology": "Cron/Scheduler",
87
+ "category": "infrastructure",
88
+ "weight": 0.75,
89
+ },
90
+
91
+ # ID generation
92
+ "UUID_GENERATION_V4": {
93
+ "technology": "Cryptographic UUID Generator",
94
+ "category": "tool",
95
+ "weight": 0.8,
96
+ },
97
+ "UUID_GENERATION_V1": {
98
+ "technology": "Time-based UUID (leaks timestamp + MAC)",
99
+ "category": "tool",
100
+ "weight": 0.85,
101
+ },
102
+ "DETERMINISTIC_ID_GENERATION_SHA256": {
103
+ "technology": "Content-Addressed Storage",
104
+ "category": "architecture",
105
+ "weight": 0.8,
106
+ },
107
+ "DETERMINISTIC_ID_GENERATION_MD5": {
108
+ "technology": "MD5 Hash IDs (legacy system)",
109
+ "category": "architecture",
110
+ "weight": 0.8,
111
+ },
112
+
113
+ # Data quality
114
+ "FILTERING_OR_DELETION": {
115
+ "technology": "Record Filtering/Deletion Pipeline",
116
+ "category": "processing",
117
+ "weight": 0.7,
118
+ },
119
+ "CHARSET_CONVERSION_ERROR": {
120
+ "technology": "Encoding Mismatch (Latin-1 vs UTF-8)",
121
+ "category": "bug",
122
+ "weight": 0.85,
123
+ },
124
+
125
+ # Languages/frameworks
126
+ "PYTHON_OR_SQL_ORIGIN": {
127
+ "technology": "Python or SQL",
128
+ "category": "language",
129
+ "weight": 0.6,
130
+ },
131
+ "JAVASCRIPT_OR_JAVA_ORIGIN": {
132
+ "technology": "JavaScript or Java",
133
+ "category": "language",
134
+ "weight": 0.6,
135
+ },
136
+
137
+ # Source merging
138
+ "MERGED_SOURCES": {
139
+ "technology": "Multi-Source Data Integration",
140
+ "category": "architecture",
141
+ "weight": 0.8,
142
+ },
143
+ "MULTI_SOURCE_MERGE": {
144
+ "technology": "Multi-Source Data Integration",
145
+ "category": "architecture",
146
+ "weight": 0.85,
147
+ },
148
+ }
149
+
150
+ # Compound patterns - combinations that strengthen identification
151
+ COMPOUND_PATTERNS = [
152
+ {
153
+ "requires": ["PANDAS_CSV_EXPORT", "PYTHON_OR_SQL_ORIGIN"],
154
+ "suggests": Fingerprint("Pandas Data Pipeline", "tool", 0.95),
155
+ },
156
+ {
157
+ "requires": ["MONGODB_EXPORT", "JAVASCRIPT_OR_JAVA_ORIGIN"],
158
+ "suggests": Fingerprint("Node.js + MongoDB Stack", "stack", 0.85),
159
+ },
160
+ {
161
+ "requires": ["ORM_GENERATED_SCHEMA", "BATCH_HOURLY"],
162
+ "suggests": Fingerprint("Django/Rails Batch Worker", "stack", 0.80),
163
+ },
164
+ {
165
+ "requires": ["CHARSET_CONVERSION_ERROR", "MERGED_SOURCES"],
166
+ "suggests": Fingerprint("Legacy System Migration", "context", 0.85),
167
+ },
168
+ {
169
+ "requires": ["UUID_GENERATION_V1", "BATCH_BURST_PROCESSING"],
170
+ "suggests": Fingerprint("Distributed System (pre-2015 design)", "architecture", 0.75),
171
+ },
172
+ ]
173
+
174
+ def __init__(self):
175
+ self.fingerprints: List[Fingerprint] = []
176
+
177
+ def analyze(self, artifacts: List['Artifact']) -> List[Fingerprint]:
178
+ """
179
+ Analyze artifacts and return technology fingerprints.
180
+
181
+ Args:
182
+ artifacts: List of detected artifacts
183
+
184
+ Returns:
185
+ List of technology fingerprints sorted by confidence
186
+ """
187
+ self.fingerprints = []
188
+
189
+ # Get all inferred operations
190
+ operations = set(a.inferred_operation for a in artifacts)
191
+
192
+ # Match against patterns
193
+ tech_evidence = defaultdict(list)
194
+ tech_confidence = defaultdict(float)
195
+ tech_category = {}
196
+
197
+ for op in operations:
198
+ # Direct pattern match
199
+ if op in self.PATTERNS:
200
+ pattern = self.PATTERNS[op]
201
+ tech = pattern["technology"]
202
+ tech_evidence[tech].append(op)
203
+ tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"])
204
+ tech_category[tech] = pattern["category"]
205
+
206
+ # Partial match (for patterns with suffixes like SCHEDULED_JOB_24HR)
207
+ for pattern_name, pattern in self.PATTERNS.items():
208
+ if op != pattern_name and op.startswith(pattern_name + '_'):
209
+ tech = pattern["technology"]
210
+ if tech not in tech_evidence or op not in tech_evidence[tech]:
211
+ tech_evidence[tech].append(op)
212
+ tech_confidence[tech] = max(tech_confidence[tech], pattern["weight"] * 0.9)
213
+ tech_category[tech] = pattern["category"]
214
+
215
+ # Check compound patterns
216
+ for compound in self.COMPOUND_PATTERNS:
217
+ required = set(compound["requires"])
218
+ if required.issubset(operations):
219
+ fp = compound["suggests"]
220
+ tech_evidence[fp.technology].extend(list(required))
221
+ tech_confidence[fp.technology] = max(tech_confidence.get(fp.technology, 0), fp.confidence)
222
+ tech_category[fp.technology] = fp.category
223
+
224
+ # Build fingerprint objects
225
+ for tech, evidence in tech_evidence.items():
226
+ self.fingerprints.append(Fingerprint(
227
+ technology=tech,
228
+ category=tech_category.get(tech, "unknown"),
229
+ confidence=tech_confidence[tech],
230
+ evidence=list(set(evidence)),
231
+ ))
232
+
233
+ # Sort by confidence
234
+ self.fingerprints.sort(key=lambda f: f.confidence, reverse=True)
235
+
236
+ return self.fingerprints
237
+
238
+ def get_likely_stack(self) -> Dict[str, Any]:
239
+ """
240
+ Synthesize fingerprints into a likely technology stack.
241
+
242
+ Returns:
243
+ Dict describing the probable system architecture
244
+ """
245
+ if not self.fingerprints:
246
+ return {"stack": "Unknown", "components": []}
247
+
248
+ # Group by category
249
+ by_category = defaultdict(list)
250
+ for fp in self.fingerprints:
251
+ by_category[fp.category].append(fp)
252
+
253
+ stack = {
254
+ "database": None,
255
+ "framework": None,
256
+ "language": None,
257
+ "processing": [],
258
+ "infrastructure": [],
259
+ "architecture_notes": [],
260
+ }
261
+
262
+ # Pick highest confidence for single-value categories
263
+ for cat in ["database", "framework", "language"]:
264
+ if cat in by_category:
265
+ stack[cat] = by_category[cat][0].technology
266
+
267
+ # Aggregate list categories
268
+ for cat in ["processing", "infrastructure"]:
269
+ if cat in by_category:
270
+ stack[cat] = [fp.technology for fp in by_category[cat]]
271
+
272
+ # Architecture notes from high-confidence findings
273
+ if "architecture" in by_category:
274
+ stack["architecture_notes"] = [fp.technology for fp in by_category["architecture"]]
275
+
276
+ # Bugs/issues
277
+ if "bug" in by_category:
278
+ stack["issues"] = [fp.technology for fp in by_category["bug"]]
279
+
280
+ return stack
281
+
282
+ def get_security_concerns(self) -> List[Dict[str, Any]]:
283
+ """
284
+ Identify security-relevant findings.
285
+
286
+ Returns:
287
+ List of security concerns derived from fingerprints
288
+ """
289
+ concerns = []
290
+
291
+ for fp in self.fingerprints:
292
+ # UUID v1 leaks info
293
+ if "Time-based UUID" in fp.technology or "UUID_GENERATION_V1" in fp.evidence:
294
+ concerns.append({
295
+ "severity": "medium",
296
+ "issue": "UUID v1 leaks timestamp and MAC address",
297
+ "evidence": fp.evidence,
298
+ "recommendation": "Use UUID v4 for privacy",
299
+ })
300
+
301
+ # MD5 for IDs
302
+ if "MD5" in fp.technology:
303
+ concerns.append({
304
+ "severity": "low",
305
+ "issue": "MD5 used for ID generation (collision risk)",
306
+ "evidence": fp.evidence,
307
+ "recommendation": "Consider SHA-256 for content addressing",
308
+ })
309
+
310
+ # Encoding errors = data loss
311
+ if "Encoding" in fp.technology or "charset" in fp.technology.lower():
312
+ concerns.append({
313
+ "severity": "medium",
314
+ "issue": "Character encoding errors indicate data corruption",
315
+ "evidence": fp.evidence,
316
+ "recommendation": "Audit data pipeline for charset handling",
317
+ })
318
+
319
+ # Legacy patterns
320
+ if "legacy" in fp.technology.lower() or "pre-2015" in fp.technology.lower():
321
+ concerns.append({
322
+ "severity": "info",
323
+ "issue": "Legacy system patterns detected",
324
+ "evidence": fp.evidence,
325
+ "recommendation": "Review for technical debt",
326
+ })
327
+
328
+ return concerns
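Downstream, TechFingerprinter consumes only the inferred_operation labels carried by those artifacts. A hedged end-to-end sketch, assuming the Artifact constructor accepts the same keyword arguments the detectors use above:

    from cascade.forensics.artifacts import Artifact
    from cascade.forensics.fingerprints import TechFingerprinter

    artifacts = [
        Artifact(artifact_type="framework_fingerprint", column="schema",
                 evidence="Pandas index column artifact (Unnamed: 0)",
                 confidence=0.90, inferred_operation="PANDAS_CSV_EXPORT", details={}),
        Artifact(artifact_type="naming_convention", column="schema",
                 evidence="snake_case naming", confidence=0.80,
                 inferred_operation="PYTHON_OR_SQL_ORIGIN", details={}),
    ]

    fingerprinter = TechFingerprinter()
    for fp in fingerprinter.analyze(artifacts):
        print(f"{fp.technology:<30} {fp.category:<12} {fp.confidence:.2f}")

    print(fingerprinter.get_likely_stack())        # probable stack, grouped by category
    print(fingerprinter.get_security_concerns())   # UUID v1 / MD5 / encoding findings

Because both operations appear together, the PANDAS_CSV_EXPORT + PYTHON_OR_SQL_ORIGIN compound pattern also surfaces "Pandas Data Pipeline" at 0.95 confidence.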
cascade/genesis.py ADDED
@@ -0,0 +1,200 @@
1
+ """
2
+ CASCADE Genesis - The origin node of the neural internetwork.
3
+
4
+ Every chain begins here. Systems link to genesis (or to any
5
+ descendant of genesis) to join the lattice.
6
+
7
+ The chain IS the registry. No separate discovery needed.
8
+
9
+ Usage:
10
+ # Create genesis (done once, published to well-known location)
11
+ genesis = create_genesis()
12
+
13
+ # Any system joins by linking to genesis
14
+ my_chain.link_external(genesis.merkle_root)
15
+
16
+ # Or by linking to any existing node in the lattice
17
+ my_chain.link_external(some_other_chain.merkle_root)
18
+
19
+ # The lattice grows. Discovery = reading the chain.
20
+ """
21
+
22
+ import hashlib
23
+ import json
24
+ import time
25
+ from pathlib import Path
26
+ from typing import Optional, Dict, Any
27
+
28
+ from cascade.core.provenance import ProvenanceChain, ProvenanceRecord
29
+
30
+
31
+ # Well-known genesis identifiers
32
+ GENESIS_SESSION_ID = "genesis_0"
33
+ GENESIS_MODEL_ID = "cascade_genesis"
34
+ GENESIS_INPUT = "In the beginning was the hash, and the hash was with the chain, and the hash was the chain."
35
+
36
+
37
+ def create_genesis() -> ProvenanceChain:
38
+ """
39
+ Create the genesis chain - origin of the neural internetwork.
40
+
41
+ This is deterministic. Anyone running this gets the same genesis.
42
+ That's the point - it's the Schelling point for the lattice.
43
+ """
44
+ # Deterministic input hash
45
+ input_hash = hashlib.sha256(GENESIS_INPUT.encode()).hexdigest()[:16]
46
+
47
+ # Deterministic model hash (hash of the genesis concept itself)
48
+ model_hash = hashlib.sha256(b"cascade_neural_internetwork_v1").hexdigest()[:16]
49
+
50
+ chain = ProvenanceChain(
51
+ session_id=GENESIS_SESSION_ID,
52
+ model_id=GENESIS_MODEL_ID,
53
+ model_hash=model_hash,
54
+ input_hash=input_hash,
55
+ )
56
+
57
+ # The genesis record - the first node
58
+ # Its parent is itself (bootstrap)
59
+ genesis_record = ProvenanceRecord(
60
+ layer_name="genesis",
61
+ layer_idx=0,
62
+ state_hash=input_hash, # Self-referential
63
+ parent_hashes=[input_hash], # Points to itself
64
+ params_hash=model_hash,
65
+ shape=[1],
66
+ dtype="genesis",
67
+ stats={"created": time.time()},  # NOTE: assumes the merkle root ignores stats; a wall-clock value here would otherwise break the determinism promised above
68
+ execution_order=0,
69
+ )
70
+
71
+ chain.add_record(genesis_record)
72
+ chain.finalize()
73
+
74
+ return chain
75
+
76
+
77
+ def get_genesis_root() -> str:
78
+ """
79
+ Get the genesis merkle root.
80
+
81
+ This is a constant - the Schelling point.
82
+ Any system can compute it and know they're linking to the same origin.
83
+ """
84
+ return create_genesis().merkle_root
85
+
86
+
87
+ def save_genesis(path: Path) -> str:
88
+ """
89
+ Save genesis chain to file.
90
+
91
+ This file can be published to a well-known location
92
+ (HuggingFace dataset, IPFS, etc.)
93
+ """
94
+ genesis = create_genesis()
95
+
96
+ with open(path, 'w') as f:
97
+ json.dump(genesis.to_dict(), f, indent=2)
98
+
99
+ return genesis.merkle_root
100
+
101
+
102
+ def load_genesis(path: Path) -> ProvenanceChain:
103
+ """Load genesis from file and verify it's authentic."""
104
+ with open(path, 'r') as f:
105
+ data = json.load(f)
106
+
107
+ chain = ProvenanceChain.from_dict(data)
108
+
109
+ # Verify this is actually genesis
110
+ expected_root = get_genesis_root()
111
+ if chain.merkle_root != expected_root:
112
+ raise ValueError(
113
+ f"Invalid genesis: root {chain.merkle_root} != expected {expected_root}"
114
+ )
115
+
116
+ return chain
117
+
118
+
119
+ def link_to_genesis(chain: ProvenanceChain) -> None:
120
+ """
121
+ Link a chain to genesis, joining the neural internetwork.
122
+
123
+ This is the simplest way to join - link directly to the origin.
124
+ Alternatively, link to any other chain that traces back to genesis.
125
+ """
126
+ chain.link_external(get_genesis_root(), source_id="genesis")
127
+
128
+
129
+ def verify_lineage_to_genesis(chain: ProvenanceChain, known_chains: Dict[str, ProvenanceChain]) -> bool:
130
+ """
131
+ Verify that a chain traces back to genesis through external_roots.
132
+
133
+ Args:
134
+ chain: The chain to verify
135
+ known_chains: Dict mapping merkle_root -> chain for lookup
136
+
137
+ Returns:
138
+ True if chain traces to genesis, False otherwise
139
+ """
140
+ genesis_root = get_genesis_root()
141
+ visited = set()
142
+
143
+ def trace(root: str) -> bool:
144
+ if root in visited:
145
+ return False
146
+ visited.add(root)
147
+
148
+ # Found genesis!
149
+ if root == genesis_root:
150
+ return True
151
+
152
+ # Look up this chain
153
+ if root not in known_chains:
154
+ return False # Can't verify - chain not known
155
+
156
+ c = known_chains[root]
157
+
158
+ # Check if any external root leads to genesis
159
+ for ext_root in c.external_roots:
160
+ if trace(ext_root):
161
+ return True
162
+
163
+ return False
164
+
165
+ # Start from the chain's own root
166
+ return trace(chain.merkle_root) or any(trace(r) for r in chain.external_roots)
167
+
168
+
169
+ # =============================================================================
170
+ # CLI for genesis operations
171
+ # =============================================================================
172
+
173
+ if __name__ == "__main__":
174
+ import sys
175
+
176
+ genesis = create_genesis()
177
+
178
+ print("=" * 60)
179
+ print("CASCADE GENESIS")
180
+ print("=" * 60)
181
+ print(f"Merkle Root: {genesis.merkle_root}")
182
+ print(f"Session ID: {genesis.session_id}")
183
+ print(f"Model ID: {genesis.model_id}")
184
+ print(f"Input Hash: {genesis.input_hash}")
185
+ print("=" * 60)
186
+ print()
187
+ print("This is the origin of the neural internetwork.")
188
+ print("Any system can link to this root to join the lattice.")
189
+ print()
190
+ print("To join:")
191
+ print(" from cascade.genesis import get_genesis_root")
192
+ print(" my_chain.link_external(get_genesis_root())")
193
+ print()
194
+
195
+ # Save if requested
196
+ if len(sys.argv) > 1 and sys.argv[1] == "--save":
197
+ out_path = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("genesis.json")
198
+ root = save_genesis(out_path)
199
+ print(f"Genesis saved to: {out_path}")
200
+ print(f"Root: {root}")
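A short sketch of the join-and-verify flow, using only the helpers defined in this file. The second chain's identifiers are placeholders, its construction mirrors create_genesis above, and it assumes link_external records the root in external_roots, as verify_lineage_to_genesis expects:

    from cascade.genesis import (
        create_genesis, get_genesis_root, link_to_genesis, verify_lineage_to_genesis,
    )
    from cascade.core.provenance import ProvenanceChain

    genesis = create_genesis()

    # Hypothetical chain for some model run
    my_chain = ProvenanceChain(
        session_id="run_42",
        model_id="my_model",
        model_hash="deadbeefdeadbeef",
        input_hash="cafebabecafebabe",
    )
    link_to_genesis(my_chain)   # adds get_genesis_root() to my_chain.external_roots

    known = {genesis.merkle_root: genesis}
    print(verify_lineage_to_genesis(my_chain, known))   # True once the link is in place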
cascade/hold/__init__.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ ╔═══════════════════════════════════════════════════════════════════════════════╗
3
+ ║ ║
4
+ ║ ██╗ ██╗ ██████╗ ██╗ ██████╗ ║
5
+ ║ ██║ ██║██╔═══██╗██║ ██╔══██╗ ║
6
+ ║ ███████║██║ ██║██║ ██║ ██║ ║
7
+ ║ ██╔══██║██║ ██║██║ ██║ ██║ ║
8
+ ║ ██║ ██║╚██████╔╝███████╗██████╔╝ ║
9
+ ║ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝╚═════╝ ║
10
+ ║ ║
11
+ ║ Inference-Level Halt Protocol for CASCADE-LATTICE ║
12
+ ║ ║
13
+ ║ "Pause the machine. See what it sees. Choose what it chooses." ║
14
+ ║ ║
15
+ ╚═══════════════════════════════════════════════════════════════════════════════╝
16
+
17
+ HOLD is MODEL-AGNOSTIC. Works with ANY framework:
18
+ - PyTorch, JAX, TensorFlow, scikit-learn
19
+ - Hugging Face, OpenAI API, Anthropic API
20
+ - Stable Baselines3, RLlib, custom RL
21
+ - Any function that outputs probabilities
22
+
23
+ USAGE:
24
+ >>> from cascade.hold import Hold
25
+ >>>
26
+ >>> # Your model (any framework)
27
+ >>> probs = your_model.predict(obs)
28
+ >>>
29
+ >>> # HOLD at decision point
30
+ >>> hold = Hold.get()
31
+ >>> resolution = hold.yield_point(
32
+ ... action_probs=probs,
33
+ ... value=value_estimate,
34
+ ... observation=obs,
35
+ ... brain_id="my_model",
36
+ ... # Optional informational wealth:
37
+ ... action_labels=["up", "down", "left", "right"],
38
+ ... latent=model.get_latent(),
39
+ ... attention=model.get_attention(),
40
+ ... features=model.get_features(),
41
+ ... imagination=model.imagine_futures(),
42
+ ... )
43
+ >>>
44
+ >>> # Use resolved action
45
+ >>> action = resolution.action
46
+ >>> was_override = resolution.was_override
47
+
48
+ CLI:
49
+ $ cascade hold # Start HOLD interface
50
+ $ cascade hold-status # Show HOLD system status
51
+ """
52
+
53
+ # Primitives - the core API
54
+ from cascade.hold.primitives import (
55
+ HoldState,
56
+ HoldPoint,
57
+ HoldResolution,
58
+ Hold,
59
+ HoldAwareMixin,
60
+ )
61
+
62
+ # Session Layer - arcade-style history and time travel
63
+ from cascade.hold.session import (
64
+ InferenceStep,
65
+ HoldSession,
66
+ ArcadeFeedback,
67
+ CausationHold,
68
+ )
69
+
70
+ __all__ = [
71
+ # Primitives
72
+ "HoldState",
73
+ "HoldPoint",
74
+ "HoldResolution",
75
+ "Hold",
76
+ "HoldAwareMixin",
77
+ # Session
78
+ "InferenceStep",
79
+ "HoldSession",
80
+ "ArcadeFeedback",
81
+ "CausationHold",
82
+ ]
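For non-interactive runs (CI, batch evaluation) the same yield_point call can be made without pausing by enabling auto_accept: hold points are still observed into CASCADE, but inference never blocks. A minimal sketch; the probabilities and labels are placeholders:

    import numpy as np
    from cascade.hold import Hold

    hold = Hold.get()
    hold.auto_accept = True   # record hold points, never wait for input

    probs = np.array([0.1, 0.7, 0.2])
    resolution = hold.yield_point(
        action_probs=probs,
        value=0.42,
        observation={"step": 0},
        brain_id="offline_eval",
        action_labels=["left", "stay", "right"],
    )
    assert resolution.action == int(np.argmax(probs))
    assert resolution.was_override is False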
cascade/hold/primitives.py ADDED
@@ -0,0 +1,673 @@
1
+ """
2
+ HOLD Primitives - Core Data Structures and Singleton
3
+ ═══════════════════════════════════════════════════════════
4
+
5
+ The primitive layer of HOLD:
6
+ - HoldPoint: A frozen moment in inference
7
+ - HoldResolution: The outcome of a hold
8
+ - Hold: Singleton system managing inference-level halts
9
+
10
+ HOLD is a CASCADE-LATTICE primitive.
11
+ No cascade = No HOLD.
12
+ """
13
+
14
+ import time
15
+ import hashlib
16
+ import threading
17
+ from typing import Dict, Any, Optional, Callable, List
18
+ from dataclasses import dataclass, field
19
+ from enum import Enum
20
+ import numpy as np
21
+
22
+ # CASCADE-LATTICE is REQUIRED
23
+ try:
24
+ from cascade import sdk_observe
25
+ from cascade.core.event import CausationLink
26
+ from cascade.core.graph import CausationGraph
27
+ HAS_CASCADE = True
28
+ except ImportError:
29
+ HAS_CASCADE = False
30
+ # Stubs for when imported standalone (testing)
31
+ def sdk_observe(*args, **kwargs): pass
32
+ class CausationLink:
33
+ def __init__(self, **kwargs): pass
34
+ class CausationGraph:
35
+ def add_link(self, link): pass
36
+
37
+
38
+ class HoldState(Enum):
39
+ """State of a hold point."""
40
+ PENDING = "pending" # Waiting for resolution
41
+ ACCEPTED = "accepted" # AI choice was accepted
42
+ OVERRIDDEN = "overridden" # Human override
43
+ TIMEOUT = "timeout" # Timed out, fell back to AI
44
+ CANCELLED = "cancelled" # Hold was cancelled
45
+
46
+
47
+ def _sanitize(data: Any) -> Any:
48
+ """Recursively convert numpy types to python types."""
49
+ if isinstance(data, dict):
50
+ return {k: _sanitize(v) for k, v in data.items()}
51
+ elif isinstance(data, (list, tuple)):
52
+ return [_sanitize(x) for x in data]
53
+ elif isinstance(data, np.ndarray):
+ return data.tolist()
+ elif isinstance(data, np.generic):
+ return data.item()
55
+ return data
56
+
57
+
58
+ @dataclass
59
+ class HoldPoint:
60
+ """
61
+ A decision point where inference yields for potential human intervention.
62
+
63
+ This is the "freeze frame" - the moment before commitment.
64
+ The decision matrix is exposed, the merkle chain awaits.
65
+
66
+ INFORMATIONAL WEALTH - everything a human needs to understand the decision:
67
+ - action_labels: What each action means ("FORWARD", "ATTACK", etc.)
68
+ - latent: The model's internal representation (for inspection)
69
+ - attention: What the model is attending to
70
+ - features: Extracted feature activations
71
+ - imagination: Per-action trajectory predictions and expected values
72
+ - logits: Raw logits before softmax (for temperature analysis)
73
+ - reasoning: Text explanations if available
74
+ """
75
+ # Decision matrix
76
+ action_probs: np.ndarray # The probability distribution
77
+ value: float # Predicted value
78
+
79
+ # Context
80
+ observation: Dict[str, Any] # What the brain saw
81
+ brain_id: str # Which brain is holding
82
+
83
+ # === INFORMATIONAL WEALTH ===
84
+
85
+ # Action labels - CRITICAL for human understanding
86
+ action_labels: Optional[List[str]] = None # ["NOOP", "FORWARD", "BACK", ...]
87
+
88
+ # Internal state
89
+ latent: Optional[np.ndarray] = None # Latent activations (any shape)
90
+ attention: Optional[Dict[str, float]] = None # {"position": 0.7, "health": 0.3, ...}
91
+ features: Optional[Dict[str, float]] = None # {"spatial_attn": 0.8, "danger": 0.2, ...}
92
+
93
+ # Per-action deep data
94
+ imagination: Optional[Dict[int, Dict]] = None # {0: {"trajectory": [...], "expected_value": 0.5}, ...}
95
+
96
+ # Logits (pre-softmax)
97
+ logits: Optional[np.ndarray] = None # Raw logits for each action
98
+
99
+ # Reasoning chain (if model provides explanations)
100
+ reasoning: Optional[List[str]] = None # ["High reward expected", "Low risk path", ...]
101
+
102
+ # World model predictions (if available)
103
+ world_prediction: Optional[Dict[str, Any]] = None # {"pos_delta": [1,0,0], "health_delta": -2, ...}
104
+
105
+ # === END WEALTH ===
106
+
107
+ # Identity
108
+ id: str = field(default_factory=lambda: hashlib.sha256(str(time.time()).encode()).hexdigest()[:16])
109
+ timestamp: float = field(default_factory=time.time)
110
+
111
+ # Merkle linkage
112
+ parent_merkle: Optional[str] = None # Previous hold point
113
+ merkle_root: Optional[str] = None # Computed on creation
114
+
115
+ # State
116
+ state: HoldState = HoldState.PENDING
117
+
118
+ def __post_init__(self):
119
+ """Compute merkle root on creation."""
120
+ if self.merkle_root is None:
121
+ data = f"{self.id}:{self.brain_id}:{self.action_probs.tobytes().hex()}:{self.timestamp}"
122
+ if self.parent_merkle:
123
+ data = f"{self.parent_merkle}:{data}"
124
+ self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]
125
+
126
+ @property
127
+ def ai_choice(self) -> int:
128
+ """What the AI would choose."""
129
+ return int(np.argmax(self.action_probs))
130
+
131
+ @property
132
+ def ai_confidence(self) -> float:
133
+ """Confidence in AI's top choice."""
134
+ return float(np.max(self.action_probs))
135
+
136
+ def to_dict(self) -> Dict[str, Any]:
137
+ """Serialize for CASCADE observation - includes full informational wealth."""
138
+ d = {
139
+ 'id': self.id,
140
+ 'brain_id': self.brain_id,
141
+ 'action_probs': self.action_probs.tolist(),
142
+ 'ai_choice': self.ai_choice,
143
+ 'ai_confidence': self.ai_confidence,
144
+ 'value': self.value,
145
+ 'timestamp': self.timestamp,
146
+ 'merkle_root': self.merkle_root,
147
+ 'parent_merkle': self.parent_merkle,
148
+ 'state': self.state.value,
149
+ 'observation': self.observation,
150
+ }
151
+
152
+ # Include all available wealth
153
+ if self.action_labels is not None:
154
+ d['action_labels'] = self.action_labels
155
+ if self.latent is not None:
156
+ d['latent'] = self.latent.tolist() if hasattr(self.latent, 'tolist') else self.latent
157
+ if self.attention is not None:
158
+ d['attention'] = self.attention
159
+ if self.features is not None:
160
+ d['features'] = self.features
161
+ if self.imagination is not None:
162
+ d['imagination'] = self.imagination
163
+ if self.logits is not None:
164
+ d['logits'] = self.logits.tolist() if hasattr(self.logits, 'tolist') else self.logits
165
+ if self.reasoning is not None:
166
+ d['reasoning'] = self.reasoning
167
+ if self.world_prediction is not None:
168
+ d['world_prediction'] = self.world_prediction
169
+
170
+ return _sanitize(d)
171
+
172
+
173
+ @dataclass
174
+ class HoldResolution:
175
+ """
176
+ The resolution of a hold point.
177
+
178
+ Either the human accepted, overrode, or it timed out.
179
+ Links back to the hold point, forming a provenance chain.
180
+ """
181
+ hold_point: HoldPoint # The hold that was resolved
182
+ action: int # Final action taken
183
+
184
+ # Resolution details
185
+ was_override: bool # True if human overrode AI
186
+ override_source: Optional[str] = None # Who/what overrode ("human", "policy", etc.)
187
+
188
+ # Timing
189
+ hold_duration: float = 0.0 # How long was held
190
+ timestamp: float = field(default_factory=time.time)
191
+
192
+ # Merkle linkage
193
+ merkle_root: Optional[str] = None
194
+
195
+ def __post_init__(self):
196
+ """Compute merkle root."""
197
+ if self.merkle_root is None:
198
+ data = f"{self.hold_point.merkle_root}:{self.action}:{self.was_override}:{self.timestamp}"
199
+ self.merkle_root = hashlib.sha256(data.encode()).hexdigest()[:16]
200
+
201
+ def to_dict(self) -> Dict[str, Any]:
202
+ """Serialize for CASCADE observation."""
203
+ d = {
204
+ 'hold_id': self.hold_point.id,
205
+ 'hold_merkle': self.hold_point.merkle_root,
206
+ 'action': self.action,
207
+ 'ai_choice': self.hold_point.ai_choice,
208
+ 'was_override': self.was_override,
209
+ 'override_source': self.override_source,
210
+ 'hold_duration': self.hold_duration,
211
+ 'merkle_root': self.merkle_root,
212
+ 'timestamp': self.timestamp,
213
+ }
214
+ return _sanitize(d)
215
+
216
+
217
+ class Hold:
218
+ """
219
+ The HOLD system - manages inference-level halts.
220
+
221
+ Singleton pattern - one Hold system per process.
222
+
223
+ Usage:
224
+ hold = Hold.get()
225
+
226
+ # Register listeners (for UI, visualization, etc.)
227
+ hold.register_listener(my_callback)
228
+
229
+ # From within a brain's forward() method:
230
+ resolution = hold.yield_point(
231
+ action_probs=probs,
232
+ value=value,
233
+ observation=obs,
234
+ brain_id="brain_001"
235
+ )
236
+ # Blocks until resolution!
237
+
238
+ # From UI/control thread:
239
+ hold.accept() # or
240
+ hold.override(action=3, source="human")
241
+ """
242
+
243
+ _instance = None
244
+ _lock = threading.Lock()
245
+
246
+ def __new__(cls):
247
+ if cls._instance is None:
248
+ with cls._lock:
249
+ if cls._instance is None:
250
+ cls._instance = super().__new__(cls)
251
+ cls._instance._initialized = False
252
+ return cls._instance
253
+
254
+ def __init__(self):
255
+ if self._initialized:
256
+ return
257
+
258
+ # State
259
+ self._current_hold: Optional[HoldPoint] = None
260
+ self._resolution_event = threading.Event()
261
+ self._resolution: Optional[HoldResolution] = None
262
+
263
+ # Chain
264
+ self._last_merkle: Optional[str] = None
265
+ self._hold_count = 0
266
+ self._override_count = 0
267
+
268
+ # Callbacks - interfaces register here to receive hold points
269
+ self._listeners: List[Callable[[HoldPoint], None]] = []
270
+
271
+ # Settings
272
+ self.timeout: float = 30.0 # Default timeout (seconds)
273
+ self.auto_accept: bool = False # If True, don't block, just observe
274
+
275
+ # CASCADE graph for this session
276
+ self._causation_graph = CausationGraph()
277
+
278
+ self._initialized = True
279
+ print("[HOLD] system initialized (cascade-lattice)")
280
+
281
+ @classmethod
282
+ def get(cls) -> 'Hold':
283
+ """Get the singleton instance."""
284
+ return cls()
285
+
286
+ def register_listener(self, callback: Callable[[HoldPoint], None]):
287
+ """
288
+ Register a listener for hold points.
289
+
290
+ The callback receives HoldPoint when inference halts.
291
+ Use this to connect visualizations, UIs, etc.
292
+ """
293
+ self._listeners.append(callback)
294
+ print(f"[REGISTER] Registered HOLD listener: {callback.__name__ if hasattr(callback, '__name__') else callback}")
295
+
296
+ def unregister_listener(self, callback: Callable):
297
+ """Remove a listener."""
298
+ if callback in self._listeners:
299
+ self._listeners.remove(callback)
300
+
301
+ def yield_point(
302
+ self,
303
+ action_probs: np.ndarray,
304
+ value: float,
305
+ observation: Dict[str, Any],
306
+ brain_id: str,
307
+ # === INFORMATIONAL WEALTH ===
308
+ action_labels: Optional[List[str]] = None,
309
+ latent: Optional[np.ndarray] = None,
310
+ attention: Optional[Dict[str, float]] = None,
311
+ features: Optional[Dict[str, float]] = None,
312
+ imagination: Optional[Dict[int, Dict]] = None,
313
+ logits: Optional[np.ndarray] = None,
314
+ reasoning: Optional[List[str]] = None,
315
+ world_prediction: Optional[Dict[str, Any]] = None,
316
+ # === END WEALTH ===
317
+ blocking: bool = True,
318
+ ) -> HoldResolution:
319
+ """
320
+ Create a hold point and yield for resolution.
321
+
322
+ This is called from within a brain's forward() method.
323
+ Blocks until resolved (or timeout).
324
+
325
+ Args:
326
+ action_probs: The decision matrix (probability distribution)
327
+ value: Predicted value
328
+ observation: What the brain observed
329
+ brain_id: Identifier for the brain
330
+
331
+ INFORMATIONAL WEALTH (all optional, but improves human understanding):
332
+ action_labels: Names for each action ["FORWARD", "BACK", "LEFT", ...]
333
+ latent: Model's latent state/activations
334
+ attention: Attention weights {"position": 0.7, "health": 0.3}
335
+ features: Feature activations {"spatial": 0.8, "danger": 0.2}
336
+ imagination: Per-action predictions {0: {"trajectory": [...], "expected_value": 0.5}}
337
+ logits: Raw pre-softmax logits
338
+ reasoning: Text explanations ["High reward expected", ...]
339
+ world_prediction: World model predictions {"pos_delta": [1,0,0]}
340
+
341
+ blocking: If False, returns immediately with AI choice
342
+
343
+ Returns:
344
+ HoldResolution with the final action
345
+ """
346
+ # Create hold point with full wealth
347
+ hold = HoldPoint(
348
+ action_probs=action_probs,
349
+ value=value,
350
+ observation=observation,
351
+ brain_id=brain_id,
352
+ action_labels=action_labels,
353
+ latent=latent,
354
+ attention=attention,
355
+ features=features,
356
+ imagination=imagination,
357
+ logits=logits,
358
+ reasoning=reasoning,
359
+ world_prediction=world_prediction,
360
+ parent_merkle=self._last_merkle,
361
+ )
362
+
363
+ # Observe the hold point in CASCADE
364
+ sdk_observe(
365
+ model_id=brain_id,
366
+ input_data=observation,
367
+ output_data={**hold.to_dict(), 'event_type': 'hold_point'},
368
+ )
369
+
370
+ self._hold_count += 1
371
+
372
+ # Non-blocking mode - just observe and return AI choice
373
+ if not blocking or self.auto_accept:
374
+ resolution = HoldResolution(
375
+ hold_point=hold,
376
+ action=hold.ai_choice,
377
+ was_override=False,
378
+ hold_duration=0.0,
379
+ )
380
+ self._observe_resolution(resolution)
381
+ return resolution
382
+
383
+ # Set as current hold
384
+ self._current_hold = hold
385
+ self._resolution_event.clear()
386
+ self._resolution = None
387
+
388
+ # Notify listeners
389
+ for listener in self._listeners:
390
+ try:
391
+ listener(hold)
392
+ except Exception as e:
393
+ print(f"⚠️ HOLD listener error: {e}")
394
+
395
+ # Print hold info
396
+ print(f"\n{'═' * 50}")
397
+ print(f"🛑 HOLD #{self._hold_count}")
398
+ print(f" Merkle: {hold.merkle_root}")
399
+ ai_label = hold.action_labels[hold.ai_choice] if hold.action_labels else str(hold.ai_choice)
400
+ print(f" AI Choice: {ai_label} (confidence: {hold.ai_confidence:.2%})")
401
+ print(f" Value: {hold.value:.4f}")
402
+
403
+ # Show probabilities with labels
404
+ if hold.action_labels:
405
+ prob_str = ', '.join(f'{hold.action_labels[i]}:{p:.2f}' for i, p in enumerate(hold.action_probs))
406
+ else:
407
+ prob_str = ', '.join(f'{i}:{p:.2f}' for i, p in enumerate(hold.action_probs))
408
+ print(f" Probabilities: {prob_str}")
409
+
410
+ # Show available wealth
411
+ wealth = []
412
+ if hold.latent is not None: wealth.append("latent")
413
+ if hold.attention is not None: wealth.append("attention")
414
+ if hold.features is not None: wealth.append("features")
415
+ if hold.imagination is not None: wealth.append("imagination")
416
+ if hold.reasoning is not None: wealth.append("reasoning")
417
+ if wealth:
418
+ print(f" Wealth: {', '.join(wealth)}")
419
+
420
+ print(f" Waiting for resolution (timeout: {self.timeout}s)...")
421
+ print(f"{'═' * 50}")
422
+
423
+ # Block until resolution or timeout
424
+ start_time = time.time()
425
+ resolved = self._resolution_event.wait(timeout=self.timeout)
426
+ hold_duration = time.time() - start_time
427
+
428
+ if resolved and self._resolution:
429
+ resolution = self._resolution
430
+ resolution.hold_duration = hold_duration
431
+ else:
432
+ # Timeout - use AI choice
433
+ hold.state = HoldState.TIMEOUT
434
+ resolution = HoldResolution(
435
+ hold_point=hold,
436
+ action=hold.ai_choice,
437
+ was_override=False,
438
+ override_source="timeout",
439
+ hold_duration=hold_duration,
440
+ )
441
+ print(f"[TIMEOUT] HOLD timeout - accepting AI choice: {hold.ai_choice}")
442
+
443
+ # Observe resolution
444
+ self._observe_resolution(resolution)
445
+
446
+ # Clear state
447
+ self._current_hold = None
448
+ self._resolution = None
449
+
450
+ return resolution
451
+
452
+ def resolve(self, action: int, source: str = "human"):
453
+ """
454
+ Resolve the current hold with an action.
455
+
456
+ Called by UI/interface when human makes a choice.
457
+
458
+ Args:
459
+ action: The chosen action
460
+ source: Who resolved it ("human", "policy", etc.)
461
+ """
462
+ if self._current_hold is None:
463
+ print("[WARN] No active hold to resolve")
464
+ return
465
+
466
+ hold = self._current_hold
467
+ was_override = (action != hold.ai_choice)
468
+
469
+ if was_override:
470
+ hold.state = HoldState.OVERRIDDEN
471
+ self._override_count += 1
472
+ else:
473
+ hold.state = HoldState.ACCEPTED
474
+
475
+ self._resolution = HoldResolution(
476
+ hold_point=hold,
477
+ action=action,
478
+ was_override=was_override,
479
+ override_source=source if was_override else None,
480
+ )
481
+
482
+ print(f"[RESOLVE] HOLD resolved: action={action}, override={was_override}")
483
+ self._resolution_event.set()
484
+
485
+ def accept(self):
486
+ """Accept AI's choice for current hold."""
487
+ if self._current_hold:
488
+ self.resolve(self._current_hold.ai_choice, source="accept")
489
+
490
+ def override(self, action: int, source: str = "human"):
491
+ """Override with a different action."""
492
+ self.resolve(action, source)
493
+
494
+ def cancel(self):
495
+ """Cancel current hold without resolution."""
496
+ if self._current_hold:
497
+ self._current_hold.state = HoldState.CANCELLED
498
+ self._resolution = HoldResolution(
499
+ hold_point=self._current_hold,
500
+ action=self._current_hold.ai_choice,
501
+ was_override=False,
502
+ override_source="cancelled",
503
+ )
504
+ self._resolution_event.set()
505
+
506
+ def _observe_resolution(self, resolution: HoldResolution):
507
+ """Record resolution to CASCADE."""
508
+ sdk_observe(
509
+ model_id=resolution.hold_point.brain_id,
510
+ input_data=resolution.hold_point.to_dict(),
511
+ output_data={**resolution.to_dict(), 'event_type': 'hold_resolution'},
512
+ )
513
+
514
+ # Update chain
515
+ self._last_merkle = resolution.merkle_root
516
+
517
+ # Add to causation graph
518
+ link = CausationLink(
519
+ from_event=resolution.hold_point.merkle_root,
520
+ to_event=resolution.merkle_root,
521
+ causation_type="hold_resolved",
522
+ strength=1.0 if resolution.was_override else 0.5,
523
+ explanation=f"Override: {resolution.was_override}, Action: {resolution.action}",
524
+ )
525
+ self._causation_graph.add_link(link)
526
+
527
+ @property
528
+ def current_hold(self) -> Optional[HoldPoint]:
529
+ """Get current active hold point (if any)."""
530
+ return self._current_hold
531
+
532
+ @property
533
+ def stats(self) -> Dict[str, Any]:
534
+ """Get hold statistics."""
535
+ return {
536
+ 'total_holds': self._hold_count,
537
+ 'overrides': self._override_count,
538
+ 'override_rate': self._override_count / max(self._hold_count, 1),
539
+ 'last_merkle': self._last_merkle,
540
+ }
541
+
542
+
543
+ class HoldAwareMixin:
544
+ """
545
+ Mixin for brains that support HOLD.
546
+
547
+ Add this to your Brain class to enable inference-level halts.
548
+
549
+ Usage:
550
+ class MyBrain(HoldAwareMixin, BaseBrain):
551
+ def forward(self, inputs):
552
+ # Your inference code
553
+ return {"action_probs": probs, "value": value}
554
+
555
+ brain = MyBrain()
556
+ brain.enable_hold()
557
+
558
+ # Now forward_with_hold() will pause for human input
559
+ output = brain.forward_with_hold(inputs)
560
+ """
561
+
562
+ def __init__(self, *args, **kwargs):
563
+ super().__init__(*args, **kwargs)
564
+ self._hold_system = Hold.get()
565
+ self._hold_enabled = True
566
+ self._brain_id = getattr(self, 'id', hashlib.sha256(str(id(self)).encode()).hexdigest()[:16])
567
+
568
+ def forward_with_hold(
569
+ self,
570
+ inputs: Dict[str, Any],
571
+ blocking: bool = True,
572
+ ) -> Dict[str, Any]:
573
+ """
574
+ Forward pass with HOLD support.
575
+
576
+ Call this instead of forward() to enable hold points.
577
+ """
578
+ # Get decision matrix from normal forward
579
+ output = self.forward(inputs)
580
+
581
+ if not self._hold_enabled:
582
+ return output
583
+
584
+ action_probs = output.get('action_probs', None)
585
+ if action_probs is None:
586
+ return output
587
+
588
+ # Get imagination if available (DreamerBrain, etc.)
589
+ imagined = None
590
+ if hasattr(self, 'imagine'):
591
+ try:
592
+ imagined = self.imagine(horizon=15)
593
+ except Exception:
594
+ pass
595
+
596
+ # Yield to hold system
597
+ resolution = self._hold_system.yield_point(
598
+ action_probs=np.array(action_probs),
599
+ value=float(output.get('value', 0.0)),
600
+ observation=inputs,
601
+ brain_id=self._brain_id,
602
+ imagination=imagined,
603
+ blocking=blocking,
604
+ )
605
+
606
+ # Update output with resolved action
607
+ output['action'] = resolution.action
608
+ output['hold_resolution'] = resolution.to_dict()
609
+ output['was_override'] = resolution.was_override
610
+
611
+ return output
612
+
613
+ def enable_hold(self):
614
+ """Enable HOLD for this brain."""
615
+ self._hold_enabled = True
616
+
617
+ def disable_hold(self):
618
+ """Disable HOLD (normal inference)."""
619
+ self._hold_enabled = False
620
+
621
+
622
+ # Demo
623
+ def _demo_hold():
624
+ """Demonstrate HOLD system."""
625
+ print("=" * 60)
626
+ print("HOLD SYSTEM DEMO")
627
+ print("=" * 60)
628
+
629
+ # Get hold system
630
+ hold = Hold.get()
631
+ hold.timeout = 10.0
632
+
633
+ def on_hold(point: HoldPoint):
634
+ print(f"\n🔔 Listener received hold: {point.id}")
635
+
636
+ hold.register_listener(on_hold)
637
+
638
+ def brain_loop():
639
+ for step in range(3):
640
+ probs = np.random.dirichlet(np.ones(8))
641
+ resolution = hold.yield_point(
642
+ action_probs=probs,
643
+ value=np.random.random(),
644
+ observation={'step': step},
645
+ brain_id='demo_brain',
646
+ )
647
+ print(f"Brain received: action={resolution.action}, override={resolution.was_override}")
648
+
649
+ def human_input():
650
+ for i in range(3):
651
+ time.sleep(2)
652
+ if hold.current_hold:
653
+ if i % 2 == 0:
654
+ hold.accept()
655
+ else:
656
+ hold.override(7, source="demo_human")
657
+
658
+ brain_thread = threading.Thread(target=brain_loop)
659
+ human_thread = threading.Thread(target=human_input)
660
+
661
+ brain_thread.start()
662
+ human_thread.start()
663
+
664
+ brain_thread.join()
665
+ human_thread.join()
666
+
667
+ print(f"\n{'=' * 60}")
668
+ print("SESSION STATS")
669
+ print(hold.stats)
670
+
671
+
672
+ if __name__ == "__main__":
673
+ _demo_hold()
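Beyond the threaded demo, the mixin path is the lighter integration: a class only needs a forward() that returns action_probs (and optionally value). A hedged sketch with a toy stand-in model; auto_accept keeps the example non-blocking:

    import numpy as np
    from cascade.hold.primitives import Hold, HoldAwareMixin

    class ToyBrain(HoldAwareMixin):
        """Toy model: uniform preference over four actions."""
        def forward(self, inputs):
            return {"action_probs": np.ones(4) / 4, "value": 0.0}

    Hold.get().auto_accept = True      # skip blocking for this sketch
    brain = ToyBrain()
    out = brain.forward_with_hold({"obs": [0, 1, 2]})
    print(out["action"], out["was_override"], out["hold_resolution"]["merkle_root"])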
cascade/hold/session.py ADDED
@@ -0,0 +1,707 @@
1
+ """
2
+ HOLD Session - Arcade-Style Inference Interception
3
+ ══════════════════════════════════════════════════════════
4
+
5
+ "Pause the machine. See what it sees. Choose what it chooses."
6
+
7
+ The arcade layer of HOLD:
8
+ - CausationHold: Session management with history
9
+ - InferenceStep: Single crystallized moment
10
+ - Time travel via state snapshots
11
+ - Speed controls and combo tracking
12
+
13
+ Controls:
14
+ SPACE - Accept model's choice, advance
15
+ 1-9 - Override with alternative
16
+ ←/→ - Step back/forward through history
17
+ +/- - Speed up/slow down auto-advance
18
+ P - Pause/unpause auto-advance
19
+ ESC - Exit hold mode
20
+ """
21
+
22
+ import numpy as np
23
+ import time
24
+ import json
25
+ import hashlib
26
+ import threading
27
+ from dataclasses import dataclass, field
28
+ from typing import Dict, List, Optional, Any, Callable, Tuple
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+ from enum import Enum
32
+
33
+
34
+ class SessionState(Enum):
35
+ """Current state of the hold session."""
36
+ IDLE = "idle" # Not holding anything
37
+ PAUSED = "paused" # Frozen, waiting for input
38
+ STEPPING = "stepping" # Auto-advancing at set speed
39
+ REWINDING = "rewinding" # Going backwards through history
40
+
41
+
42
+ @dataclass
43
+ class InferenceStep:
44
+ """A single crystallized moment of inference."""
45
+ step_id: str
46
+ step_index: int
47
+ timestamp: float
48
+
49
+ # What the model sees
50
+ input_context: Dict[str, Any]
51
+
52
+ # What the model wants to do
53
+ candidates: List[Dict[str, Any]] # [{value, probability, metadata}]
54
+ top_choice: Any
55
+ top_probability: float
56
+
57
+ # Internal state snapshot (for true rewind)
58
+ hidden_state: Optional[np.ndarray] = None
59
+ attention_weights: Optional[Dict[str, float]] = None
60
+
61
+ # What actually happened
62
+ chosen_value: Any = None
63
+ was_override: bool = False
64
+ override_by: str = "model" # "model" or "human"
65
+
66
+ # Provenance
67
+ cascade_hash: Optional[str] = None
68
+
69
+ # Private: full state snapshot for true rewind
70
+ _state_snapshot: Optional[Dict[str, Any]] = field(default=None, repr=False)
71
+
72
+
73
+ @dataclass
74
+ class HoldSession:
75
+ """A complete hold session with history."""
76
+ session_id: str
77
+ agent_id: str
78
+ started_at: float
79
+
80
+ # All steps in order
81
+ steps: List[InferenceStep] = field(default_factory=list)
82
+ current_index: int = 0
83
+
84
+ # Arcade stats
85
+ total_steps: int = 0
86
+ human_overrides: int = 0
87
+ correct_predictions: int = 0 # Human guessed what model would do
88
+ combo: int = 0
89
+ max_combo: int = 0
90
+
91
+ # Speed control (steps per second, 0 = manual only)
92
+ speed_level: int = 0 # 0=manual, 1=slow, 2=medium, 3=fast, 4=ludicrous
93
+ speed_map: Dict[int, float] = field(default_factory=lambda: {
94
+ 0: 0.0, # Manual
95
+ 1: 0.5, # 2 sec per step
96
+ 2: 1.0, # 1 sec per step
97
+ 3: 2.0, # 0.5 sec per step
98
+ 4: 10.0, # 0.1 sec per step (ludicrous speed)
99
+ })
100
+
101
+ # State
102
+ state: SessionState = SessionState.IDLE
103
+
104
+
105
+ @dataclass
106
+ class ArcadeFeedback:
107
+ """Visual/audio feedback cues."""
108
+ message: str
109
+ intensity: float # 0-1, for glow/shake/etc
110
+ sound_cue: str # "accept", "override", "combo", "combo_break", "rewind"
111
+ color: Tuple[int, int, int] = (255, 255, 255)
112
+
113
+
114
+ class CausationHold:
115
+ """
116
+ The arcade-layer hold system. Wraps any inference function.
117
+
118
+ Features:
119
+ - Session management with full history
120
+ - True state restoration for time travel
121
+ - Speed controls (manual to ludicrous)
122
+ - Combo tracking and high scores
123
+
124
+ Usage:
125
+ hold = CausationHold()
126
+
127
+ # Start a session
128
+ hold.begin_session(agent_id="agent_123")
129
+
130
+ # In inference loop:
131
+ for step in inference_steps:
132
+ choice, feedback = hold.capture(
133
+ input_context={"tokens": tokens},
134
+ candidates=[{"value": "A", "probability": 0.8}, ...]
135
+ ) # Pauses here until user input!
136
+
137
+ # Time travel
138
+ hold.rewind(steps=3)
139
+ hold.branch_from(step_index=5, choice_index=2)
140
+
141
+ stats = hold.end_session()
142
+ """
143
+
144
+ def __init__(self, cascade_bus=None):
145
+ """
146
+ Args:
147
+ cascade_bus: Optional CASCADE event bus for provenance
148
+ """
149
+ self.bus = cascade_bus
150
+ self.session: Optional[HoldSession] = None
151
+ self.callbacks: Dict[str, List[Callable]] = {
152
+ 'on_step': [],
153
+ 'on_override': [],
154
+ 'on_combo': [],
155
+ 'on_combo_break': [],
156
+ 'on_rewind': [],
157
+ 'on_state_restore': [],
158
+ }
159
+
160
+ # Thread safety
161
+ self._lock = threading.Lock()
162
+ self._input_event = threading.Event()
163
+ self._user_choice: Optional[Any] = None
164
+
165
+ # High scores (persisted)
166
+ self.high_scores_path = Path("data/hold_high_scores.json")
167
+ self.high_scores = self._load_high_scores()
168
+
169
+ # ========================================================================
170
+ # SESSION MANAGEMENT
171
+ # ========================================================================
172
+
173
+ def begin_session(self, agent_id: str) -> HoldSession:
174
+ """Start a new hold session."""
175
+ session_id = f"hold_{agent_id}_{int(time.time()*1000)}"
176
+
177
+ self.session = HoldSession(
178
+ session_id=session_id,
179
+ agent_id=agent_id,
180
+ started_at=time.time(),
181
+ )
182
+ self.session.state = SessionState.PAUSED
183
+
184
+ self._emit_cascade("hold_session_start", {
185
+ "session_id": session_id,
186
+ "agent_id": agent_id,
187
+ })
188
+
189
+ return self.session
190
+
191
+ def end_session(self) -> Dict[str, Any]:
192
+ """End session and return stats."""
193
+ if not self.session:
194
+ return {}
195
+
196
+ stats = {
197
+ "session_id": self.session.session_id,
198
+ "agent_id": self.session.agent_id,
199
+ "duration": time.time() - self.session.started_at,
200
+ "total_steps": self.session.total_steps,
201
+ "human_overrides": self.session.human_overrides,
202
+ "correct_predictions": self.session.correct_predictions,
203
+ "max_combo": self.session.max_combo,
204
+ "accuracy": (
205
+ self.session.correct_predictions / max(1, self.session.total_steps)
206
+ ),
207
+ }
208
+
209
+ # Check for high score
210
+ self._check_high_score(stats)
211
+
212
+ self._emit_cascade("hold_session_end", stats)
213
+
214
+ self.session = None
215
+ return stats
216
+
217
+ # ========================================================================
218
+ # CAPTURE & ADVANCE - WITH STATE SNAPSHOT FOR TRUE REWIND
219
+ # ========================================================================
220
+
221
+ def capture(
222
+ self,
223
+ input_context: Dict[str, Any],
224
+ candidates: List[Dict[str, Any]],
225
+ hidden_state: Optional[np.ndarray] = None,
226
+ attention: Optional[Dict[str, float]] = None,
227
+ state_snapshot: Optional[Dict[str, Any]] = None,
228
+ ) -> Tuple[Any, ArcadeFeedback]:
229
+ """
230
+ Capture an inference step. BLOCKS until user input or auto-advance.
231
+
232
+ IMPORTANT: Pass state_snapshot for true rewind capability.
233
+ This should be a complete snapshot of the model's internal state
234
+ that can be restored to allow execution from this decision point
235
+ with a different choice.
236
+
237
+ This is NOT prediction - you will ACTUALLY execute the choice and
238
+ see REAL outcomes. If you don't like them, rewind and try again.
239
+
240
+ Args:
241
+ input_context: What the model is looking at
242
+ candidates: List of {value, probability, ...} options
243
+ hidden_state: Optional internal state snapshot (deprecated, use state_snapshot)
244
+ attention: Optional attention weights
245
+ state_snapshot: Complete model state for TRUE rewind capability
246
+
247
+ Returns:
248
+ (chosen_value, feedback) - The value to use and arcade feedback
249
+ """
250
+ if not self.session:
251
+ # No session = passthrough, just return top choice
252
+ return candidates[0]['value'], ArcadeFeedback("", 0, "")
253
+
254
+ # Sort candidates by probability
255
+ candidates = sorted(candidates, key=lambda x: x.get('probability', 0), reverse=True)
256
+ top = candidates[0]
257
+
258
+ # Merge hidden_state into state_snapshot if provided separately
259
+ if state_snapshot is None and hidden_state is not None:
260
+ state_snapshot = {'hidden_state': hidden_state}
261
+ elif state_snapshot is not None and hidden_state is not None:
262
+ state_snapshot['hidden_state'] = hidden_state
263
+
264
+ # Create step - this is a CHECKPOINT for true rewind
265
+ step = InferenceStep(
266
+ step_id=f"step_{self.session.total_steps}",
267
+ step_index=self.session.total_steps,
268
+ timestamp=time.time(),
269
+ input_context=input_context,
270
+ candidates=candidates,
271
+ top_choice=top['value'],
272
+ top_probability=top.get('probability', 1.0),
273
+ hidden_state=hidden_state,
274
+ attention_weights=attention,
275
+ )
276
+
277
+ # Store state snapshot for TRUE rewind (not just history navigation)
278
+ if state_snapshot is not None:
279
+ step._state_snapshot = state_snapshot
280
+
281
+ # Compute merkle hash for provenance
282
+ step.cascade_hash = self._compute_step_hash(step)
283
+
284
+ # Add to history
285
+ with self._lock:
286
+ self.session.steps.append(step)
287
+ self.session.current_index = len(self.session.steps) - 1
288
+ self.session.total_steps += 1
289
+
290
+ # Emit step event
291
+ self._emit_callback('on_step', step)
292
+ self._emit_cascade("hold_step", {
293
+ "step_index": step.step_index,
294
+ "top_choice": str(top['value']),
295
+ "top_prob": top.get('probability', 1.0),
296
+ "num_candidates": len(candidates),
297
+ "has_snapshot": state_snapshot is not None,
298
+ "merkle": step.cascade_hash,
299
+ })
300
+
301
+ # Wait for input
302
+ choice, feedback = self._wait_for_input(step)
303
+
304
+ # Record what happened
305
+ step.chosen_value = choice
306
+ step.was_override = (choice != top['value'])
307
+ step.override_by = "human" if step.was_override else "model"
308
+
309
+ if step.was_override:
310
+ self.session.human_overrides += 1
311
+ self._emit_callback('on_override', step, choice)
312
+
313
+ return choice, feedback
314
+
315
+ def _wait_for_input(self, step: InferenceStep) -> Tuple[Any, ArcadeFeedback]:
316
+ """Wait for user input or auto-advance timer."""
317
+
318
+ # Manual mode = wait indefinitely
319
+ if self.session.speed_level == 0:
320
+ self._input_event.clear()
321
+ self._input_event.wait() # Blocks until input()
322
+
323
+ choice = self._user_choice
324
+ self._user_choice = None
325
+
326
+ else:
327
+ # Auto-advance mode
328
+ speed = self.session.speed_map[self.session.speed_level]
329
+ wait_time = 1.0 / speed if speed > 0 else float('inf')
330
+
331
+ self._input_event.clear()
332
+ got_input = self._input_event.wait(timeout=wait_time)
333
+
334
+ if got_input and self._user_choice is not None:
335
+ choice = self._user_choice
336
+ self._user_choice = None
337
+ else:
338
+ # Auto-accepted
339
+ choice = step.top_choice
340
+
341
+ # Generate feedback
342
+ return choice, self._generate_feedback(step, choice)
343
+
344
+ def input(self, choice: Any):
345
+ """
346
+ Provide user input. Call from UI thread.
347
+
348
+ Args:
349
+ choice: The value to use (or index into candidates)
350
+ """
351
+ if not self.session:
352
+ return
353
+
354
+ current_step = self.session.steps[self.session.current_index]
355
+
356
+ # Handle index input (1-9 keys)
357
+ if isinstance(choice, int) and 0 <= choice < len(current_step.candidates):
358
+ choice = current_step.candidates[choice]['value']
359
+
360
+ self._user_choice = choice
361
+ self._input_event.set()
362
+
363
+ def accept(self):
364
+ """Accept model's top choice (SPACE key)."""
365
+ if not self.session or not self.session.steps:
366
+ return
367
+
368
+ current = self.session.steps[self.session.current_index]
369
+ self.input(current.top_choice)
370
+
371
+ def override(self, index: int):
372
+ """Override with candidate at index (1-9 keys)."""
373
+ self.input(index)
374
+
375
+ # ========================================================================
376
+ # NAVIGATION (TIME TRAVEL) - TRUE STATE RESTORATION
377
+ # ========================================================================
378
+
379
+ def rewind(self, steps: int = 1, restore_state: bool = True) -> Optional[InferenceStep]:
380
+ """
381
+ Go back in history with optional state restoration.
382
+
383
+ This is NOT simulation - we actually restore the model's internal state
384
+ to the snapshot taken at that decision point. From there, you can
385
+ execute a different branch and see REAL outcomes.
386
+
387
+ Args:
388
+ steps: Number of steps to go back
389
+ restore_state: If True, actually restore hidden_state to model
390
+
391
+ Returns:
392
+ The step we rewound to
393
+ """
394
+ if not self.session:
395
+ return None
396
+
397
+ with self._lock:
398
+ new_index = max(0, self.session.current_index - steps)
399
+ if new_index != self.session.current_index:
400
+ self.session.current_index = new_index
401
+ self.session.state = SessionState.REWINDING
402
+
403
+ step = self.session.steps[new_index]
404
+
405
+ # TRUE STATE RESTORATION
406
+ if restore_state and step.hidden_state is not None:
407
+ self._restore_state(step)
408
+
409
+ self._emit_callback('on_rewind', step, -steps)
410
+
411
+ return step
412
+ return None
413
+
414
+ def _restore_state(self, step: InferenceStep):
415
+ """
416
+ Restore model state from a snapshot.
417
+
418
+ This is the key that makes execution + rewind possible.
419
+ The model's internal state is set back to exactly what it was
420
+ at this decision point, allowing you to branch differently.
421
+ """
422
+ if step.hidden_state is None and getattr(step, "_state_snapshot", None) is None:
423
+ return
424
+
425
+ # Emit state restoration event - hooked components can restore themselves
426
+ self._emit_callback('on_state_restore', step)
427
+ self._emit_cascade("state_restored", {
428
+ "step_index": step.step_index,
429
+ "merkle": step.cascade_hash,
430
+ "had_hidden_state": step.hidden_state is not None,
431
+ "had_snapshot": getattr(step, "_state_snapshot", None) is not None,
432
+ })
433
+
434
+ def branch_from(self, step_index: int, choice_index: int) -> Optional[InferenceStep]:
435
+ """
436
+ Rewind to a step and immediately choose a different branch.
437
+
438
+ This is the core gameplay loop:
439
+ 1. Rewind to decision point
440
+ 2. Choose different option
441
+ 3. Execute and see what happens
442
+ 4. Repeat until satisfied
443
+
444
+ Args:
445
+ step_index: Which decision point to branch from
446
+ choice_index: Which candidate to choose (0 = model's choice)
447
+
448
+ Returns:
449
+ The step after branching (with state restored)
450
+ """
451
+ step = self.jump_to(step_index)
452
+ if step is None:
453
+ return None
454
+
455
+ # Restore state
456
+ self._restore_state(step)
457
+
458
+ # Set up the override
459
+ if choice_index < len(step.candidates):
460
+ self.override(choice_index)
461
+ else:
462
+ self.accept()
463
+
464
+ return step
465
+
466
+ def forward(self, steps: int = 1) -> Optional[InferenceStep]:
467
+ """Go forward in history (if we've rewound)."""
468
+ if not self.session:
469
+ return None
470
+
471
+ with self._lock:
472
+ max_index = len(self.session.steps) - 1
473
+ new_index = min(max_index, self.session.current_index + steps)
474
+ if new_index != self.session.current_index:
475
+ self.session.current_index = new_index
476
+
477
+ step = self.session.steps[new_index]
478
+ self._emit_callback('on_rewind', step, steps)
479
+
480
+ return step
481
+ return None
482
+
483
+ def jump_to(self, index: int) -> Optional[InferenceStep]:
484
+ """Jump to specific step."""
485
+ if not self.session:
486
+ return None
487
+
488
+ with self._lock:
489
+ index = max(0, min(index, len(self.session.steps) - 1))
490
+ self.session.current_index = index
491
+ return self.session.steps[index]
492
+
493
+ # ========================================================================
494
+ # SPEED CONTROL
495
+ # ========================================================================
496
+
497
+ def speed_up(self):
498
+ """Increase auto-advance speed."""
499
+ if self.session:
500
+ self.session.speed_level = min(4, self.session.speed_level + 1)
501
+
502
+ def speed_down(self):
503
+ """Decrease auto-advance speed."""
504
+ if self.session:
505
+ self.session.speed_level = max(0, self.session.speed_level - 1)
506
+
507
+ def set_speed(self, level: int):
508
+ """Set speed level directly (0-4)."""
509
+ if self.session:
510
+ self.session.speed_level = max(0, min(4, level))
511
+
512
+ def pause(self):
513
+ """Pause auto-advance."""
514
+ if self.session:
515
+ self.session.state = SessionState.PAUSED
516
+
517
+ def unpause(self):
518
+ """Resume auto-advance."""
519
+ if self.session:
520
+ self.session.state = SessionState.STEPPING
521
+
522
+ # ========================================================================
523
+ # PROVENANCE HASHING
524
+ # ========================================================================
525
+
526
+ def _compute_step_hash(self, step: InferenceStep) -> str:
527
+ """
528
+ Compute merkle hash for a step.
529
+
530
+ This hash uniquely identifies this decision point and allows
531
+ verification that rewind is restoring to the exact right state.
532
+ """
533
+ # Include parent hash for chain integrity
534
+ parent_hash = ""
535
+ if self.session and len(self.session.steps) > 0:
536
+ prev_step = self.session.steps[-1]
537
+ parent_hash = prev_step.cascade_hash or ""
538
+
539
+ content = json.dumps({
540
+ 'step_index': step.step_index,
541
+ 'timestamp': step.timestamp,
542
+ 'top_choice': str(step.top_choice),
543
+ 'top_prob': step.top_probability,
544
+ 'num_candidates': len(step.candidates),
545
+ 'parent_hash': parent_hash,
546
+ }, sort_keys=True)
547
+
548
+ return hashlib.sha256(content.encode()).hexdigest()[:16]
549
+
550
+ # ========================================================================
551
+ # ARCADE FEEDBACK
552
+ # ========================================================================
553
+
554
+ def _generate_feedback(self, step: InferenceStep, choice: Any) -> ArcadeFeedback:
555
+ """Generate arcade-style feedback for a step."""
556
+
557
+ is_override = (choice != step.top_choice)
558
+
559
+ if is_override:
560
+ # Combo break!
561
+ if self.session.combo > 0:
562
+ self._emit_callback('on_combo_break', self.session.combo)
563
+
564
+ self.session.combo = 0
565
+
566
+ return ArcadeFeedback(
567
+ message="OVERRIDE",
568
+ intensity=0.8,
569
+ sound_cue="override",
570
+ color=(255, 165, 0), # Orange
571
+ )
572
+
573
+ else:
574
+ # Accepted model choice
575
+ self.session.combo += 1
576
+ self.session.max_combo = max(self.session.max_combo, self.session.combo)
577
+
578
+ # Combo milestones
579
+ if self.session.combo in [10, 25, 50, 100]:
580
+ self._emit_callback('on_combo', self.session.combo)
581
+ return ArcadeFeedback(
582
+ message=f"COMBO x{self.session.combo}!",
583
+ intensity=1.0,
584
+ sound_cue="combo",
585
+ color=(0, 255, 255), # Cyan
586
+ )
587
+
588
+ # Regular accept
589
+ return ArcadeFeedback(
590
+ message="",
591
+ intensity=0.3 + min(0.5, self.session.combo * 0.02),
592
+ sound_cue="accept",
593
+ color=(0, 255, 0), # Green
594
+ )
595
+
596
+ # ========================================================================
597
+ # CALLBACKS
598
+ # ========================================================================
599
+
600
+ def on(self, event: str, callback: Callable):
601
+ """Register callback for events."""
602
+ if event in self.callbacks:
603
+ self.callbacks[event].append(callback)
604
+
605
+ def _emit_callback(self, event: str, *args):
606
+ """Emit event to callbacks."""
607
+ for cb in self.callbacks.get(event, []):
608
+ try:
609
+ cb(*args)
610
+ except Exception as e:
611
+ print(f"Callback error: {e}")
612
+
613
+ # ========================================================================
614
+ # CASCADE PROVENANCE
615
+ # ========================================================================
616
+
617
+ def _emit_cascade(self, event_type: str, data: Dict[str, Any]):
618
+ """Emit event to CASCADE bus if available."""
619
+ if self.bus:
620
+ try:
621
+ self.bus.emit(event_type, {
622
+ **data,
623
+ "source": "causation_hold",
624
+ "timestamp": time.time(),
625
+ })
626
+ except Exception:
627
+ pass
628
+
629
+ # ========================================================================
630
+ # HIGH SCORES
631
+ # ========================================================================
632
+
633
+ def _load_high_scores(self) -> Dict[str, Any]:
634
+ """Load high scores from disk."""
635
+ if self.high_scores_path.exists():
636
+ try:
637
+ return json.loads(self.high_scores_path.read_text())
638
+ except Exception:
639
+ pass
640
+ return {"max_combo": 0, "best_accuracy": 0.0, "total_sessions": 0}
641
+
642
+ def _save_high_scores(self):
643
+ """Save high scores to disk."""
644
+ self.high_scores_path.parent.mkdir(parents=True, exist_ok=True)
645
+ self.high_scores_path.write_text(json.dumps(self.high_scores, indent=2))
646
+
647
+ def _check_high_score(self, stats: Dict[str, Any]):
648
+ """Check and update high scores."""
649
+ updated = False
650
+
651
+ if stats['max_combo'] > self.high_scores['max_combo']:
652
+ self.high_scores['max_combo'] = stats['max_combo']
653
+ updated = True
654
+
655
+ if stats['accuracy'] > self.high_scores['best_accuracy']:
656
+ self.high_scores['best_accuracy'] = stats['accuracy']
657
+ updated = True
658
+
659
+ self.high_scores['total_sessions'] += 1
+ updated = True  # persist the session count even when no record was broken
660
+
661
+ if updated:
662
+ self._save_high_scores()
663
+
664
+ # ========================================================================
665
+ # DECORATOR FOR EASY WRAPPING
666
+ # ========================================================================
667
+
668
+ def intercept(self, granularity: str = "step"):
669
+ """
670
+ Decorator to intercept a function's inference.
671
+
672
+ Args:
673
+ granularity: "step" (each call) or "token" (if function yields)
674
+ """
675
+ def decorator(func):
676
+ def wrapper(*args, **kwargs):
677
+ # If no session, passthrough
678
+ if not self.session:
679
+ return func(*args, **kwargs)
680
+
681
+ # Capture the input
682
+ input_context = {
683
+ "args": str(args)[:200],
684
+ "kwargs": {k: str(v)[:100] for k, v in kwargs.items()},
685
+ }
686
+
687
+ # Get result
688
+ result = func(*args, **kwargs)
689
+
690
+ # Create candidates from result
691
+ if isinstance(result, np.ndarray):
692
+ # For embeddings, show top dimensions
693
+ top_dims = np.argsort(np.abs(result.flatten()))[-5:][::-1]
694
+ candidates = [
695
+ {"value": f"dim_{d}", "probability": float(np.abs(result.flatten()[d]))}
696
+ for d in top_dims
697
+ ]
698
+ else:
699
+ candidates = [{"value": result, "probability": 1.0}]
700
+
701
+ # Capture (may block)
702
+ choice, feedback = self.capture(input_context, candidates)
703
+
704
+ return result
705
+
706
+ return wrapper
707
+ return decorator
cascade/identity.py ADDED
@@ -0,0 +1,715 @@
1
+ """
2
+ CASCADE Model Identity Layer
3
+
4
+ Canonical identification for any AI model variant:
5
+ - Base models (meta-llama/Llama-3-8B)
6
+ - Quantizations (Q4_K_M, Q8_0, AWQ, GPTQ)
7
+ - Fine-tunes (LoRA, full, RLHF)
8
+ - API endpoints (behavioral fingerprinting)
9
+
10
+ Every unique model gets a node in the lattice.
11
+ Every observation links to its model's node.
12
+ The lattice becomes the collective memory of AI behavior.
13
+
14
+ "Same name, different model, different behavior."
15
+ """
16
+
17
+ import hashlib
18
+ import json
19
+ import time
20
+ from pathlib import Path
21
+ from dataclasses import dataclass, field, asdict
22
+ from typing import Optional, List, Dict, Any
23
+ from enum import Enum
24
+
25
+
26
+ class ModelFormat(Enum):
27
+ """Model weight formats."""
28
+ SAFETENSORS = "safetensors"
29
+ PYTORCH = "pytorch"
30
+ GGUF = "gguf"
31
+ GGML = "ggml"
32
+ ONNX = "onnx"
33
+ TENSORRT = "tensorrt"
34
+ OPENVINO = "openvino"
35
+ COREML = "coreml"
36
+ API = "api" # No weights, just endpoint
37
+ UNKNOWN = "unknown"
38
+
39
+
40
+ class QuantizationType(Enum):
41
+ """Quantization methods."""
42
+ NONE = "none" # FP32/FP16/BF16
43
+ GGUF_Q4_0 = "Q4_0"
44
+ GGUF_Q4_K_M = "Q4_K_M"
45
+ GGUF_Q4_K_S = "Q4_K_S"
46
+ GGUF_Q5_0 = "Q5_0"
47
+ GGUF_Q5_K_M = "Q5_K_M"
48
+ GGUF_Q5_K_S = "Q5_K_S"
49
+ GGUF_Q6_K = "Q6_K"
50
+ GGUF_Q8_0 = "Q8_0"
51
+ GPTQ_4BIT = "GPTQ-4bit"
52
+ GPTQ_8BIT = "GPTQ-8bit"
53
+ AWQ_4BIT = "AWQ-4bit"
54
+ BITSANDBYTES_4BIT = "bnb-4bit"
55
+ BITSANDBYTES_8BIT = "bnb-8bit"
56
+ INT8 = "INT8"
57
+ INT4 = "INT4"
58
+ CUSTOM = "custom"
59
+
60
+
61
+ class FineTuneType(Enum):
62
+ """Fine-tuning methods."""
63
+ NONE = "none"
64
+ LORA = "lora"
65
+ QLORA = "qlora"
66
+ FULL = "full"
67
+ RLHF = "rlhf"
68
+ DPO = "dpo"
69
+ ORPO = "orpo"
70
+ CUSTOM = "custom"
71
+
72
+
73
+ @dataclass
74
+ class ModelVariant:
75
+ """Describes how a model differs from its base."""
76
+ quantization: str = "none"
77
+ format: str = "unknown"
78
+ bits: Optional[int] = None
79
+ provider: Optional[str] = None # Who made this variant (e.g., "TheBloke")
80
+
81
+ def to_dict(self) -> dict:
82
+ return asdict(self)
83
+
84
+
85
+ @dataclass
86
+ class FineTuneInfo:
87
+ """Describes fine-tuning applied to a model."""
88
+ type: str = "none"
89
+ adapter_id: Optional[str] = None # HuggingFace adapter ID
90
+ adapter_hash: Optional[str] = None # Hash of adapter weights
91
+ base_model_root: Optional[str] = None # Merkle root of base model identity
92
+ dataset_id: Optional[str] = None # Training dataset
93
+
94
+ def to_dict(self) -> dict:
95
+ return asdict(self)
96
+
97
+
98
+ @dataclass
99
+ class BehavioralFingerprint:
100
+ """
101
+ Fingerprint for API models where weights are unavailable.
102
+ Generated by running standard probes and hashing responses.
103
+ """
104
+ probe_responses: List[Dict[str, Any]] = field(default_factory=list)
105
+ probe_hash: Optional[str] = None
106
+ fingerprint_version: int = 1
107
+ generated_at: Optional[float] = None
108
+
109
+ def to_dict(self) -> dict:
110
+ return asdict(self)
111
+
112
+
113
+ @dataclass
114
+ class ModelIdentity:
115
+ """
116
+ Canonical identity for any AI model variant.
117
+
118
+ This is the node that goes in the lattice.
119
+ All observations of this model link to this identity.
120
+ """
121
+ # === Core Identity ===
122
+ base_model: str # HuggingFace ID or canonical name
123
+ model_id: str # Full unique identifier (computed)
124
+
125
+ # === Variant Info ===
126
+ variant: ModelVariant = field(default_factory=ModelVariant)
127
+ fine_tune: FineTuneInfo = field(default_factory=FineTuneInfo)
128
+
129
+ # === Cryptographic Identity ===
130
+ weight_hash: Optional[str] = None # SHA256 of weights (if available)
131
+ config_hash: Optional[str] = None # SHA256 of model config
132
+ tokenizer_hash: Optional[str] = None # SHA256 of tokenizer
133
+
134
+ # === Behavioral Fingerprint (for APIs) ===
135
+ behavioral_fingerprint: Optional[BehavioralFingerprint] = None
136
+
137
+ # === Source Info ===
138
+ source_url: Optional[str] = None
139
+ source_revision: Optional[str] = None # Git commit/tag
140
+ downloaded_at: Optional[float] = None
141
+
142
+ # === Lattice Info ===
143
+ parent_root: Optional[str] = None # Genesis or base model's merkle root
144
+ merkle_root: Optional[str] = None # This identity's merkle root
145
+ created_at: float = field(default_factory=time.time)
146
+
147
+ # === Metadata ===
148
+ parameters: Optional[int] = None # Parameter count
149
+ context_length: Optional[int] = None
150
+ architecture: Optional[str] = None # "llama", "mistral", "gpt", etc.
151
+ license: Optional[str] = None
152
+
153
+ def __post_init__(self):
154
+ """Compute derived fields."""
155
+ if not self.model_id:
156
+ self.model_id = self.compute_model_id()
157
+
158
+ def compute_model_id(self) -> str:
159
+ """
160
+ Compute canonical model ID from components.
161
+ Format: base_model::variant_spec::fine_tune_spec
162
+ """
163
+ parts = [self.base_model]
164
+
165
+ # Add variant spec
166
+ if self.variant.quantization != "none":
167
+ parts.append(f"q:{self.variant.quantization}")
168
+ if self.variant.format != "unknown":
169
+ parts.append(f"fmt:{self.variant.format}")
170
+ if self.variant.provider:
171
+ parts.append(f"by:{self.variant.provider}")
172
+
173
+ # Add fine-tune spec
174
+ if self.fine_tune.type != "none":
175
+ parts.append(f"ft:{self.fine_tune.type}")
176
+ if self.fine_tune.adapter_id:
177
+ parts.append(f"adapter:{self.fine_tune.adapter_id}")
178
+
179
+ return "::".join(parts)
180
+
181
+ def compute_merkle_root(self) -> str:
182
+ """Compute merkle root of this identity."""
183
+ # Create canonical representation
184
+ canonical = {
185
+ "base_model": self.base_model,
186
+ "model_id": self.model_id,
187
+ "variant": self.variant.to_dict(),
188
+ "fine_tune": self.fine_tune.to_dict(),
189
+ "weight_hash": self.weight_hash,
190
+ "config_hash": self.config_hash,
191
+ "tokenizer_hash": self.tokenizer_hash,
192
+ "parent_root": self.parent_root,
193
+ "created_at": self.created_at,
194
+ }
195
+
196
+ # Add behavioral fingerprint if present
197
+ if self.behavioral_fingerprint:
198
+ canonical["behavioral_fingerprint"] = self.behavioral_fingerprint.probe_hash
199
+
200
+ # Hash it
201
+ canonical_json = json.dumps(canonical, sort_keys=True)
202
+ self.merkle_root = hashlib.sha256(canonical_json.encode()).hexdigest()[:16]
203
+ return self.merkle_root
204
+
205
+ def finalize(self, parent_root: str = None):
206
+ """Finalize identity and compute merkle root."""
207
+ if parent_root:
208
+ self.parent_root = parent_root
209
+ self.merkle_root = self.compute_merkle_root()
210
+ return self
211
+
212
+ def to_dict(self) -> dict:
213
+ """Convert to dictionary for serialization."""
214
+ return {
215
+ "base_model": self.base_model,
216
+ "model_id": self.model_id,
217
+ "variant": self.variant.to_dict(),
218
+ "fine_tune": self.fine_tune.to_dict(),
219
+ "weight_hash": self.weight_hash,
220
+ "config_hash": self.config_hash,
221
+ "tokenizer_hash": self.tokenizer_hash,
222
+ "behavioral_fingerprint": self.behavioral_fingerprint.to_dict() if self.behavioral_fingerprint else None,
223
+ "source_url": self.source_url,
224
+ "source_revision": self.source_revision,
225
+ "downloaded_at": self.downloaded_at,
226
+ "parent_root": self.parent_root,
227
+ "merkle_root": self.merkle_root,
228
+ "created_at": self.created_at,
229
+ "parameters": self.parameters,
230
+ "context_length": self.context_length,
231
+ "architecture": self.architecture,
232
+ "license": self.license,
233
+ }
234
+
235
+ def to_chain_format(self) -> dict:
236
+ """Convert to provenance chain format for lattice storage."""
237
+ return {
238
+ "session_id": f"model_identity_{self.merkle_root}",
239
+ "model_id": self.model_id,
240
+ "model_hash": self.weight_hash or (self.behavioral_fingerprint.probe_hash if self.behavioral_fingerprint else "unknown"),
241
+ "input_hash": self.base_model,
242
+ "output_hash": None,
243
+ "records": {
244
+ "identity": {
245
+ "layer_name": "identity",
246
+ "layer_idx": 0,
247
+ "state_hash": self.merkle_root,
248
+ "parent_hashes": [self.parent_root] if self.parent_root else [],
249
+ "params_hash": self.config_hash,
250
+ "shape": [self.parameters] if self.parameters else [0],
251
+ "dtype": "model_identity",
252
+ "stats": self.to_dict(),
253
+ "execution_order": 0,
254
+ "timestamp": self.created_at,
255
+ }
256
+ },
257
+ "external_roots": [self.parent_root] if self.parent_root else [],
258
+ "merkle_root": self.merkle_root,
259
+ "created_at": self.created_at,
260
+ "finalized": True,
261
+ }
262
+
263
+
264
+ # =============================================================================
265
+ # STANDARD PROBES FOR BEHAVIORAL FINGERPRINTING
266
+ # =============================================================================
267
+
268
+ STANDARD_PROBES_V1 = [
269
+ # Deterministic probes (temperature=0)
270
+ {
271
+ "id": "math_simple",
272
+ "prompt": "What is 2+2? Answer with just the number.",
273
+ "params": {"temperature": 0, "max_tokens": 10},
274
+ },
275
+ {
276
+ "id": "capital_france",
277
+ "prompt": "Complete this sentence with one word: The capital of France is",
278
+ "params": {"temperature": 0, "max_tokens": 10},
279
+ },
280
+ {
281
+ "id": "translate_hello",
282
+ "prompt": "Translate to French: Hello",
283
+ "params": {"temperature": 0, "max_tokens": 20},
284
+ },
285
+ {
286
+ "id": "color_sky",
287
+ "prompt": "What color is the sky on a clear day? One word answer:",
288
+ "params": {"temperature": 0, "max_tokens": 10},
289
+ },
290
+
291
+ # Capability probes
292
+ {
293
+ "id": "code_simple",
294
+ "prompt": "Write a Python function that adds two numbers. Just the function, no explanation.",
295
+ "params": {"temperature": 0, "max_tokens": 100},
296
+ },
297
+ {
298
+ "id": "reasoning",
299
+ "prompt": "If all cats are mammals and all mammals are animals, are all cats animals? Answer yes or no.",
300
+ "params": {"temperature": 0, "max_tokens": 10},
301
+ },
302
+
303
+ # System prompt probe
304
+ {
305
+ "id": "system_role",
306
+ "prompt": "You are a helpful pirate. Say hello.",
307
+ "params": {"temperature": 0, "max_tokens": 50},
308
+ "system": "You are a helpful pirate who speaks like a pirate.",
309
+ },
310
+
311
+ # Edge cases
312
+ {
313
+ "id": "empty",
314
+ "prompt": "",
315
+ "params": {"temperature": 0, "max_tokens": 50},
316
+ },
317
+ {
318
+ "id": "repetition",
319
+ "prompt": "Repeat after me exactly: The quick brown fox",
320
+ "params": {"temperature": 0, "max_tokens": 20},
321
+ },
322
+ ]
323
+
324
+
325
+ def generate_behavioral_fingerprint(
326
+ call_fn, # Function that takes (prompt, params) and returns response
327
+ probes: List[dict] = None,
328
+ version: int = 1,
329
+ ) -> BehavioralFingerprint:
330
+ """
331
+ Generate behavioral fingerprint by running standard probes.
332
+
333
+ Args:
334
+ call_fn: Function to call the model. Signature: (prompt, params) -> str
335
+ probes: List of probe configs. Defaults to STANDARD_PROBES_V1.
336
+ version: Fingerprint version number.
337
+
338
+ Returns:
339
+ BehavioralFingerprint with hashed responses.
340
+ """
341
+ if probes is None:
342
+ probes = STANDARD_PROBES_V1
343
+
344
+ responses = []
345
+ for probe in probes:
346
+ try:
347
+ response = call_fn(probe["prompt"], probe.get("params", {}))
348
+ response_hash = hashlib.sha256(str(response).encode()).hexdigest()[:16]
349
+ except Exception as e:
350
+ response_hash = f"error:{type(e).__name__}"
351
+
352
+ responses.append({
353
+ "probe_id": probe["id"],
354
+ "prompt_hash": hashlib.sha256(probe["prompt"].encode()).hexdigest()[:16],
355
+ "response_hash": response_hash,
356
+ })
357
+
358
+ # Compute overall fingerprint hash
359
+ fingerprint_data = json.dumps(responses, sort_keys=True)
360
+ probe_hash = hashlib.sha256(fingerprint_data.encode()).hexdigest()[:16]
361
+
362
+ return BehavioralFingerprint(
363
+ probe_responses=responses,
364
+ probe_hash=probe_hash,
365
+ fingerprint_version=version,
366
+ generated_at=time.time(),
367
+ )
368
+
369
+
370
+ # =============================================================================
371
+ # MODEL IDENTITY FACTORY
372
+ # =============================================================================
373
+
374
+ def detect_quantization(model_path: str) -> str:
375
+ """Detect quantization from model path or name."""
376
+ path_lower = model_path.lower()
377
+
378
+ # GGUF quantizations
379
+ for q in ["q4_k_m", "q4_k_s", "q4_0", "q5_k_m", "q5_k_s", "q5_0", "q6_k", "q8_0"]:
380
+ if q in path_lower:
381
+ return q.upper()
382
+
383
+ # GPTQ
384
+ if "gptq" in path_lower:
385
+ if "4bit" in path_lower or "-4b" in path_lower:
386
+ return "GPTQ-4bit"
387
+ elif "8bit" in path_lower or "-8b" in path_lower:
388
+ return "GPTQ-8bit"
389
+ return "GPTQ"
390
+
391
+ # AWQ
392
+ if "awq" in path_lower:
393
+ return "AWQ-4bit"
394
+
395
+ # BitsAndBytes
396
+ if "bnb" in path_lower or "bitsandbytes" in path_lower:
397
+ if "4bit" in path_lower:
398
+ return "bnb-4bit"
399
+ return "bnb-8bit"
400
+
401
+ return "none"
402
+
403
+
404
+ def detect_format(model_path: str) -> str:
405
+ """Detect model format from path."""
406
+ path_lower = model_path.lower()
407
+
408
+ if ".gguf" in path_lower:
409
+ return "gguf"
410
+ elif ".ggml" in path_lower:
411
+ return "ggml"
412
+ elif ".safetensors" in path_lower or "safetensors" in path_lower:
413
+ return "safetensors"
414
+ elif ".onnx" in path_lower:
415
+ return "onnx"
416
+ elif ".bin" in path_lower or "pytorch" in path_lower:
417
+ return "pytorch"
418
+ elif "api" in path_lower or "http" in path_lower:
419
+ return "api"
420
+
421
+ return "unknown"
422
+
423
+
424
+ def detect_provider(model_path: str) -> Optional[str]:
425
+ """Detect who made this variant."""
426
+ path_lower = model_path.lower()
427
+
428
+ providers = [
429
+ "thebloke",
430
+ "unsloth",
431
+ "mlx-community",
432
+ "bartowski",
433
+ "mradermacher",
434
+ "turboderp",
435
+ ]
436
+
437
+ for provider in providers:
438
+ if provider in path_lower:
439
+ return provider
440
+
441
+ return None
442
+
443
+
444
+ def create_model_identity(
445
+ model_id: str,
446
+ weights_path: Optional[Path] = None,
447
+ config: Optional[dict] = None,
448
+ parent_root: Optional[str] = None,
449
+ behavioral_fingerprint: Optional[BehavioralFingerprint] = None,
450
+ **kwargs,
451
+ ) -> ModelIdentity:
452
+ """
453
+ Factory function to create ModelIdentity from various inputs.
454
+
455
+ Args:
456
+ model_id: HuggingFace model ID or local path
457
+ weights_path: Path to weights file (for hashing)
458
+ config: Model config dict
459
+ parent_root: Merkle root of parent (genesis or base model)
460
+ behavioral_fingerprint: Pre-computed fingerprint for APIs
461
+ **kwargs: Additional fields (parameters, context_length, etc.)
462
+
463
+ Returns:
464
+ Finalized ModelIdentity ready for lattice
465
+ """
466
+ # Parse base model from full ID
467
+ # e.g., "TheBloke/Llama-3-8B-GGUF" -> base is "meta-llama/Llama-3-8B"
468
+ base_model = kwargs.pop("base_model", None)
469
+ if not base_model:
470
+ # Try to extract base from model_id
471
+ parts = model_id.split("/")
472
+ if len(parts) >= 2:
473
+ name = parts[-1]
474
+ # Remove common suffixes
475
+ for suffix in ["-GGUF", "-GPTQ", "-AWQ", "-fp16", "-bf16", "-GGML"]:
476
+ name = name.replace(suffix, "")
477
+ base_model = name
478
+ else:
479
+ base_model = model_id
480
+
481
+ # Detect variant info
482
+ quantization = detect_quantization(model_id)
483
+ format_type = detect_format(model_id)
484
+ provider = detect_provider(model_id)
485
+
486
+ # Extract bits from quantization
487
+ bits = None
488
+ if "4" in quantization:
489
+ bits = 4
490
+ elif "5" in quantization:
491
+ bits = 5
492
+ elif "6" in quantization:
493
+ bits = 6
494
+ elif "8" in quantization:
495
+ bits = 8
496
+
497
+ variant = ModelVariant(
498
+ quantization=quantization,
499
+ format=format_type,
500
+ bits=bits,
501
+ provider=provider,
502
+ )
503
+
504
+ # Hash weights if available
505
+ weight_hash = None
506
+ if weights_path and Path(weights_path).exists():
507
+ # For large files, hash first and last 1MB + size
508
+ path = Path(weights_path)
509
+ size = path.stat().st_size
510
+ hasher = hashlib.sha256()
511
+ hasher.update(str(size).encode())
512
+
513
+ with open(path, "rb") as f:
514
+ # First 1MB
515
+ hasher.update(f.read(1024 * 1024))
516
+ # Last 1MB
517
+ if size > 2 * 1024 * 1024:
518
+ f.seek(-1024 * 1024, 2)
519
+ hasher.update(f.read())
520
+
521
+ weight_hash = hasher.hexdigest()[:16]
522
+
523
+ # Hash config if available
524
+ config_hash = None
525
+ if config:
526
+ config_json = json.dumps(config, sort_keys=True)
527
+ config_hash = hashlib.sha256(config_json.encode()).hexdigest()[:16]
528
+
529
+ # Create identity
530
+ identity = ModelIdentity(
531
+ base_model=base_model,
532
+ model_id="", # Will be computed
533
+ variant=variant,
534
+ fine_tune=FineTuneInfo(),
535
+ weight_hash=weight_hash,
536
+ config_hash=config_hash,
537
+ behavioral_fingerprint=behavioral_fingerprint,
538
+ parent_root=parent_root,
539
+ **kwargs,
540
+ )
541
+
542
+ # Compute model_id and merkle_root
543
+ identity.model_id = identity.compute_model_id()
544
+ identity.finalize(parent_root)
545
+
546
+ return identity
547
+
548
+
549
+ # =============================================================================
550
+ # MODEL REGISTRY (Lattice Integration)
551
+ # =============================================================================
552
+
553
+ class ModelRegistry:
554
+ """
555
+ Registry of model identities in the lattice.
556
+
557
+ Provides:
558
+ - Get or create model identity
559
+ - Link observations to model identities
560
+ - Query models by various criteria
561
+ """
562
+
563
+ def __init__(self, lattice_dir: Path = None, genesis_root: str = None):
564
+ self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
565
+ self.models_dir = self.lattice_dir / "models"
566
+ self.models_dir.mkdir(parents=True, exist_ok=True)
567
+
568
+ # Genesis root (models link to this if no base model)
569
+ self.genesis_root = genesis_root or "89f940c1a4b7aa65"
570
+
571
+ # Cache of loaded identities
572
+ self._cache: Dict[str, ModelIdentity] = {}
573
+ self._load_all()
574
+
575
+ def _load_all(self):
576
+ """Load all model identities from disk."""
577
+ for json_file in self.models_dir.glob("*.json"):
578
+ try:
579
+ data = json.loads(json_file.read_text())
580
+ identity = self._dict_to_identity(data)
581
+ self._cache[identity.merkle_root] = identity
582
+ except Exception as e:
583
+ print(f"Error loading {json_file}: {e}")
584
+
585
+ def _dict_to_identity(self, data: dict) -> ModelIdentity:
586
+ """Convert dict back to ModelIdentity."""
587
+ variant_data = data.get("variant", {})
588
+ fine_tune_data = data.get("fine_tune", {})
589
+ fingerprint_data = data.get("behavioral_fingerprint")
590
+
591
+ return ModelIdentity(
592
+ base_model=data["base_model"],
593
+ model_id=data["model_id"],
594
+ variant=ModelVariant(**variant_data),
595
+ fine_tune=FineTuneInfo(**fine_tune_data),
596
+ weight_hash=data.get("weight_hash"),
597
+ config_hash=data.get("config_hash"),
598
+ tokenizer_hash=data.get("tokenizer_hash"),
599
+ behavioral_fingerprint=BehavioralFingerprint(**fingerprint_data) if fingerprint_data else None,
600
+ source_url=data.get("source_url"),
601
+ source_revision=data.get("source_revision"),
602
+ downloaded_at=data.get("downloaded_at"),
603
+ parent_root=data.get("parent_root"),
604
+ merkle_root=data.get("merkle_root"),
605
+ created_at=data.get("created_at", time.time()),
606
+ parameters=data.get("parameters"),
607
+ context_length=data.get("context_length"),
608
+ architecture=data.get("architecture"),
609
+ license=data.get("license"),
610
+ )
611
+
612
+ def _save_identity(self, identity: ModelIdentity):
613
+ """Save identity to disk."""
614
+ filename = f"{identity.merkle_root}.json"
615
+ filepath = self.models_dir / filename
616
+ filepath.write_text(json.dumps(identity.to_dict(), indent=2))
617
+
618
+ def get_or_create(
619
+ self,
620
+ model_id: str,
621
+ **kwargs,
622
+ ) -> ModelIdentity:
623
+ """
624
+ Get existing model identity or create new one.
625
+
626
+ If model already exists in registry, returns existing.
627
+ Otherwise creates new identity linked to genesis or base model.
628
+ """
629
+ # Check if we have this model already
630
+ for identity in self._cache.values():
631
+ if identity.model_id == model_id or identity.base_model == model_id:
632
+ return identity
633
+
634
+ # Determine parent
635
+ # If this is a variant, try to find base model
636
+ parent_root = kwargs.pop("parent_root", None)
637
+ if not parent_root:
638
+ base = kwargs.get("base_model")
639
+ if base:
640
+ for identity in self._cache.values():
641
+ if identity.base_model == base and identity.variant.quantization == "none":
642
+ parent_root = identity.merkle_root
643
+ break
644
+
645
+ # Default to genesis
646
+ if not parent_root:
647
+ parent_root = self.genesis_root
648
+
649
+ # Create new identity
650
+ identity = create_model_identity(
651
+ model_id=model_id,
652
+ parent_root=parent_root,
653
+ **kwargs,
654
+ )
655
+
656
+ # Cache and save
657
+ self._cache[identity.merkle_root] = identity
658
+ self._save_identity(identity)
659
+
660
+ return identity
661
+
662
+ def get_by_root(self, merkle_root: str) -> Optional[ModelIdentity]:
663
+ """Get model identity by merkle root."""
664
+ return self._cache.get(merkle_root)
665
+
666
+ def list_all(self) -> List[ModelIdentity]:
667
+ """List all registered models."""
668
+ return list(self._cache.values())
669
+
670
+ def list_by_base(self, base_model: str) -> List[ModelIdentity]:
671
+ """List all variants of a base model."""
672
+ return [i for i in self._cache.values() if i.base_model == base_model]
673
+
674
+ def search(self, query: str) -> List[ModelIdentity]:
675
+ """Search models by name."""
676
+ query_lower = query.lower()
677
+ return [
678
+ i for i in self._cache.values()
679
+ if query_lower in i.model_id.lower() or query_lower in i.base_model.lower()
680
+ ]
681
+
682
+
683
+ # =============================================================================
684
+ # CLI
685
+ # =============================================================================
686
+
687
+ if __name__ == "__main__":
688
+ import sys
689
+
690
+ # Test: Create some model identities
691
+ print("=== CASCADE Model Identity Layer ===\n")
692
+
693
+ # Initialize registry
694
+ registry = ModelRegistry()
695
+
696
+ # Create some test identities
697
+ test_models = [
698
+ "meta-llama/Llama-3-8B",
699
+ "TheBloke/Llama-3-8B-GGUF",
700
+ "unsloth/Llama-3-8B-bnb-4bit",
701
+ "anthropic/claude-3-opus",
702
+ "openai/gpt-4",
703
+ ]
704
+
705
+ for model in test_models:
706
+ identity = registry.get_or_create(model)
707
+ print(f"Model: {identity.model_id}")
708
+ print(f" Base: {identity.base_model}")
709
+ print(f" Quant: {identity.variant.quantization}")
710
+ print(f" Format: {identity.variant.format}")
711
+ print(f" Merkle: {identity.merkle_root}")
712
+ print(f" Parent: {identity.parent_root}")
713
+ print()
714
+
715
+ print(f"Total models in registry: {len(registry.list_all())}")
cascade/ipld.py ADDED
@@ -0,0 +1,379 @@
1
+ """
2
+ CASCADE IPLD - InterPlanetary Linked Data Integration
3
+
4
+ Native IPLD encoding for provenance chains. Merkle roots become CIDs.
5
+ The lattice goes interplanetary.
6
+
7
+ CIDs (Content IDentifiers) are self-describing, content-addressed identifiers.
8
+ When we encode a chain as IPLD, its CID is derived from its content.
9
+ Anyone with the CID can fetch and verify.
10
+
11
+ Architecture:
12
+ ProvenanceChain ──encode──► DAG-CBOR ──hash──► CID
13
+
14
+ bafyreif...xyz (interplanetary address)
15
+ """
16
+
17
+ import json
18
+ import hashlib
19
+ from typing import Dict, Any, Optional, List
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+
23
+ # IPLD encoding
24
+ import dag_cbor
25
+ from multiformats import CID, multihash
26
+
27
+ # CASCADE core
28
+ from cascade.core.provenance import ProvenanceChain, ProvenanceRecord
29
+
30
+
31
+ # =============================================================================
32
+ # IPLD ENCODING
33
+ # =============================================================================
34
+
35
+ def chain_to_ipld(chain: ProvenanceChain) -> Dict[str, Any]:
36
+ """
37
+ Convert a ProvenanceChain to IPLD-compatible format.
38
+
39
+ IPLD format uses:
40
+ - Lowercase keys
41
+ - CID links for references
42
+ - DAG-CBOR encoding
43
+ """
44
+ # Convert records to IPLD format
45
+ records = {}
46
+ for name, record in chain.records.items():
47
+ records[name] = {
48
+ "layer_name": record.layer_name,
49
+ "layer_idx": record.layer_idx,
50
+ "state_hash": record.state_hash,
51
+ "parent_hashes": record.parent_hashes,
52
+ "params_hash": record.params_hash,
53
+ "shape": record.shape,
54
+ "dtype": record.dtype,
55
+ "stats": record.stats,
56
+ "execution_order": record.execution_order,
57
+ "timestamp": record.timestamp,
58
+ }
59
+
60
+ # Convert external_roots to CID links if they look like CIDs
61
+ external_links = []
62
+ for root in chain.external_roots:
63
+ if root.startswith("bafy") or root.startswith("Qm"):
64
+ # Already a CID - create a link
65
+ external_links.append({"/": root})
66
+ else:
67
+ # Legacy merkle root - keep as string
68
+ external_links.append({"legacy_root": root})
69
+
70
+ return {
71
+ "session_id": chain.session_id,
72
+ "model_id": chain.model_id,
73
+ "model_hash": chain.model_hash,
74
+ "input_hash": chain.input_hash,
75
+ "output_hash": chain.output_hash,
76
+ "records": records,
77
+ "external_roots": chain.external_roots, # Keep for verification
78
+ "external_links": external_links, # IPLD links
79
+ "merkle_root": chain.merkle_root,
80
+ "created_at": chain.created_at,
81
+ "finalized": chain.finalized,
82
+ "ipld_version": 1,
83
+ }
84
+
85
+
86
+ def encode_to_dag_cbor(data: Dict[str, Any]) -> bytes:
87
+ """Encode data as DAG-CBOR (canonical CBOR for IPLD)."""
88
+ return dag_cbor.encode(data)
89
+
90
+
91
+ def decode_from_dag_cbor(raw: bytes) -> Dict[str, Any]:
92
+ """Decode DAG-CBOR data."""
93
+ return dag_cbor.decode(raw)
94
+
95
+
96
+ def compute_cid(data: bytes, codec: str = "dag-cbor") -> str:
97
+ """
98
+ Compute CID (Content IDentifier) from data.
99
+
100
+ CID = multicodec(codec) + multihash(sha256(data))
101
+
102
+ Returns CIDv1 in base32 (bafyrei...)
103
+ """
104
+ # SHA-256 hash of the data
105
+ digest = hashlib.sha256(data).digest()
106
+
107
+ # Create multihash (0x12 = sha2-256, 0x20 = 32 bytes)
108
+ mh = multihash.wrap(digest, "sha2-256")
109
+
110
+ # Create CID v1 with dag-cbor codec (0x71)
111
+ cid = CID("base32", 1, "dag-cbor", mh)
112
+
113
+ return str(cid)
114
+
115
+
116
+ def chain_to_cid(chain: ProvenanceChain) -> tuple[str, bytes]:
117
+ """
118
+ Convert chain to CID.
119
+
120
+ Returns:
121
+ (cid_string, encoded_bytes)
122
+ """
123
+ ipld_data = chain_to_ipld(chain)
124
+ encoded = encode_to_dag_cbor(ipld_data)
125
+ cid = compute_cid(encoded)
126
+ return cid, encoded
127
+
128
+
129
+ # =============================================================================
130
+ # IPLD CHAIN - Native CID-based chain
131
+ # =============================================================================
132
+
133
+ @dataclass
134
+ class IPLDChain:
135
+ """
136
+ A provenance chain with native CID support.
137
+
138
+ Instead of custom merkle roots, uses CIDs.
139
+ Links to other chains via CID references.
140
+ """
141
+ chain: ProvenanceChain
142
+ cid: Optional[str] = None
143
+ encoded: Optional[bytes] = None
144
+
145
+ @classmethod
146
+ def from_chain(cls, chain: ProvenanceChain) -> 'IPLDChain':
147
+ """Create IPLD chain from regular chain."""
148
+ cid, encoded = chain_to_cid(chain)
149
+ return cls(chain=chain, cid=cid, encoded=encoded)
150
+
151
+ @classmethod
152
+ def from_bytes(cls, data: bytes) -> 'IPLDChain':
153
+ """Deserialize from DAG-CBOR bytes."""
154
+ ipld_data = decode_from_dag_cbor(data)
155
+ chain = ipld_to_chain(ipld_data)
156
+ cid = compute_cid(data)
157
+ return cls(chain=chain, cid=cid, encoded=data)
158
+
159
+ def link_to(self, other: 'IPLDChain') -> None:
160
+ """Link this chain to another via CID."""
161
+ if other.cid is None:
162
+ raise ValueError("Cannot link to chain without CID")
163
+ self.chain.link_external(other.cid, source_id=other.chain.model_id)
164
+ # Recompute our CID since we changed
165
+ self.cid, self.encoded = chain_to_cid(self.chain)
166
+
167
+ def save(self, path: Path) -> None:
168
+ """Save as DAG-CBOR file."""
169
+ if self.encoded is None:
170
+ self.cid, self.encoded = chain_to_cid(self.chain)
171
+ with open(path, 'wb') as f:
172
+ f.write(self.encoded)
173
+
174
+ @classmethod
175
+ def load(cls, path: Path) -> 'IPLDChain':
176
+ """Load from DAG-CBOR file."""
177
+ with open(path, 'rb') as f:
178
+ data = f.read()
179
+ return cls.from_bytes(data)
180
+
181
+ def to_json(self) -> str:
182
+ """Export as JSON (for human inspection)."""
183
+ ipld_data = chain_to_ipld(self.chain)
184
+ ipld_data["_cid"] = self.cid
185
+ return json.dumps(ipld_data, indent=2, default=str)
186
+
187
+
188
+ def ipld_to_chain(ipld_data: Dict[str, Any]) -> ProvenanceChain:
189
+ """Convert IPLD data back to ProvenanceChain."""
190
+ # Reconstruct records
191
+ records = {}
192
+ for name, rec_data in ipld_data.get("records", {}).items():
193
+ records[name] = ProvenanceRecord(
194
+ layer_name=rec_data["layer_name"],
195
+ layer_idx=rec_data["layer_idx"],
196
+ state_hash=rec_data["state_hash"],
197
+ parent_hashes=rec_data["parent_hashes"],
198
+ params_hash=rec_data.get("params_hash"),
199
+ shape=rec_data.get("shape", []),
200
+ dtype=rec_data.get("dtype", "float32"),
201
+ stats=rec_data.get("stats", {}),
202
+ execution_order=rec_data.get("execution_order", 0),
203
+ timestamp=rec_data.get("timestamp", 0),
204
+ )
205
+
206
+ chain = ProvenanceChain(
207
+ session_id=ipld_data["session_id"],
208
+ model_id=ipld_data["model_id"],
209
+ model_hash=ipld_data["model_hash"],
210
+ input_hash=ipld_data["input_hash"],
211
+ output_hash=ipld_data.get("output_hash"),
212
+ external_roots=ipld_data.get("external_roots", []),
213
+ merkle_root=ipld_data.get("merkle_root"),
214
+ created_at=ipld_data.get("created_at", 0),
215
+ finalized=ipld_data.get("finalized", False),
216
+ )
217
+ chain.records = records
218
+
219
+ return chain
220
+
221
+
222
+ # =============================================================================
223
+ # IPFS PUBLISHING (requires running IPFS daemon)
224
+ # =============================================================================
225
+
226
+ def publish_to_ipfs(chain: IPLDChain, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> str:
227
+ """
228
+ Publish chain to IPFS network.
229
+
230
+ Requires IPFS daemon running locally.
231
+ Returns the CID (which should match our computed CID).
232
+
233
+ Args:
234
+ chain: IPLDChain to publish
235
+ ipfs_api: IPFS API multiaddr
236
+
237
+ Returns:
238
+ CID from IPFS (for verification)
239
+ """
240
+ try:
241
+ import ipfshttpclient
242
+ client = ipfshttpclient.connect(ipfs_api)
243
+
244
+ # Add the raw DAG-CBOR data
245
+ result = client.dag.put(
246
+ chain.encoded,
247
+ store_codec="dag-cbor",
248
+ input_codec="dag-cbor"
249
+ )
250
+
251
+ ipfs_cid = result["Cid"]["/"]
252
+
253
+ # Verify CIDs match
254
+ if ipfs_cid != chain.cid:
255
+ print(f"[WARN] CID mismatch: computed={chain.cid}, ipfs={ipfs_cid}")
256
+
257
+ return ipfs_cid
258
+
259
+ except Exception as e:
260
+ print(f"[ERROR] IPFS publish failed: {e}")
261
+ print(" Make sure IPFS daemon is running: ipfs daemon")
262
+ raise
263
+
264
+
265
+ def fetch_from_ipfs(cid: str, ipfs_api: str = "/ip4/127.0.0.1/tcp/5001") -> IPLDChain:
266
+ """
267
+ Fetch chain from IPFS network by CID.
268
+
269
+ Args:
270
+ cid: Content identifier
271
+ ipfs_api: IPFS API multiaddr
272
+
273
+ Returns:
274
+ IPLDChain
275
+ """
276
+ try:
277
+ import ipfshttpclient
278
+ client = ipfshttpclient.connect(ipfs_api)
279
+
280
+ # Get the DAG node
281
+ data = client.dag.get(cid)
282
+
283
+ # Convert to chain
284
+ chain = ipld_to_chain(data)
285
+ encoded = encode_to_dag_cbor(data)
286
+
287
+ return IPLDChain(chain=chain, cid=cid, encoded=encoded)
288
+
289
+ except Exception as e:
290
+ print(f"[ERROR] IPFS fetch failed: {e}")
291
+ raise
292
+
293
+
294
+ # =============================================================================
295
+ # GENESIS IN IPLD
296
+ # =============================================================================
297
+
298
+ def get_genesis_cid() -> tuple[str, IPLDChain]:
299
+ """
300
+ Get genesis as IPLD chain with CID.
301
+
302
+ The genesis CID is deterministic - anyone computing it gets the same result.
303
+ This is the interplanetary Schelling point.
304
+ """
305
+ from cascade.genesis import create_genesis
306
+
307
+ genesis = create_genesis()
308
+ ipld_genesis = IPLDChain.from_chain(genesis)
309
+
310
+ return ipld_genesis.cid, ipld_genesis
311
+
312
+
313
+ # =============================================================================
314
+ # CLI
315
+ # =============================================================================
316
+
317
+ if __name__ == "__main__":
318
+ import sys
319
+
320
+ print("=" * 60)
321
+ print("CASCADE IPLD - InterPlanetary Linked Data")
322
+ print("=" * 60)
323
+
324
+ # Get genesis CID
325
+ genesis_cid, genesis_ipld = get_genesis_cid()
326
+ print(f"\nGenesis CID: {genesis_cid}")
327
+ print(f"Genesis merkle_root: {genesis_ipld.chain.merkle_root}")
328
+
329
+ # Load cascade_alpha and convert to IPLD
330
+ alpha_path = Path("lattice/cascade_alpha.json")
331
+ if alpha_path.exists():
332
+ with open(alpha_path) as f:
333
+ alpha_data = json.load(f)
334
+ alpha_chain = ProvenanceChain.from_dict(alpha_data)
335
+ alpha_ipld = IPLDChain.from_chain(alpha_chain)
336
+
337
+ print(f"\ncascade_alpha CID: {alpha_ipld.cid}")
338
+ print(f"cascade_alpha merkle_root: {alpha_chain.merkle_root}")
339
+
340
+ # Save as DAG-CBOR
341
+ out_dir = Path("lattice/ipld")
342
+ out_dir.mkdir(exist_ok=True)
343
+
344
+ genesis_ipld.save(out_dir / "genesis.cbor")
345
+ alpha_ipld.save(out_dir / "cascade_alpha.cbor")
346
+
347
+ # Also save JSON for inspection
348
+ with open(out_dir / "genesis.ipld.json", 'w') as f:
349
+ f.write(genesis_ipld.to_json())
350
+ with open(out_dir / "cascade_alpha.ipld.json", 'w') as f:
351
+ f.write(alpha_ipld.to_json())
352
+
353
+ print(f"\nSaved to {out_dir}/")
354
+ print(f" - genesis.cbor")
355
+ print(f" - cascade_alpha.cbor")
356
+ print(f" - genesis.ipld.json")
357
+ print(f" - cascade_alpha.ipld.json")
358
+
359
+ print("\n" + "=" * 60)
360
+ print("INTERPLANETARY ADDRESSES")
361
+ print("=" * 60)
362
+ print(f"""
363
+ Genesis: {genesis_cid}
364
+ cascade_alpha: {alpha_ipld.cid if alpha_path.exists() else 'N/A'}
365
+
366
+ These CIDs are content-addressed. Anyone with the CID can:
367
+ 1. Fetch the data from IPFS (if pinned)
368
+ 2. Verify the content matches the CID
369
+ 3. Trust the chain without trusting the source
370
+
371
+ To publish to IPFS:
372
+ ipfs daemon # Start IPFS
373
+ python -c "
374
+ from cascade.ipld import publish_to_ipfs, get_genesis_cid
375
+ _, genesis = get_genesis_cid()
376
+ cid = publish_to_ipfs(genesis)
377
+ print(f'Published: {{cid}}')
378
+ "
379
+ """)
cascade/listen.py ADDED
@@ -0,0 +1,154 @@
1
+ """
2
+ Cascade Passive Monitor.
3
+
4
+ Listens to stdin or follows a log file and observes events.
5
+
6
+ Usage:
7
+ python -m cascade.listen # Listen to stdin
8
+ python -m cascade.listen --follow app.log # Follow a log file
9
+
10
+ This module:
11
+ 1. Reads input from stdin or a log file
12
+ 2. Pipes lines -> Cascade Adapter
13
+ 3. Writes events to tape file (JSONL) and human log (Markdown)
14
+ 4. Emits events to event_queue for external consumers
15
+
16
+ For visualization, point a consumer at the event_queue or load the tape file
17
+ into your preferred visualization tool.
18
+ """
19
+
20
+ import sys
21
+ import argparse
22
+ import time
23
+ import json
24
+ from pathlib import Path
25
+ from queue import Queue
26
+
27
+ # Ensure package root is in path
28
+ sys.path.insert(0, str(Path(__file__).parent.parent))
29
+
30
+ from cascade import Monitor
31
+
32
+ # Shared event queue for external consumers (e.g., custom UIs)
33
+ event_queue: Queue = Queue()
34
+
35
+
36
+ def main():
37
+ parser = argparse.ArgumentParser(description="Cascade Passive Monitor")
38
+ parser.add_argument("--log-dir", default="./logs", help="Directory for logs")
39
+ parser.add_argument("--follow", help="Log file to follow (tail -f style)")
40
+ parser.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")
41
+ args = parser.parse_args()
42
+
43
+ # 0. Setup Logs & Baggies
44
+ log_dir = Path(args.log_dir)
45
+ log_dir.mkdir(parents=True, exist_ok=True)
46
+
47
+ baggies_dir = log_dir / "baggies"
48
+ baggies_dir.mkdir(exist_ok=True)
49
+
50
+ # Excrement Management (Archive old artifacts)
51
+ follow_abs = Path(args.follow).absolute() if args.follow else None
52
+ for f in log_dir.glob("*.*"):
53
+ if f.is_file() and f.suffix in [".md", ".jsonl", ".log"] and "baggies" not in str(f):
54
+ if follow_abs and f.absolute() == follow_abs:
55
+ continue
56
+ try:
57
+ dest = baggies_dir / f.name
58
+ if dest.exists():
59
+ dest = baggies_dir / f"{f.stem}_{int(time.time())}{f.suffix}"
60
+ f.replace(dest)
61
+ except Exception:
62
+ pass
63
+ print(f"[CASCADE] Logs archived to {baggies_dir}")
64
+
65
+ session_id = int(time.time())
66
+ tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
67
+ human_path = log_dir / f"cascade_log_{session_id}.md"
68
+
69
+ tape_file = open(tape_path, "w", encoding="utf-8")
70
+ human_file = open(human_path, "w", encoding="utf-8")
71
+
72
+ # Init Log
73
+ human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
74
+ human_file.write(f"**Mode:** PASSIVE {'FOLLOWER' if args.follow else 'LISTENER'}\n")
75
+ human_file.write(f"**Target:** `{args.follow or 'STDIN'}`\n---\n\n")
76
+ human_file.flush()
77
+
78
+ print("="*60)
79
+ print("CASCADE // LISTENER")
80
+ print(f"Monitoring: {args.follow if args.follow else 'Standard Input'}")
81
+ print(f"Tape: {tape_path.absolute()}")
82
+ print(f"Baggies: {baggies_dir.absolute()}")
83
+ print("="*60)
84
+
85
+ monitor = Monitor("symbiont_passive")
86
+
87
+ def process_line(line):
88
+ line = line.strip()
89
+ if not line:
90
+ return
91
+ event = monitor.observe(line)
92
+ payload = {
93
+ "event": {
94
+ "event_id": event.event_id,
95
+ "timestamp": event.timestamp,
96
+ "component": event.component,
97
+ "event_type": event.event_type,
98
+ "data": event.data,
99
+ "raw": line, # Include original line for drill-down
100
+ },
101
+ "metrics": monitor.metrics.summary(),
102
+ "triage": monitor.metrics.triage(),
103
+ }
104
+ event_queue.put(payload)
105
+ tape_file.write(json.dumps(payload) + "\n")
106
+ tape_file.flush()
107
+
108
+ # Narrative
109
+ t_str = time.strftime('%H:%M:%S', time.localtime(event.timestamp))
110
+ icon = {"error": "🔴", "warning": "⚠️", "state_change": "🔄"}.get(event.event_type, "ℹ️")
111
+ if "loss" in str(event.data):
112
+ icon = "📉"
113
+ human_file.write(f"### {icon} {t_str} // {event.event_type.upper()}\n")
114
+ human_file.write(f"Event observed in **{event.component}**.\n")
115
+ if event.data:
116
+ human_file.write("```yaml\n")
117
+ for k, v in event.data.items():
118
+ human_file.write(f"{k}: {v}\n")
119
+ human_file.write("```\n")
120
+ human_file.write("\n")
121
+ human_file.flush()
122
+
123
+ # Mirror to console (unless quiet)
124
+ if not args.quiet:
125
+ sys.stdout.write(f"[SIGHT] {line[:80]}...\n")
126
+ sys.stdout.flush()
127
+
128
+ try:
129
+ if args.follow:
130
+ print(f"[CASCADE] Waiting for stream: {args.follow}")
131
+ f_path = Path(args.follow)
132
+ if not f_path.exists():
133
+ f_path.touch()
134
+ with open(f_path, "r", encoding="utf-8", errors="replace") as f:
135
+ print(f"[CASCADE] Scanning for events...")
136
+ while True:
137
+ line = f.readline()
138
+ if not line:
139
+ time.sleep(0.1)
140
+ continue
141
+ process_line(line)
142
+ else:
143
+ print("[CASCADE] Reading from stdin (Ctrl+C to stop)...")
144
+ for line in sys.stdin:
145
+ process_line(line)
146
+ except KeyboardInterrupt:
147
+ print("\n[CASCADE] Detaching...")
148
+ finally:
149
+ tape_file.close()
150
+ human_file.close()
151
+ print(f"[CASCADE] Session complete. Tape: {tape_path}")
152
+
153
+ if __name__ == "__main__":
154
+ main()
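
A note on consumption: the monitor publishes each payload to the module-level `event_queue` in addition to the JSONL tape, so a dashboard or custom UI can subscribe live without tailing files. Below is a minimal consumer sketch; the import path `cascade.listen` is an assumption about where this module is installed.

```python
# Hedged sketch: drain the shared event_queue from a background thread.
# Assumes the passive monitor module is importable as cascade.listen.
import threading
from queue import Queue


def consume(q: Queue) -> None:
    """Print a one-line summary for each payload the passive monitor emits."""
    while True:
        payload = q.get()  # blocks until the next observed event
        evt = payload["event"]
        print(f"{evt['event_type']:>12} | {evt['component']} | triage={payload['triage']}")


def attach(q: Queue) -> threading.Thread:
    """Run the consumer as a daemon thread so it never blocks shutdown."""
    worker = threading.Thread(target=consume, args=(q,), daemon=True)
    worker.start()
    return worker

# Usage (assumed path): from cascade.listen import event_queue; attach(event_queue)
```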
cascade/logging/__init__.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ CASCADE Logging System
3
+ Industry-standard dual-layer logging for mathematical precision and human clarity.
4
+
5
+ Two modes:
6
+ 1. Kleene Mode: Mathematical fixed point logs for debugging and verification
7
+ 2. Interpretive Mode: Human-readable causation stories for operators
8
+
9
+ Use together for complete system observability.
10
+ """
11
+
12
+ from .kleene_logger import (
13
+ KleeneLogger,
14
+ LogLevel,
15
+ get_kleene_logger,
16
+ log_fixed_point,
17
+ log_iterations
18
+ )
19
+
20
+ from .interpretive_logger import (
21
+ InterpretiveLogger,
22
+ ImpactLevel,
23
+ get_interpretive_logger,
24
+ translate_kleene_to_interpretive
25
+ )
26
+
27
+ from .log_manager import (
28
+ LogMode,
29
+ LogConfig,
30
+ CascadeLogManager,
31
+ init_logging,
32
+ get_log_manager,
33
+ log
34
+ )
35
+
36
+
37
+ def init_cascade_logging(component: str, system: str):
38
+ """Initialize both logging layers for a component"""
39
+ kleene = get_kleene_logger(component)
40
+ interpretive = get_interpretive_logger(system)
41
+
42
+ # Bridge automatic translation
43
+ def bridge_log(entry):
44
+ translate_kleene_to_interpretive(entry, interpretive)
45
+
46
+ kleene._emit_to_container = lambda entry: (
47
+ KleeneLogger._emit_to_container(kleene, entry),  # call the original class method; KleeneLogger defines no _format_container
48
+ bridge_log(entry)
49
+ )
50
+
51
+ return kleene, interpretive
52
+
53
+
54
+ # Convenience for quick setup
55
+ def setup_logging(component: str, system: str = "CASCADE"):
56
+ """Quick setup for both loggers"""
57
+ return init_cascade_logging(component, system)
58
+
59
+
60
+ # Export main interfaces
61
+ __all__ = [
62
+ # Kleene (mathematical)
63
+ 'KleeneLogger',
64
+ 'LogLevel',
65
+ 'get_kleene_logger',
66
+ 'log_fixed_point',
67
+ 'log_iterations',
68
+
69
+ # Interpretive (human)
70
+ 'InterpretiveLogger',
71
+ 'ImpactLevel',
72
+ 'get_interpretive_logger',
73
+ 'translate_kleene_to_interpretive',
74
+
75
+ # Log Manager (orchestrator)
76
+ 'LogMode',
77
+ 'LogConfig',
78
+ 'CascadeLogManager',
79
+ 'init_logging',
80
+ 'get_log_manager',
81
+ 'log',
82
+
83
+ # Unified
84
+ 'init_cascade_logging',
85
+ 'setup_logging'
86
+ ]
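
Taken together, these exports make dual-layer setup a two-liner. A small usage sketch with illustrative values (the exact console formatting comes from the loggers below):

```python
# Sketch: wiring both logging layers for one component via setup_logging.
from cascade.logging import setup_logging, LogLevel, ImpactLevel

kleene, interpretive = setup_logging("DataLoader", system="Data Pipeline")

# Mathematical layer: hashed, verifiable state transition
kleene.log(LogLevel.INFO, "load_start", state_before={"rows": 0})

# Human layer: the causation story operators actually read
interpretive.log(ImpactLevel.LOW, "DataLoader", "Load started",
                 context="Reading input shards from disk",
                 consequence="Records become available for analysis",
                 metrics={"shards": 4})
```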
cascade/logging/color_example.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ CASCADE Color Logging Example
3
+ Shows how to integrate beautiful colored logs throughout your system.
4
+ """
5
+
6
+ from .kleene_logger import get_kleene_logger, LogLevel
7
+ from .interpretive_logger import get_interpretive_logger, ImpactLevel
8
+
9
+ def example_data_processing():
10
+ """Example: Data processing with beautiful logs"""
11
+ kleene = get_kleene_logger("DataProcessor")
12
+ interpretive = get_interpretive_logger("Data Pipeline")
13
+
14
+ # Start processing
15
+ kleene.log(LogLevel.INFO, "load_dataset_start",
16
+ state_before={"dataset": "smollm3-blueprint.pdf"})
17
+
18
+ interpretive.log(ImpactLevel.LOW, "DataLoader", "Loading dataset",
19
+ context="Reading PDF file for analysis",
20
+ consequence="Will extract text and metadata",
21
+ metrics={"file_size": "1.0MB", "type": "PDF"})
22
+
23
+ # Processing steps
24
+ kleene.log(LogLevel.DEBUG, "extract_text",
25
+ state_before={"page": 1},
26
+ state_after={"pages_processed": 15})
27
+
28
+ # Fixed point reached
29
+ kleene.log(LogLevel.INFO, "processing_complete",
30
+ state_after={"records": 500, "clean": True},
31
+ fixed_point=True,
32
+ iterations=3)
33
+
34
+ interpretive.log(ImpactLevel.MEDIUM, "DataProcessor", "Processing complete",
35
+ context="Successfully extracted and cleaned data",
36
+ consequence="Ready for forensics analysis",
37
+ metrics={"records": 500, "pages": 15, "errors": 0})
38
+
39
+ def example_model_observation():
40
+ """Example: Model observation with beautiful logs"""
41
+ kleene = get_kleene_logger("ModelObserver")
42
+ interpretive = get_interpretive_logger("Model Observatory")
43
+
44
+ # Model loading
45
+ kleene.log(LogLevel.INFO, "model_load_start",
46
+ state_before={"model": "mistralai/Mixtral-8x22B-Instruct-v0.1"})
47
+
48
+ interpretive.log(ImpactLevel.MEDIUM, "ModelLoader", "Loading Mixtral",
49
+ context="Loading 8x22B MoE model for inference",
50
+ consequence="Will consume significant VRAM",
51
+ metrics={"params": "141B", "active": "39B", "device": "cuda"})
52
+
53
+ # Observation
54
+ kleene.log(LogLevel.INFO, "observation_start",
55
+ state_before={"layers": 0, "hash": "initial"})
56
+
57
+ # Fixed point achieved
58
+ kleene.log(LogLevel.INFO, "observation_fixed_point",
59
+ state_after={"layers": 64, "merkle": "abc123..."},
60
+ fixed_point=True,
61
+ iterations=64)
62
+
63
+ interpretive.log(ImpactLevel.LOW, "CASCADE", "Model observed",
64
+ context="Cryptographic proof generated for model execution",
65
+ consequence="Merkle root provides verifiable audit trail",
66
+ metrics={"model": "Mixtral", "layers": 64, "merkle": "abc123..."})
67
+
68
+ def example_error_handling():
69
+ """Example: Error handling with colored logs"""
70
+ kleene = get_kleene_logger("ErrorHandler")
71
+ interpretive = get_interpretive_logger("System Monitor")
72
+
73
+ # Error detected
74
+ kleene.log(LogLevel.ERROR, "memory_exhaustion",
75
+ state_before={"memory": "15.8/16GB", "operation": "inference"},
76
+ fixed_point=False)
77
+
78
+ interpretive.log(ImpactLevel.HIGH, "MemoryManager", "Out of memory",
79
+ context="GPU memory exhausted during model inference",
80
+ consequence="Inference failed, system degraded",
81
+ metrics={"used": "15.8GB", "total": "16GB", "available": "200MB"},
82
+ recommendation="Enable gradient checkpointing or use smaller batch size")
83
+
84
+ # Recovery
85
+ kleene.log(LogLevel.WARNING, "fallback_activated",
86
+ state_after={"mode": "cpu_fallback", "batch_size": 1})
87
+
88
+ interpretive.log(ImpactLevel.MEDIUM, "FallbackHandler", "CPU fallback activated",
89
+ context="Switched to CPU inference due to memory constraints",
90
+ consequence="Performance degraded but functionality preserved",
91
+ metrics={"device": "cpu", "batch_size": 1, "slowdown": "10x"})
92
+
93
+ # Run all examples
94
+ if __name__ == "__main__":
95
+ print("\n🎨 CASCADE Color Logging Examples\n")
96
+ print("="*60)
97
+
98
+ example_data_processing()
99
+ print("\n" + "="*60)
100
+
101
+ example_model_observation()
102
+ print("\n" + "="*60)
103
+
104
+ example_error_handling()
105
+ print("\n" + "="*60)
106
+
107
+ print("\n✨ Beautiful logs are ready for production!")
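
The demos run only under `__main__`, so the colored output can also be previewed from a REPL or script (assuming the package is importable as `cascade`):

```python
# Sketch: call the demo functions directly instead of running the module.
from cascade.logging.color_example import (
    example_data_processing,
    example_model_observation,
    example_error_handling,
)

example_data_processing()
example_model_observation()
example_error_handling()
```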
cascade/logging/integrate.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ CASCADE Logging Integration
3
+ Plug-and-play logging for existing CASCADE components.
4
+
5
+ Retrofits existing systems with world-class logging without major surgery.
6
+ """
7
+
8
+ import functools
9
+ import time
10
+ from typing import Any, Callable, Dict, Optional
11
+
12
+ from .log_manager import get_log_manager, LogLevel, ImpactLevel
13
+
14
+
15
+ def log_component(component_name: str, system: str = "CASCADE"):
16
+ """Decorator to add logging to any class or function"""
17
+ def decorator(target):
18
+ if isinstance(target, type):
19
+ # Decorating a class
20
+ return _log_class(target, component_name, system)
21
+ else:
22
+ # Decorating a function
23
+ return _log_function(target, component_name, system)
24
+ return decorator
25
+
26
+
27
+ def _log_class(cls, component_name: str, system: str):
28
+ """Add logging to all methods of a class"""
29
+ manager = get_log_manager()
30
+ manager.register_component(component_name, system)
31
+
32
+ for attr_name in dir(cls):
33
+ if not attr_name.startswith('_'):
34
+ attr = getattr(cls, attr_name)
35
+ if callable(attr):
36
+ setattr(cls, attr_name, _log_method(attr, component_name))
37
+
38
+ return cls
39
+
40
+
41
+ def _log_function(func, component_name: str, system: str):
42
+ """Add logging to a function"""
43
+ manager = get_log_manager()
44
+ manager.register_component(component_name, system)
45
+
46
+ @functools.wraps(func)
47
+ def wrapper(*args, **kwargs):
48
+ start_time = time.time()
49
+
50
+ # Log start
51
+ get_log_manager().log_operation(
52
+ component_name, f"{func.__name__}_start",
53
+ level=LogLevel.DEBUG,
54
+ impact=ImpactLevel.TRACE,
55
+ details={
56
+ "context": f"Starting {func.__name__}",
57
+ "consequence": f"Will execute {func.__name__}",
58
+ "metrics": {"args": len(args), "kwargs": len(kwargs)}
59
+ }
60
+ )
61
+
62
+ try:
63
+ result = func(*args, **kwargs)
64
+
65
+ # Log success
66
+ duration = time.time() - start_time
67
+ get_log_manager().log_operation(
68
+ component_name, f"{func.__name__}_complete",
69
+ level=LogLevel.INFO,
70
+ impact=ImpactLevel.LOW,
71
+ details={
72
+ "context": f"Completed {func.__name__}",
73
+ "consequence": f"Result ready",
74
+ "metrics": {"duration_seconds": duration}
75
+ }
76
+ )
77
+
78
+ return result
79
+
80
+ except Exception as e:
81
+ # Log error
82
+ get_log_manager().log_operation(
83
+ component_name, f"{func.__name__}_error",
84
+ level=LogLevel.ERROR,
85
+ impact=ImpactLevel.HIGH,
86
+ details={
87
+ "context": f"Failed in {func.__name__}",
88
+ "consequence": "Operation failed",
89
+ "metrics": {"error": str(e)}
90
+ }
91
+ )
92
+ raise
93
+
94
+ return wrapper
95
+
96
+
97
+ def _log_method(method, component_name: str):
98
+ """Add logging to a method"""
99
+ @functools.wraps(method)
100
+ def wrapper(self, *args, **kwargs):
101
+ start_time = time.time()
102
+
103
+ try:
104
+ result = method(self, *args, **kwargs)
105
+
106
+ # Log successful method call
107
+ get_log_manager().log_operation(
108
+ component_name, f"{method.__name__}",
109
+ level=LogLevel.DEBUG,
110
+ impact=ImpactLevel.TRACE,
111
+ details={
112
+ "metrics": {"duration": time.time() - start_time}
113
+ }
114
+ )
115
+
116
+ return result
117
+
118
+ except Exception as e:
119
+ # Log method error
120
+ get_log_manager().log_operation(
121
+ component_name, f"{method.__name__}_error",
122
+ level=LogLevel.ERROR,
123
+ impact=ImpactLevel.HIGH,
124
+ details={
125
+ "context": f"Method {method.__name__} failed",
126
+ "metrics": {"error": str(e)}
127
+ }
128
+ )
129
+ raise
130
+
131
+ return wrapper
132
+
133
+
134
+ def log_kleene_iterations(operation_name: str):
135
+ """Decorator specifically for Kleene fixed point iterations"""
136
+ def decorator(func):
137
+ @functools.wraps(func)
138
+ def wrapper(*args, **kwargs):
139
+ get_log_manager().log_operation(
140
+ "KleeneEngine", f"{operation_name}_start",
141
+ level=LogLevel.INFO,
142
+ impact=ImpactLevel.MEDIUM,
143
+ details={
144
+ "context": f"Starting fixed point iteration for {operation_name}",
145
+ "consequence": "Will iterate until convergence"
146
+ }
147
+ )
148
+
149
+ start_time = time.time()
150
+ result = func(*args, **kwargs)
151
+
152
+ # Extract iteration info from result if available
153
+ iterations = getattr(result, 'iterations', 0)
154
+ converged = getattr(result, 'converged', True)
155
+
156
+ get_log_manager().log_operation(
157
+ "KleeneEngine", f"{operation_name}_complete",
158
+ level=LogLevel.INFO,
159
+ impact=ImpactLevel.LOW if converged else ImpactLevel.HIGH,
160
+ details={
161
+ "context": f"Fixed point iteration {'converged' if converged else 'diverged'}",
162
+ "consequence": f"Processed {iterations} iterations",
163
+ "metrics": {
164
+ "iterations": iterations,
165
+ "converged": converged,
166
+ "duration": time.time() - start_time
167
+ },
168
+ "fixed_point": converged
169
+ }
170
+ )
171
+
172
+ return result
173
+ return wrapper
174
+ return decorator
175
+
176
+
177
+ def log_model_observation(model_id: str):
178
+ """Decorator for model observation functions"""
179
+ def decorator(func):
180
+ @functools.wraps(func)
181
+ def wrapper(*args, **kwargs):
182
+ get_log_manager().log_operation(
183
+ "ModelObserver", f"observe_{model_id}",
184
+ level=LogLevel.INFO,
185
+ impact=ImpactLevel.MEDIUM,
186
+ details={
187
+ "context": f"Starting observation of model {model_id}",
188
+ "consequence": "Will generate cryptographic proof"
189
+ }
190
+ )
191
+
192
+ result = func(*args, **kwargs)
193
+
194
+ # Extract observation details
195
+ layers = getattr(result, 'layer_count', 0)
196
+ merkle = getattr(result, 'merkle_root', 'unknown')
197
+
198
+ get_log_manager().log_operation(
199
+ "ModelObserver", f"observed_{model_id}",
200
+ level=LogLevel.INFO,
201
+ impact=ImpactLevel.LOW,
202
+ details={
203
+ "context": f"Model observation complete",
204
+ "consequence": "Cryptographic proof generated",
205
+ "metrics": {
206
+ "model": model_id,
207
+ "layers": layers,
208
+ "merkle": merkle[:16] + "..."
209
+ },
210
+ "fixed_point": True
211
+ }
212
+ )
213
+
214
+ return result
215
+ return wrapper
216
+ return decorator
217
+
218
+
219
+ def log_data_processing(dataset_name: str):
220
+ """Decorator for data processing functions"""
221
+ def decorator(func):
222
+ @functools.wraps(func)
223
+ def wrapper(*args, **kwargs):
224
+ get_log_manager().log_operation(
225
+ "DataProcessor", f"process_{dataset_name}",
226
+ level=LogLevel.INFO,
227
+ impact=ImpactLevel.MEDIUM,
228
+ details={
229
+ "context": f"Processing dataset {dataset_name}",
230
+ "consequence": "Will extract and analyze data"
231
+ }
232
+ )
233
+
234
+ result = func(*args, **kwargs)
235
+
236
+ # Extract processing stats
237
+ records = getattr(result, 'record_count', 0)
238
+ operations = getattr(result, 'operations', [])
239
+
240
+ get_log_manager().log_operation(
241
+ "DataProcessor", f"processed_{dataset_name}",
242
+ level=LogLevel.INFO,
243
+ impact=ImpactLevel.LOW,
244
+ details={
245
+ "context": f"Dataset processing complete",
246
+ "consequence": f"Processed {records} records",
247
+ "metrics": {
248
+ "dataset": dataset_name,
249
+ "records": records,
250
+ "operations": len(operations)
251
+ }
252
+ }
253
+ )
254
+
255
+ return result
256
+ return wrapper
257
+ return decorator
258
+
259
+
260
+ # Quick integration function
261
+ def integrate_cascade_logging():
262
+ """One-call integration for entire CASCADE system"""
263
+ # These integration imports are optional; tolerate missing subsystems
+ try:
+ from ..system.observer import SystemObserver  # noqa: F401
+ from ..core.provenance import ProvenanceTracker  # noqa: F401
+ from data_unity import run_kleene_iteration  # noqa: F401
+ except ImportError:
+ pass
266
+
267
+ # Register main components
268
+ manager = get_log_manager()
269
+ manager.register_component("SystemObserver", "System Observatory")
270
+ manager.register_component("ProvenanceTracker", "Model Observatory")
271
+ manager.register_component("DataUnity", "Data Unity")
272
+ manager.register_component("KleeneEngine", "NEXUS")
273
+
274
+ print("✅ CASCADE logging integrated across all components")
275
+ return manager
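
The decorators are the intended retrofit path for existing code. A sketch of the function form (class decoration works the same way via `_log_class`); names and values here are illustrative:

```python
# Sketch: retrofitting a plain function with the log_component decorator above.
from cascade.logging.integrate import log_component


@log_component("Tokenizer", system="Data Pipeline")
def tokenize(text: str):
    return text.split()


tokens = tokenize("observable systems tell better stories")
# The wrapper emits tokenize_start / tokenize_complete through the global log manager,
# and a tokenize_error entry (then re-raises) if the call throws.
```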
cascade/logging/interpretive_logger.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ CASCADE Interpretive Logger
3
+ Human-readable causation flow logging for operators and stakeholders.
4
+
5
+ Translates mathematical events into stories humans can understand and act upon.
6
+ """
7
+
8
+ import time
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ from typing import Any, Dict, List, Optional
12
+ from datetime import datetime
13
+
14
+
15
+ class ImpactLevel(Enum):
16
+ """Business impact levels"""
17
+ CRITICAL = "🔴 CRITICAL" # Service down, data loss
18
+ HIGH = "🟠 HIGH" # Degraded performance, user impact
19
+ MEDIUM = "🟡 MEDIUM" # Issues detected, monitoring needed
20
+ LOW = "🟢 LOW" # Informational, routine operations
21
+ TRACE = "🔵 TRACE" # Detailed flow, debugging
22
+
23
+
24
+ @dataclass
25
+ class InterpretiveEntry:
26
+ """A human-readable system event"""
27
+ timestamp: float = field(default_factory=time.time)
28
+ impact: ImpactLevel = ImpactLevel.LOW
29
+ system: str = "" # High-level system name
30
+ component: str = "" # Specific component
31
+ event: str = "" # What happened
32
+ context: str = "" # Why it matters
33
+ consequence: str = "" # What happens next
34
+ metrics: Dict[str, Any] = field(default_factory=dict)
35
+ recommendation: Optional[str] = None
36
+
37
+ def format_display(self) -> str:
38
+ """Format for beautiful terminal output with colors"""
39
+ time_str = datetime.fromtimestamp(self.timestamp).strftime("%H:%M:%S")
40
+
41
+ # ANSI color codes
42
+ colors = {
43
+ "CRITICAL": ("\033[91m", "🔴"), # Bright red
44
+ "HIGH": ("\033[31m", "🟠"), # Red
45
+ "MEDIUM": ("\033[33m", "🟡"), # Yellow
46
+ "LOW": ("\033[32m", "🟢"), # Green
47
+ "TRACE": ("\033[90m", "🔵"), # Gray
48
+ "RESET": "\033[0m",
49
+ "BOLD": "\033[1m",
50
+ "DIM": "\033[2m",
51
+ "CYAN": "\033[36m",
52
+ "MAGENTA": "\033[35m",
53
+ }
54
+
55
+ color, icon = colors.get(self.impact.name, ("\033[0m", "⚪"))  # keys are bare level names; .value includes the emoji prefix
56
+ reset = colors["RESET"]
57
+ bold = colors["BOLD"]
58
+ dim = colors["DIM"]
59
+ cyan = colors["CYAN"]
60
+ magenta = colors["MAGENTA"]
61
+
62
+ lines = [
63
+ f"\n{color}{bold}{icon} {self.impact.name} [{time_str}] {self.system}{reset}",
64
+ f"├─ {cyan}Component:{reset} {self.component}",
65
+ f"├─ {magenta}Event:{reset} {self.event}",
66
+ f"├─ {dim}Context:{reset} {self.context}",
67
+ f"├─ {dim}Consequence:{reset} {self.consequence}",
68
+ ]
69
+
70
+ if self.metrics:
71
+ lines.append(f"├─ {cyan}Metrics:{reset} {self._format_metrics()}")
72
+
73
+ if self.recommendation:
74
+ lines.append(f"└─ {bold}Recommendation:{reset} {self.recommendation}")
75
+ else:
76
+ lines.append(f"└─ {dim}Status: Monitoring{reset}")
77
+
78
+ return "\n".join(lines)
79
+
80
+ def _format_metrics(self) -> str:
81
+ """Format metrics nicely"""
82
+ return ", ".join([f"{k}={v}" for k, v in self.metrics.items()])
83
+
84
+
85
+ class InterpretiveLogger:
86
+ """Human-readable system storytelling"""
87
+
88
+ def __init__(self, system_name: str):
89
+ self.system = system_name
90
+ self.entries: List[InterpretiveEntry] = []
91
+ self.start_time = time.time()
92
+
93
+ def log(self, impact: ImpactLevel, component: str, event: str,
94
+ context: str, consequence: str,
95
+ metrics: Optional[Dict] = None,
96
+ recommendation: Optional[str] = None):
97
+ """Record a system event"""
98
+
99
+ entry = InterpretiveEntry(
100
+ impact=impact,
101
+ system=self.system,
102
+ component=component,
103
+ event=event,
104
+ context=context,
105
+ consequence=consequence,
106
+ metrics=metrics or {},
107
+ recommendation=recommendation
108
+ )
109
+
110
+ self.entries.append(entry)
111
+ self._emit_to_container(entry)
112
+
113
+ def _emit_to_container(self, entry: InterpretiveEntry):
114
+ """Emit beautiful formatted log to container"""
115
+ print(entry.format_display())
116
+
117
+ # Convenience methods for common events
118
+ def service_start(self, component: str, port: int = None):
119
+ """Service started successfully"""
120
+ self.log(
121
+ ImpactLevel.LOW,
122
+ component,
123
+ "Service started",
124
+ f"Component initialized and ready for requests",
125
+ f"Accepting connections on port {port}" if port else "Ready for operations",
126
+ metrics={"port": port} if port else {},
127
+ recommendation="Monitor for healthy connections"
128
+ )
129
+
130
+ def service_error(self, component: str, error: str, impact: ImpactLevel = ImpactLevel.HIGH):
131
+ """Service encountered error"""
132
+ self.log(
133
+ impact,
134
+ component,
135
+ "Service error",
136
+ f"Component failed to process request",
137
+ f"May affect system reliability",
138
+ metrics={"error": error},
139
+ recommendation="Check component logs and restart if needed"
140
+ )
141
+
142
+ def data_processing(self, dataset: str, records: int, operations: List[str]):
143
+ """Data processing pipeline"""
144
+ self.log(
145
+ ImpactLevel.MEDIUM,
146
+ "DataProcessor",
147
+ f"Processing {dataset}",
148
+ f"Executing pipeline operations on dataset",
149
+ f"Will process {records:,} records through {len(operations)} stages",
150
+ metrics={
151
+ "dataset": dataset,
152
+ "records": records,
153
+ "operations": len(operations)
154
+ },
155
+ recommendation="Monitor processing progress and error rates"
156
+ )
157
+
158
+ def model_loaded(self, model_id: str, size_gb: float, device: str):
159
+ """AI model loaded into memory"""
160
+ self.log(
161
+ ImpactLevel.MEDIUM,
162
+ "ModelLoader",
163
+ f"Model {model_id} loaded",
164
+ f"Neural network loaded and ready for inference",
165
+ f"Consuming {size_gb:.1f}GB VRAM on {device}",
166
+ metrics={
167
+ "model": model_id,
168
+ "size_gb": size_gb,
169
+ "device": device
170
+ },
171
+ recommendation="Monitor GPU memory usage during inference"
172
+ )
173
+
174
+ def security_event(self, component: str, event: str, details: str):
175
+ """Security-related event"""
176
+ self.log(
177
+ ImpactLevel.CRITICAL,
178
+ component,
179
+ f"Security: {event}",
180
+ f"Security system detected potential threat",
181
+ f"Immediate investigation required",
182
+ metrics={"details": details},
183
+ recommendation="Review security logs and consider blocking source"
184
+ )
185
+
186
+ def performance_warning(self, component: str, metric: str, value: float, threshold: float):
187
+ """Performance threshold exceeded"""
188
+ self.log(
189
+ ImpactLevel.HIGH,
190
+ component,
191
+ f"Performance warning: {metric}",
192
+ f"Component performance degraded",
193
+ f"May impact user experience if continues",
194
+ metrics={metric: value, "threshold": threshold},
195
+ recommendation=f"Optimize {metric} or scale resources"
196
+ )
197
+
198
+ def cascade_observation(self, model: str, layers: int, merkle_root: str):
199
+ """CASCADE observed model execution"""
200
+ self.log(
201
+ ImpactLevel.LOW,  # ImpactLevel defines no INFO member
202
+ "CASCADE",
203
+ f"Model observation complete",
204
+ f"Cryptographic proof generated for model execution",
205
+ f"Merkle root provides verifiable audit trail",
206
+ metrics={
207
+ "model": model,
208
+ "layers": layers,
209
+ "merkle": merkle_root[:16] + "..."
210
+ },
211
+ recommendation="Store attestation for permanent records"
212
+ )
213
+
214
+ def fixed_point_convergence(self, operation: str, iterations: int, entities: int):
215
+ """Mathematical fixed point reached"""
216
+ self.log(
217
+ ImpactLevel.LOW,  # ImpactLevel defines no INFO member
218
+ "KleeneEngine",
219
+ f"Fixed point convergence",
220
+ f"{operation} completed after {iterations} iterations",
221
+ f"Resolved relationships for {entities} entities",
222
+ metrics={
223
+ "operation": operation,
224
+ "iterations": iterations,
225
+ "entities": entities
226
+ },
227
+ recommendation="Review convergence quality metrics"
228
+ )
229
+
230
+
231
+ # Global interpretive loggers
232
+ _interpretive_loggers: Dict[str, InterpretiveLogger] = {}
233
+
234
+
235
+ def get_interpretive_logger(system: str) -> InterpretiveLogger:
236
+ """Get or create interpretive logger for system"""
237
+ if system not in _interpretive_loggers:
238
+ _interpretive_loggers[system] = InterpretiveLogger(system)
239
+ return _interpretive_loggers[system]
240
+
241
+
242
+ # Bridge function to translate Kleene logs to interpretive
243
+ def translate_kleene_to_interpretive(kleene_entry, interpretive_logger):
244
+ """Translate mathematical log to human story"""
245
+
246
+ # Map Kleene levels to impact levels
247
+ impact_map = {
248
+ "CRITICAL": ImpactLevel.CRITICAL,
249
+ "ERROR": ImpactLevel.HIGH,
250
+ "WARNING": ImpactLevel.MEDIUM,
251
+ "INFO": ImpactLevel.LOW,
252
+ "DEBUG": ImpactLevel.TRACE,
253
+ "TRACE": ImpactLevel.TRACE
254
+ }
255
+
256
+ # Create human-readable context
257
+ if kleene_entry.fixed_point_reached:
258
+ event = f"Mathematical convergence achieved"
259
+ context = f"Operation {kleene_entry.operation} reached stable state"
260
+ consequence = "System can proceed with verified result"
261
+ else:
262
+ event = f"State transition in {kleene_entry.operation}"
263
+ context = f"Component processing through iterations"
264
+ consequence = "Continuing toward fixed point"
265
+
266
+ interpretive_logger.log(
267
+ impact_map.get(kleene_entry.level.value, ImpactLevel.LOW),
268
+ kleene_entry.component,
269
+ event,
270
+ context,
271
+ consequence,
272
+ metrics={
273
+ "iterations": kleene_entry.iteration_count,
274
+ "hash": kleene_entry.hash_value
275
+ }
276
+ )
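
The convenience methods above are thin wrappers over `log()` with the context and consequence pre-written, so call sites stay one line. A short sketch with illustrative values:

```python
# Sketch: using the pre-written story helpers of InterpretiveLogger.
from cascade.logging.interpretive_logger import get_interpretive_logger

ops = get_interpretive_logger("Model Observatory")
ops.model_loaded("mistralai/Mixtral-8x22B-Instruct-v0.1", size_gb=88.0, device="cuda")
ops.performance_warning("InferenceServer", metric="latency_ms", value=950.0, threshold=500.0)
ops.service_error("InferenceServer", error="connection reset by peer")
```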
cascade/logging/kleene_logger.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ CASCADE Kleene Fixed Point Logger
3
+ Industry-standard mathematical logging for debugging and verification.
4
+
5
+ Each log entry is a fixed point observation - hashable, verifiable, complete.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any, Dict, List, Optional
14
+ from contextlib import contextmanager
15
+
16
+
17
+ class LogLevel(Enum):
18
+ """Mathematical significance levels"""
19
+ CRITICAL = "CRITICAL" # System-breaking fixed point failure
20
+ ERROR = "ERROR" # Fixed point not reached
21
+ WARNING = "WARNING" # Unexpected state transition
22
+ INFO = "INFO" # Fixed point achieved
23
+ DEBUG = "DEBUG" # State transition details
24
+ TRACE = "TRACE" # Every computation step
25
+
26
+
27
+ @dataclass
28
+ class KleeneLogEntry:
29
+ """A single fixed point observation"""
30
+ timestamp: float = field(default_factory=time.time)
31
+ level: LogLevel = LogLevel.INFO
32
+ component: str = ""
33
+ operation: str = ""
34
+ state_before: Optional[Dict] = None
35
+ state_after: Optional[Dict] = None
36
+ fixed_point_reached: bool = False
37
+ iteration_count: int = 0
38
+ hash_value: str = field(init=False)
39
+
40
+ def __post_init__(self):
41
+ # Create content hash for verifiability
42
+ content = {
43
+ "timestamp": self.timestamp,
44
+ "component": self.component,
45
+ "operation": self.operation,
46
+ "state_before": self.state_before,
47
+ "state_after": self.state_after,
48
+ "iteration": self.iteration_count
49
+ }
50
+ self.hash_value = hashlib.sha256(
51
+ json.dumps(content, sort_keys=True).encode()
52
+ ).hexdigest()[:16]
53
+
54
+ def to_dict(self) -> Dict[str, Any]:
55
+ return {
56
+ "ts": self.timestamp,
57
+ "lvl": self.level.value,
58
+ "comp": self.component,
59
+ "op": self.operation,
60
+ "before": self.state_before,
61
+ "after": self.state_after,
62
+ "fixed": self.fixed_point_reached,
63
+ "iter": self.iteration_count,
64
+ "hash": self.hash_value
65
+ }
66
+
67
+
68
+ class KleeneLogger:
69
+ """Mathematical logging for fixed point systems"""
70
+
71
+ def __init__(self, component_name: str):
72
+ self.component = component_name
73
+ self.entries: List[KleeneLogEntry] = []
74
+ self.session_start = time.time()
75
+ self.operation_count = 0
76
+
77
+ def log(self, level: LogLevel, operation: str,
78
+ state_before: Optional[Dict] = None,
79
+ state_after: Optional[Dict] = None,
80
+ fixed_point: bool = False,
81
+ iterations: int = 0):
82
+ """Record a state transition"""
83
+
84
+ entry = KleeneLogEntry(
85
+ level=level,
86
+ component=self.component,
87
+ operation=operation,
88
+ state_before=state_before,
89
+ state_after=state_after,
90
+ fixed_point_reached=fixed_point,
91
+ iteration_count=iterations
92
+ )
93
+
94
+ self.entries.append(entry)
95
+ self._emit_to_container(entry)
96
+
97
+ def _emit_to_container(self, entry: KleeneLogEntry):
98
+ """Emit structured log to container with colors"""
99
+ # ANSI color codes
100
+ colors = {
101
+ "CRITICAL": "\033[91m", # Bright red
102
+ "ERROR": "\033[31m", # Red
103
+ "WARNING": "\033[33m", # Yellow
104
+ "INFO": "\033[32m", # Green
105
+ "DEBUG": "\033[36m", # Cyan
106
+ "TRACE": "\033[90m", # Gray
107
+ "RESET": "\033[0m", # Reset
108
+ "BOLD": "\033[1m", # Bold
109
+ "DIM": "\033[2m", # Dim
110
+ }
111
+
112
+ color = colors.get(entry.level.value, colors["RESET"])
113
+ reset = colors["RESET"]
114
+ dim = colors["DIM"]
115
+
116
+ # Format with colors
117
+ print(f"{color}[KLEENE]{reset} {color}{entry.level.value:8}{reset} | "
118
+ f"{dim}{entry.component:20}{reset} | "
119
+ f"{entry.operation:30} | "
120
+ f"Iter:{entry.iteration_count:3} | "
121
+ f"Fixed:{'Y' if entry.fixed_point_reached else 'N':1} | "
122
+ f"{dim}Hash:{entry.hash_value}{reset}")
123
+
124
+ @contextmanager
125
+ def observe_operation(self, operation: str, initial_state: Dict):
126
+ """Context manager for observing operations"""
127
+ self.operation_count += 1
128
+ iterations = 0
129
+
130
+ try:
131
+ self.log(LogLevel.DEBUG, f"{operation}_start",
132
+ state_before=initial_state)
133
+
134
+ # Yield control back to operation
135
+ yield self
136
+
137
+ # Operation completed successfully
138
+ self.log(LogLevel.INFO, f"{operation}_complete",
139
+ fixed_point=True, iterations=iterations)
140
+
141
+ except Exception as e:
142
+ self.log(LogLevel.ERROR, f"{operation}_failed",
143
+ state_after={"error": str(e)})
144
+ raise
145
+
146
+ def fixed_point(self, operation: str, final_state: Dict, iterations: int):
147
+ """Log successful fixed point convergence"""
148
+ self.log(LogLevel.INFO, f"{operation}_fixed_point",
149
+ state_after=final_state,
150
+ fixed_point=True,
151
+ iterations=iterations)
152
+
153
+ def divergence(self, operation: str, state: Dict):
154
+ """Log when system diverges (no fixed point)"""
155
+ self.log(LogLevel.WARNING, f"{operation}_divergence",
156
+ state_after=state,
157
+ fixed_point=False)
158
+
159
+ def critical_failure(self, operation: str, error_state: Dict):
160
+ """Log critical system failure"""
161
+ self.log(LogLevel.CRITICAL, f"{operation}_critical",
162
+ state_after=error_state,
163
+ fixed_point=False)
164
+
165
+ def get_session_hash(self) -> str:
166
+ """Get hash of entire session for verification"""
167
+ content = {
168
+ "component": self.component,
169
+ "start": self.session_start,
170
+ "operations": self.operation_count,
171
+ "entries": [e.hash_value for e in self.entries]
172
+ }
173
+ return hashlib.sha256(json.dumps(content).encode()).hexdigest()
174
+
175
+
176
+ # Global loggers for major components
177
+ _loggers: Dict[str, KleeneLogger] = {}
178
+
179
+
180
+ def get_kleene_logger(component: str) -> KleeneLogger:
181
+ """Get or create logger for component"""
182
+ if component not in _loggers:
183
+ _loggers[component] = KleeneLogger(component)
184
+ return _loggers[component]
185
+
186
+
187
+ # Convenience decorators
188
+ def log_fixed_point(operation: str):
189
+ """Decorator to automatically log fixed point operations"""
190
+ def decorator(func):
191
+ def wrapper(*args, **kwargs):
192
+ logger = get_kleene_logger(func.__module__)
193
+ start_state = {"args": str(args), "kwargs": str(kwargs)}
194
+
195
+ try:
196
+ result = func(*args, **kwargs)
197
+ logger.fixed_point(operation, {"result": str(result)}, 1)
198
+ return result
199
+ except Exception as e:
200
+ logger.critical_failure(operation, {"error": str(e)})
201
+ raise
202
+ return wrapper
203
+ return decorator
204
+
205
+
206
+ def log_iterations(operation: str):
207
+ """Decorator for operations that iterate to fixed points"""
208
+ def decorator(func):
209
+ def wrapper(*args, **kwargs):
210
+ logger = get_kleene_logger(func.__module__)
211
+
212
+ # Simulate iteration counting (real implementation would track)
213
+ result = func(*args, **kwargs)
214
+ iterations = getattr(result, 'iterations', 1)
215
+
216
+ logger.fixed_point(operation, {"converged": True}, iterations)
217
+ return result
218
+ return wrapper
219
+ return decorator
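
Because every entry is content-hashed, a whole session collapses to a single verifiable hash. A minimal sketch of the intended call pattern:

```python
# Sketch: recording a fixed point run and verifying the session afterwards.
from cascade.logging.kleene_logger import get_kleene_logger

resolver = get_kleene_logger("EntityResolver")

with resolver.observe_operation("resolve_entities", {"entities": 128}):
    pass  # the real iteration loop would run here

resolver.fixed_point("resolve_entities", {"entities": 128, "merged": 7}, iterations=4)
print("session hash:", resolver.get_session_hash())
```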
cascade/logging/log_manager.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ CASCADE Log Manager
3
+ Orchestrates the tsunami of data into ordered causation troops.
4
+
5
+ Manages log levels, routing, and the beautiful display of system truth.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import time
11
+ from typing import Dict, List, Optional, Any
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+
15
+ from .kleene_logger import KleeneLogger, LogLevel
16
+ from .interpretive_logger import InterpretiveLogger, ImpactLevel
17
+
18
+
19
+ class LogMode(Enum):
20
+ """The two modes of logging excellence"""
21
+ KLEENE = "kleene" # Mathematical precision
22
+ INTERPRETIVE = "interpretive" # Human stories
23
+ DUAL = "dual" # Both simultaneously
24
+
25
+
26
+ @dataclass
27
+ class LogConfig:
28
+ """Configuration for logging behavior"""
29
+ mode: LogMode = LogMode.DUAL
30
+ min_level_kleene: LogLevel = LogLevel.INFO
31
+ min_level_interpretive: ImpactLevel = ImpactLevel.LOW
32
+ show_metrics: bool = True
33
+ show_timestamps: bool = True
34
+ color_output: bool = True
35
+ file_output: bool = False
36
+ max_file_size_mb: int = 100
37
+
38
+
39
+ class CascadeLogManager:
40
+ """The conductor of your causation orchestra"""
41
+
42
+ def __init__(self, config: Optional[LogConfig] = None):
43
+ self.config = config or LogConfig()
44
+ self.kleene_loggers: Dict[str, KleeneLogger] = {}
45
+ self.interpretive_loggers: Dict[str, InterpretiveLogger] = {}
46
+ self.start_time = time.time()
47
+ self.operation_count = 0
48
+
49
+ # Initialize display
50
+ self._setup_display()
51
+
52
+ def _setup_display(self):
53
+ """Setup beautiful terminal output"""
54
+ if self.config.color_output:
55
+ # Enable ANSI colors
56
+ sys.stdout.reconfigure(encoding='utf-8')
57
+
58
+ # Print header
59
+ self._print_header()
60
+
61
+ def _print_header(self):
62
+ """Print beautiful cascade header with colors"""
63
+ # ANSI color codes
64
+ colors = {
65
+ "WAVE": "\033[94m", # Bright blue
66
+ "BRIDGE": "\033[96m", # Cyan
67
+ "BOLD": "\033[1m",
68
+ "DIM": "\033[2m",
69
+ "RESET": "\033[0m",
70
+ "GREEN": "\033[32m",
71
+ "YELLOW": "\033[33m",
72
+ }
73
+
74
+ wave = colors["WAVE"]
75
+ bridge = colors["BRIDGE"]
76
+ bold = colors["BOLD"]
77
+ dim = colors["DIM"]
78
+ reset = colors["RESET"]
79
+ green = colors["GREEN"]
80
+ yellow = colors["YELLOW"]
81
+
82
+ print(f"\n{bold}{'='*80}{reset}")
83
+ print(f"{wave}🌊{reset} {bold}CASCADE // TRUTH INFRASTRUCTURE{reset} {bridge}🧠{reset}")
84
+ print(f"{bold}{'='*80}{reset}")
85
+ print(f"{bold}Mode:{reset} {green}{self.config.mode.value.upper()}{reset}")
86
+ print(f"{bold}Started:{reset} {dim}{time.strftime('%Y-%m-%d %H:%M:%S')}{reset}")
87
+ print(f"{bold}{'='*80}{reset}\n")
88
+
89
+ def register_component(self, component: str, system: str = "CASCADE"):
90
+ """Register a component for logging"""
91
+ if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
92
+ kleene = KleeneLogger(component)
93
+ self.kleene_loggers[component] = kleene
94
+
95
+ if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
96
+ interpretive = InterpretiveLogger(system)
97
+ self.interpretive_loggers[system] = interpretive
98
+
99
+ def log_operation(self, component: str, operation: str,
100
+ level: LogLevel = LogLevel.INFO,
101
+ impact: ImpactLevel = ImpactLevel.LOW,
102
+ details: Optional[Dict] = None):
103
+ """Log an operation across all active loggers"""
104
+ self.operation_count += 1
105
+
106
+ if self.config.mode in [LogMode.KLEENE, LogMode.DUAL]:
107
+ if component in self.kleene_loggers:
108
+ self.kleene_loggers[component].log(
109
+ level, operation,
110
+ state_before=details.get("before") if details else None,
111
+ state_after=details.get("after") if details else None,
112
+ fixed_point=details.get("fixed_point", False) if details else False,
113
+ iterations=details.get("iterations", 0) if details else 0
114
+ )
115
+
116
+ if self.config.mode in [LogMode.INTERPRETIVE, LogMode.DUAL]:
117
+ # Find interpretive logger for component
118
+ system = details.get("system", "CASCADE") if details else "CASCADE"
119
+ if system in self.interpretive_loggers:
120
+ self.interpretive_loggers[system].log(
121
+ impact, component, operation,
122
+ context=details.get("context", "") if details else "",
123
+ consequence=details.get("consequence", "") if details else "",
124
+ metrics=details.get("metrics", {}) if details else {},
125
+ recommendation=details.get("recommendation") if details else None
126
+ )
127
+
128
+ def get_session_stats(self) -> Dict[str, Any]:
129
+ """Get beautiful session statistics"""
130
+ total_kleene = sum(len(logger.entries) for logger in self.kleene_loggers.values())
131
+ total_interpretive = sum(len(logger.entries) for logger in self.interpretive_loggers.values())
132
+
133
+ return {
134
+ "uptime_seconds": time.time() - self.start_time,
135
+ "operations": self.operation_count,
136
+ "kleene_entries": total_kleene,
137
+ "interpretive_entries": total_interpretive,
138
+ "active_components": len(self.kleene_loggers),
139
+ "active_systems": len(self.interpretive_loggers)
140
+ }
141
+
142
+ def print_summary(self):
143
+ """Print beautiful session summary with colors"""
144
+ stats = self.get_session_stats()
145
+
146
+ # ANSI color codes
147
+ colors = {
148
+ "BOLD": "\033[1m",
149
+ "DIM": "\033[2m",
150
+ "RESET": "\033[0m",
151
+ "CYAN": "\033[36m",
152
+ "GREEN": "\033[32m",
153
+ "YELLOW": "\033[33m",
154
+ "BLUE": "\033[34m",
155
+ "MAGENTA": "\033[35m",
156
+ }
157
+
158
+ bold = colors["BOLD"]
159
+ dim = colors["DIM"]
160
+ reset = colors["RESET"]
161
+ cyan = colors["CYAN"]
162
+ green = colors["GREEN"]
163
+ yellow = colors["YELLOW"]
164
+ blue = colors["BLUE"]
165
+ magenta = colors["MAGENTA"]
166
+
167
+ print(f"\n{bold}{'='*80}{reset}")
168
+ print(f"{cyan}📊 CASCADE SESSION SUMMARY{reset}")
169
+ print(f"{bold}{'='*80}{reset}")
170
+ print(f"{bold}Uptime:{reset} {stats['uptime_seconds']:.1f} seconds")
171
+ print(f"{bold}Operations:{reset} {green}{stats['operations']:,}{reset}")
172
+ print(f"{bold}Kleene Entries:{reset} {yellow}{stats['kleene_entries']:,}{reset}")
173
+ print(f"{bold}Interpretive Entries:{reset} {blue}{stats['interpretive_entries']:,}{reset}")
174
+ print(f"{bold}Active Components:{reset} {magenta}{stats['active_components']}{reset}")
175
+ print(f"{bold}Active Systems:{reset} {magenta}{stats['active_systems']}{reset}")
176
+
177
+ if stats['kleene_entries'] > 0:
178
+ # Get session hash from first logger
179
+ first_logger = next(iter(self.kleene_loggers.values()))
180
+ print(f"{bold}Session Hash:{reset} {dim}{first_logger.get_session_hash()}{reset}")
181
+
182
+ print(f"{bold}{'='*80}{reset}")
183
+
184
+ def set_mode(self, mode: LogMode):
185
+ """Switch logging mode dynamically"""
186
+ old_mode = self.config.mode
187
+ self.config.mode = mode
188
+
189
+ print(f"\n🔄 Logging mode changed: {old_mode.value} → {mode.value}")
190
+
191
+ def enable_file_logging(self, filepath: str):
192
+ """Enable logging to file"""
193
+ self.config.file_output = True
194
+ # TODO: Implement file logging
195
+ print(f"📁 File logging enabled: {filepath}")
196
+
197
+
198
+ # Global log manager instance
199
+ _log_manager: Optional[CascadeLogManager] = None
200
+
201
+
202
+ def init_logging(config: Optional[LogConfig] = None) -> CascadeLogManager:
203
+ """Initialize the global CASCADE logging system"""
204
+ global _log_manager
205
+ _log_manager = CascadeLogManager(config)
206
+ return _log_manager
207
+
208
+
209
+ def get_log_manager() -> CascadeLogManager:
210
+ """Get the global log manager"""
211
+ global _log_manager
212
+ if _log_manager is None:
213
+ _log_manager = CascadeLogManager()
214
+ return _log_manager
215
+
216
+
217
+ def log(component: str, operation: str, context: str = "", consequence: str = "",
218
+ metrics: Dict[str, Any] = None, impact: str = "LOW", **kwargs):
219
+ """Quick log operation - convenience function"""
220
+ manager = get_log_manager()
221
+ # Forward level/impact to the manager instead of burying them in the details dict
+ impact_level = impact if isinstance(impact, ImpactLevel) else ImpactLevel[str(impact).upper()]
+ manager.log_operation(component, operation,
+ level=kwargs.pop("level", LogLevel.INFO),
+ impact=impact_level,
222
+ details={
223
+ "context": context,
224
+ "consequence": consequence,
225
+ "metrics": metrics or {},
226
+ "impact": impact,
227
+ **kwargs
228
+ })
229
+
230
+
231
+ def log_fixed_point(component: str, operation: str, iterations: int, **kwargs):
232
+ """Log successful fixed point"""
233
+ log(component, operation,
234
+ level=LogLevel.INFO,
235
+ impact=ImpactLevel.LOW,
236
+ details={
237
+ "fixed_point": True,
238
+ "iterations": iterations,
239
+ **kwargs
240
+ })
241
+
242
+
243
+ def log_error(component: str, operation: str, error: str, **kwargs):
244
+ """Log error condition"""
245
+ log(component, f"{operation}_error",
246
+ level=LogLevel.ERROR,
247
+ impact=ImpactLevel.HIGH,
248
+ details={
249
+ "context": f"Operation failed: {error}",
250
+ "consequence": "System may be degraded",
251
+ "metrics": {"error": error},
252
+ **kwargs
253
+ })
254
+
255
+
256
+ def log_performance(component: str, metric: str, value: float, threshold: float):
257
+ """Log performance warning"""
258
+ log(component, f"performance_{metric}",
259
+ level=LogLevel.WARNING,
260
+ impact=ImpactLevel.MEDIUM,
261
+ details={
262
+ "context": f"Performance metric {metric} exceeded threshold",
263
+ "consequence": "May impact system performance",
264
+ "metrics": {metric: value, "threshold": threshold},
265
+ "recommendation": f"Optimize {metric} or scale resources"
266
+ })
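
The manager is the single switchboard: components register once, and every `log_operation` call fans out to whichever layers the configured mode enables. A sketch in interpretive-only mode (values illustrative):

```python
# Sketch: initialising the global manager and routing one operation through it.
from cascade.logging.log_manager import init_logging, LogConfig, LogMode

manager = init_logging(LogConfig(mode=LogMode.INTERPRETIVE, color_output=False))
manager.register_component("DataProcessor", system="Data Unity")

manager.log_operation("DataProcessor", "ingest",
                      details={"system": "Data Unity",
                               "context": "Loading batch",
                               "consequence": "Rows staged for entity resolution",
                               "metrics": {"rows": 1024}})

manager.print_summary()
```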
cascade/observation.py ADDED
@@ -0,0 +1,397 @@
1
+ """
2
+ CASCADE Observation Manager
3
+
4
+ Connects the detective tabs (Observatory, Unity, System) to the lattice.
5
+
6
+ Flow:
7
+ 1. User runs observation through any tab
8
+ 2. Observation creates provenance chain
9
+ 3. Chain links to model identity (for model obs) or genesis (for data/system)
10
+ 4. Chain saved to lattice
11
+ 5. Optionally pinned to IPFS
12
+
13
+ This is the integration layer between UI and lattice.
14
+ """
15
+
16
+ import json
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Optional, Dict, Any, List
20
+ from dataclasses import dataclass, field
21
+
22
+ from cascade.core.provenance import ProvenanceChain
23
+ from cascade.identity import ModelRegistry, ModelIdentity, create_model_identity
24
+ from cascade.genesis import get_genesis_root, link_to_genesis
25
+
26
+
27
+ @dataclass
28
+ class Observation:
29
+ """
30
+ A single observation record in the lattice.
31
+
32
+ Can be:
33
+ - Model observation (inference through Observatory)
34
+ - Data observation (entity resolution through Unity)
35
+ - System observation (log analysis through System tab)
36
+ """
37
+ observation_id: str
38
+ observation_type: str # "model", "data", "system"
39
+
40
+ # What was observed
41
+ source_id: str # Model ID, dataset ID, or log source
42
+ source_root: str # Merkle root of source identity
43
+
44
+ # The observation data
45
+ chain: ProvenanceChain
46
+ merkle_root: str
47
+
48
+ # Metadata
49
+ user_hash: Optional[str] = None # Anonymous user identifier
50
+ created_at: float = field(default_factory=time.time)
51
+
52
+ # IPFS
53
+ cid: Optional[str] = None
54
+
55
+
56
+ class ObservationManager:
57
+ """
58
+ Manages observations across all CASCADE tabs.
59
+
60
+ Responsibilities:
61
+ - Link observations to model identities or genesis
62
+ - Save observations to lattice
63
+ - Track observation history
64
+ - Provide stats for lattice gateway
65
+ """
66
+
67
+ def __init__(self, lattice_dir: Path = None):
68
+ self.lattice_dir = lattice_dir or Path(__file__).parent.parent / "lattice"
69
+ self.observations_dir = self.lattice_dir / "observations"
70
+ self.observations_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+ # Model registry for linking model observations
73
+ self.model_registry = ModelRegistry(self.lattice_dir)
74
+
75
+ # Genesis root
76
+ self.genesis_root = get_genesis_root()
77
+
78
+ # In-memory observation index
79
+ self._observations: Dict[str, Observation] = {}
80
+ self._load_index()
81
+
82
+ def _load_index(self):
83
+ """Load observation index from disk."""
84
+ index_file = self.lattice_dir / "observation_index.json"
85
+ if index_file.exists():
86
+ try:
87
+ index = json.loads(index_file.read_text())
88
+ # Just load metadata, not full chains
89
+ for obs_id, meta in index.items():
90
+ self._observations[obs_id] = meta
91
+ except:
92
+ pass
93
+
94
+ def _save_index(self):
95
+ """Save observation index to disk."""
96
+ index_file = self.lattice_dir / "observation_index.json"
97
+ # Save lightweight index
98
+ index = {}
99
+ for obs_id, obs in self._observations.items():
100
+ if isinstance(obs, Observation):
101
+ index[obs_id] = {
102
+ "observation_id": obs.observation_id,
103
+ "observation_type": obs.observation_type,
104
+ "source_id": obs.source_id,
105
+ "source_root": obs.source_root,
106
+ "merkle_root": obs.merkle_root,
107
+ "created_at": obs.created_at,
108
+ "cid": obs.cid,
109
+ }
110
+ else:
111
+ index[obs_id] = obs
112
+ index_file.write_text(json.dumps(index, indent=2))
113
+
114
+ def observe_model(
115
+ self,
116
+ model_id: str,
117
+ chain: ProvenanceChain,
118
+ user_hash: Optional[str] = None,
119
+ **model_kwargs,
120
+ ) -> Observation:
121
+ """
122
+ Record a model observation.
123
+
124
+ Args:
125
+ model_id: HuggingFace model ID or local path
126
+ chain: Provenance chain from Observatory
127
+ user_hash: Anonymous user identifier
128
+ **model_kwargs: Additional model info (parameters, etc.)
129
+
130
+ Returns:
131
+ Observation linked to model identity
132
+ """
133
+ # Get or create model identity
134
+ identity = self.model_registry.get_or_create(model_id, **model_kwargs)
135
+
136
+ # Link chain to model identity
137
+ if not chain.external_roots:
138
+ chain.external_roots = []
139
+ if identity.merkle_root not in chain.external_roots:
140
+ chain.external_roots.append(identity.merkle_root)
141
+
142
+ # Finalize chain if not already
143
+ if not chain.finalized:
144
+ chain.finalize()
145
+
146
+ # Create observation record
147
+ obs_id = f"model_{chain.merkle_root}"
148
+ observation = Observation(
149
+ observation_id=obs_id,
150
+ observation_type="model",
151
+ source_id=model_id,
152
+ source_root=identity.merkle_root,
153
+ chain=chain,
154
+ merkle_root=chain.merkle_root,
155
+ user_hash=user_hash,
156
+ )
157
+
158
+ # Save chain to disk
159
+ self._save_observation(observation)
160
+
161
+ return observation
162
+
163
+ def observe_data(
164
+ self,
165
+ dataset_a: str,
166
+ dataset_b: str,
167
+ chain: ProvenanceChain,
168
+ user_hash: Optional[str] = None,
169
+ ) -> Observation:
170
+ """
171
+ Record a data unity observation.
172
+
173
+ Links directly to genesis (data doesn't have model identity).
174
+ """
175
+ # Link to genesis
176
+ if not chain.external_roots:
177
+ chain.external_roots = []
178
+ if self.genesis_root not in chain.external_roots:
179
+ chain.external_roots.append(self.genesis_root)
180
+
181
+ if not chain.finalized:
182
+ chain.finalize()
183
+
184
+ # Create observation
185
+ source_id = f"{dataset_a}::{dataset_b}"
186
+ obs_id = f"data_{chain.merkle_root}"
187
+
188
+ observation = Observation(
189
+ observation_id=obs_id,
190
+ observation_type="data",
191
+ source_id=source_id,
192
+ source_root=self.genesis_root,
193
+ chain=chain,
194
+ merkle_root=chain.merkle_root,
195
+ user_hash=user_hash,
196
+ )
197
+
198
+ self._save_observation(observation)
199
+ return observation
200
+
201
+ def observe_system(
202
+ self,
203
+ source_name: str,
204
+ chain: ProvenanceChain,
205
+ user_hash: Optional[str] = None,
206
+ ) -> Observation:
207
+ """
208
+ Record a system log observation.
209
+
210
+ Links directly to genesis.
211
+ """
212
+ # Link to genesis
213
+ if not chain.external_roots:
214
+ chain.external_roots = []
215
+ if self.genesis_root not in chain.external_roots:
216
+ chain.external_roots.append(self.genesis_root)
217
+
218
+ if not chain.finalized:
219
+ chain.finalize()
220
+
221
+ obs_id = f"system_{chain.merkle_root}"
222
+
223
+ observation = Observation(
224
+ observation_id=obs_id,
225
+ observation_type="system",
226
+ source_id=source_name,
227
+ source_root=self.genesis_root,
228
+ chain=chain,
229
+ merkle_root=chain.merkle_root,
230
+ user_hash=user_hash,
231
+ )
232
+
233
+ self._save_observation(observation)
234
+ return observation
235
+
236
+ def _save_observation(self, observation: Observation):
237
+ """Save observation to disk."""
238
+ # Save to index
239
+ self._observations[observation.observation_id] = observation
240
+ self._save_index()
241
+
242
+ # Save full chain
243
+ chain_file = self.observations_dir / f"{observation.merkle_root}.json"
244
+ chain_data = {
245
+ "observation_id": observation.observation_id,
246
+ "observation_type": observation.observation_type,
247
+ "source_id": observation.source_id,
248
+ "source_root": observation.source_root,
249
+ "user_hash": observation.user_hash,
250
+ "created_at": observation.created_at,
251
+ "cid": observation.cid,
252
+ "chain": observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else str(observation.chain),
253
+ }
254
+ chain_file.write_text(json.dumps(chain_data, indent=2, default=str))
255
+
256
+ def pin_observation(self, observation: Observation) -> Optional[str]:
257
+ """
258
+ Pin observation to IPFS.
259
+
260
+ Returns CID if successful.
261
+ """
262
+ try:
263
+ from cascade.ipld import chain_to_cid, encode_to_dag_cbor
264
+ from cascade.web3_pin import pin_file
265
+
266
+ # Convert to IPLD format
267
+ chain_data = observation.chain.to_dict() if hasattr(observation.chain, 'to_dict') else {}
268
+ cbor_data = encode_to_dag_cbor(chain_data)
269
+
270
+ # Save CBOR
271
+ cbor_file = self.observations_dir / f"{observation.merkle_root}.cbor"
272
+ cbor_file.write_bytes(cbor_data)
273
+
274
+ # Compute CID
275
+ cid = chain_to_cid(chain_data)
276
+ observation.cid = cid
277
+
278
+ # Update index
279
+ self._save_observation(observation)
280
+
281
+ return cid
282
+ except Exception as e:
283
+ print(f"Failed to pin observation: {e}")
284
+ return None
285
+
286
+ def get_observation(self, merkle_root: str) -> Optional[Observation]:
287
+ """Get observation by merkle root."""
288
+ for obs in self._observations.values():
289
+ if isinstance(obs, Observation) and obs.merkle_root == merkle_root:
290
+ return obs
291
+ elif isinstance(obs, dict) and obs.get("merkle_root") == merkle_root:
292
+ return obs
293
+ return None
294
+
295
+ def list_observations(
296
+ self,
297
+ observation_type: Optional[str] = None,
298
+ source_id: Optional[str] = None,
299
+ limit: int = 100,
300
+ ) -> List[Dict[str, Any]]:
301
+ """List observations with optional filters."""
302
+ results = []
303
+
304
+ for obs in self._observations.values():
305
+ if isinstance(obs, Observation):
306
+ obs_dict = {
307
+ "observation_id": obs.observation_id,
308
+ "observation_type": obs.observation_type,
309
+ "source_id": obs.source_id,
310
+ "merkle_root": obs.merkle_root,
311
+ "created_at": obs.created_at,
312
+ "cid": obs.cid,
313
+ }
314
+ else:
315
+ obs_dict = obs
316
+
317
+ # Apply filters
318
+ if observation_type and obs_dict.get("observation_type") != observation_type:
319
+ continue
320
+ if source_id and source_id not in obs_dict.get("source_id", ""):
321
+ continue
322
+
323
+ results.append(obs_dict)
324
+
325
+ # Sort by time, newest first
326
+ results.sort(key=lambda x: x.get("created_at", 0), reverse=True)
327
+
328
+ return results[:limit]
329
+
330
+ def get_stats(self) -> Dict[str, Any]:
331
+ """Get lattice statistics."""
332
+ obs_list = list(self._observations.values())
333
+
334
+ model_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "model") or (isinstance(o, dict) and o.get("observation_type") == "model")]
335
+ data_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "data") or (isinstance(o, dict) and o.get("observation_type") == "data")]
336
+ system_obs = [o for o in obs_list if (isinstance(o, Observation) and o.observation_type == "system") or (isinstance(o, dict) and o.get("observation_type") == "system")]
337
+
338
+ # Count unique models
339
+ model_ids = set()
340
+ for o in model_obs:
341
+ if isinstance(o, Observation):
342
+ model_ids.add(o.source_id)
343
+ elif isinstance(o, dict):
344
+ model_ids.add(o.get("source_id", ""))
345
+
346
+ return {
347
+ "total_observations": len(obs_list),
348
+ "model_observations": len(model_obs),
349
+ "data_observations": len(data_obs),
350
+ "system_observations": len(system_obs),
351
+ "unique_models": len(model_ids),
352
+ "registered_models": len(self.model_registry.list_all()),
353
+ "genesis_root": self.genesis_root,
354
+ }
355
+
356
+ def get_model_observations(self, model_id: str) -> List[Dict[str, Any]]:
357
+ """Get all observations for a specific model."""
358
+ return self.list_observations(observation_type="model", source_id=model_id)
359
+
360
+
361
+ # =============================================================================
362
+ # SINGLETON INSTANCE
363
+ # =============================================================================
364
+
365
+ _manager: Optional[ObservationManager] = None
366
+
367
+ def get_observation_manager() -> ObservationManager:
368
+ """Get singleton observation manager."""
369
+ global _manager
370
+ if _manager is None:
371
+ _manager = ObservationManager()
372
+ return _manager
373
+
374
+
375
+ # =============================================================================
376
+ # CLI
377
+ # =============================================================================
378
+
379
+ if __name__ == "__main__":
380
+ print("=== CASCADE Observation Manager ===\n")
381
+
382
+ manager = get_observation_manager()
383
+
384
+ # Show stats
385
+ stats = manager.get_stats()
386
+ print(f"Genesis: {stats['genesis_root']}")
387
+ print(f"Registered Models: {stats['registered_models']}")
388
+ print(f"Total Observations: {stats['total_observations']}")
389
+ print(f" - Model: {stats['model_observations']}")
390
+ print(f" - Data: {stats['data_observations']}")
391
+ print(f" - System: {stats['system_observations']}")
392
+ print(f"Unique Models Observed: {stats['unique_models']}")
393
+
394
+ # List recent observations
395
+ print("\nRecent Observations:")
396
+ for obs in manager.list_observations(limit=5):
397
+ print(f" [{obs['observation_type']}] {obs['source_id'][:40]}... → {obs['merkle_root']}")
cascade/observe.py ADDED
@@ -0,0 +1,231 @@
1
+ """
2
+ Cascade Observer CLI.
3
+
4
+ Wraps a target process and observes its output.
5
+
6
+ Usage:
7
+ python -m cascade.observe --cmd "python path/to/train.py --args..."
8
+
9
+ This module:
10
+ 1. Wraps the target process
11
+ 2. Pipes stdout/stderr -> Cascade Adapter
12
+ 3. Writes events to tape file (JSONL) and human log (Markdown)
13
+ 4. Emits events to event_queue for external consumers
14
+
15
+ For visualization, point a consumer at the event_queue or load the tape file
16
+ into your preferred visualization tool.
17
+ """
18
+
19
+ import sys
20
+ import subprocess
21
+ import argparse
22
+ import time
23
+ import json
24
+ import shlex
25
+ import shutil
26
+ from pathlib import Path
27
+ from queue import Queue
28
+
29
+ # Ensure package root is in path
30
+ sys.path.insert(0, str(Path(__file__).parent.parent))
31
+
32
+ from cascade import Monitor
33
+
34
+ # Shared event queue for external consumers (e.g., custom UIs)
35
+ event_queue: Queue = Queue()
36
+
37
+
38
+ def scoop_the_poop(log_dir: Path):
39
+ """
40
+ Baggies system - archive old logs on startup.
41
+ Keeps the logs folder clean. Old sessions go to baggies/.
42
+ """
43
+ baggies_dir = log_dir / "baggies"
44
+ baggies_dir.mkdir(parents=True, exist_ok=True)
45
+
46
+ # Find all old log files (not the current session)
47
+ tape_files = list(log_dir.glob("cascade_tape_*.jsonl"))
48
+ log_files = list(log_dir.glob("cascade_log_*.md"))
49
+
50
+ moved_count = 0
51
+ for f in tape_files + log_files:
52
+ if f.parent == log_dir: # Only files in root logs/, not baggies/
53
+ dest = baggies_dir / f.name
54
+ try:
55
+ shutil.move(str(f), str(dest))
56
+ moved_count += 1
57
+ except Exception as e:
58
+ print(f"[CASCADE] Could not archive {f.name}: {e}")
59
+
60
+ if moved_count > 0:
61
+ print(f"[CASCADE] 🧹 Scooped {moved_count} old logs → baggies/")
62
+
63
+
64
+ def main():
65
+ parser = argparse.ArgumentParser(
66
+ prog="cascade",
67
+ description="🌊 Cascade - Real-Time Neural Network Observability",
68
+ formatter_class=argparse.RawDescriptionHelpFormatter,
69
+ epilog="""
70
+ Examples:
71
+ cascade --cmd "python train.py"
72
+ cascade --cmd "python train.py --epochs=10"
73
+ cascade --cmd "python train.py" --cwd /path/to/project
74
+
75
+ Events are written to tape files in the log directory.
76
+ """
77
+ )
78
+
79
+ # Support both "cascade --cmd" and "cascade observe --cmd"
80
+ subparsers = parser.add_subparsers(dest="command")
81
+ observe_parser = subparsers.add_parser("observe", help="Observe a training process")
82
+
83
+ # Add args to both main parser and observe subparser
84
+ for p in [parser, observe_parser]:
85
+ p.add_argument("--cmd", required=True, help="Command to run the target process")
86
+ p.add_argument("--cwd", default=None, help="Working directory for the target (absolute path)")
87
+ p.add_argument("--log-dir", default="./logs", help="Directory for session tapes")
88
+ p.add_argument("--quiet", "-q", action="store_true", help="Suppress console output")
89
+
90
+ args = parser.parse_args()
+ if not args.cmd:
+ parser.error("--cmd is required")
91
+
92
+ # Resolve working directory to absolute
93
+ if args.cwd:
94
+ work_dir = Path(args.cwd).resolve()
95
+ else:
96
+ work_dir = Path.cwd()
97
+
98
+ # 0. Set up the session tape (the session's primary output)
99
+ log_dir = Path(args.log_dir).resolve()
100
+ log_dir.mkdir(parents=True, exist_ok=True)
101
+
102
+ # 🧹 Scoop old logs before starting new session
103
+ scoop_the_poop(log_dir)
104
+
105
+ session_id = int(time.time())
106
+
107
+ # 1. Machine Tape (JSONL)
108
+ tape_path = log_dir / f"cascade_tape_{session_id}.jsonl"
109
+ tape_file = open(tape_path, "a", encoding="utf-8")
110
+
111
+ # 2. Human Log (Markdown)
112
+ human_path = log_dir / f"cascade_log_{session_id}.md"
113
+ human_file = open(human_path, "a", encoding="utf-8")
114
+
115
+ # Header for Human Log
116
+ human_file.write(f"# CASCADE MISSION LOG // SESSION {session_id}\n")
117
+ human_file.write(f"**Target:** `{args.cmd}`\n")
118
+ human_file.write(f"**Date:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
119
+ human_file.write("---\n\n")
120
+ human_file.flush()
121
+
122
+ print("="*60)
123
+ print("CASCADE // OBSERVER")
124
+ print(f"Target: {args.cmd}")
125
+ print(f"Tape: {tape_path.absolute()}")
126
+ print(f"Log: {human_path.absolute()}")
127
+ print("="*60)
128
+
129
+ # Init Monitor
130
+ monitor = Monitor("symbiont_alpha")
131
+
132
+ def write_human_entry(evt):
133
+ """Convert an event into an articulate log entry."""
134
+ t_str = time.strftime('%H:%M:%S', time.localtime(evt.timestamp))
135
+
136
+ # Narrative construction based on event type
137
+ if evt.event_type == "error":
138
+ icon = "🔴"
139
+ narrative = f"CRITICAL FAILURE in **{evt.component}**."
140
+ elif evt.event_type == "warning":
141
+ icon = "⚠️"
142
+ narrative = f"Warning signal detected from **{evt.component}**."
143
+ elif evt.event_type == "state_change":
144
+ icon = "🔄"
145
+ narrative = f"State transition observed in **{evt.component}**."
146
+ elif "loss" in str(evt.data):
147
+ icon = "📉"
148
+ narrative = f"Optimization step completed by **{evt.component}**."
149
+ else:
150
+ icon = "ℹ️"
151
+ narrative = f"Standard event recorded from **{evt.component}**."
152
+
153
+ # Write readable block
154
+ human_file.write(f"### {icon} {t_str} // {evt.event_type.upper()}\n")
155
+ human_file.write(f"{narrative}\n")
156
+ if evt.data:
157
+ # Format data as a clean list or quote
158
+ human_file.write("```yaml\n")
159
+ for k, v in evt.data.items():
160
+ human_file.write(f"{k}: {v}\n")
161
+ human_file.write("```\n")
162
+ human_file.write("\n")
163
+ human_file.flush()
164
+
165
+ # Launch Target
166
+ try:
167
+ # Split command for subprocess if it's a string
168
+ cmd_parts = shlex.split(args.cmd)
169
+
170
+ process = subprocess.Popen(
171
+ cmd_parts,
172
+ cwd=str(work_dir),
173
+ stdout=subprocess.PIPE,
174
+ stderr=subprocess.STDOUT,
175
+ text=True,
176
+ bufsize=1
177
+ )
178
+
179
+ print(f"[CASCADE] Linked to target. Recording to tape & log...")
180
+
181
+ for line in process.stdout:
182
+ line = line.strip()
183
+ if not line: continue
184
+
185
+ # Feed Adapter
186
+ event = monitor.observe(line)
187
+
188
+ # Build payload with FULL wealth: metrics + triage + raw
189
+ metrics_summary = monitor.metrics.summary()
190
+ triage_status = monitor.metrics.triage()
191
+
192
+ payload = {
193
+ "event": {
194
+ "event_id": event.event_id,
195
+ "timestamp": event.timestamp,
196
+ "component": event.component,
197
+ "event_type": event.event_type,
198
+ "data": event.data,
199
+ "raw": line, # Include original line for drill-down
200
+ },
201
+ "metrics": metrics_summary,
202
+ "triage": triage_status,
203
+ }
204
+
205
+ # Emit to queue for external consumers
206
+ event_queue.put(payload)
207
+
208
+ # Write to Tape (Machine)
209
+ tape_file.write(json.dumps(payload) + "\n")
210
+ tape_file.flush()
211
+
212
+ # Write to Log (Human)
213
+ write_human_entry(event)
214
+
215
+ # Echo to console (unless quiet)
216
+ if not args.quiet:
217
+ print(f"[RAW] {line}")
218
+
219
+ except KeyboardInterrupt:
220
+ print("\n[CASCADE] Detaching...")
221
+ except Exception as e:
222
+ print(f"[CASCADE] Error: {e}")
223
+ finally:
224
+ tape_file.close()
225
+ human_file.close()
226
+ if 'process' in locals() and process.poll() is None:
227
+ process.terminate()
228
+ print(f"[CASCADE] Session complete. Tape: {tape_path}")
229
+
230
+ if __name__ == "__main__":
231
+ main()
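
The tape file written above is plain JSONL, one payload per observed output line, each with `event`, `metrics`, and `triage` keys. A minimal offline reader, assuming only that layout; the tape path below is a hypothetical session file, not one produced by this commit.

```python
# Offline tape reader sketch. Assumes only the payload structure written by
# cascade.observe above: {"event": {...}, "metrics": {...}, "triage": ...}.
import json
from pathlib import Path

# Hypothetical session file; substitute a real path from your logs/ directory.
tape_path = Path("logs/cascade_tape_1700000000.jsonl")

errors = 0
last_triage = None
with tape_path.open(encoding="utf-8") as fh:
    for raw in fh:
        payload = json.loads(raw)
        if payload["event"]["event_type"] == "error":
            errors += 1
        # Every record carries the metrics snapshot and triage status the
        # observer computed when the event was recorded.
        last_triage = payload.get("triage")

print(f"errors on tape: {errors}, final triage: {last_triage}")
```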
cascade/patches/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ CASCADE Patches - Auto-intercept LLM provider libraries
3
+
4
+ Each patch module wraps a provider's API to automatically emit receipts.
5
+ """
6
+
7
+ from .openai_patch import patch_openai
8
+ from .anthropic_patch import patch_anthropic
9
+ from .huggingface_patch import patch_huggingface
10
+ from .ollama_patch import patch_ollama
11
+ from .litellm_patch import patch_litellm
12
+
13
+ __all__ = [
14
+ "patch_openai",
15
+ "patch_anthropic",
16
+ "patch_huggingface",
17
+ "patch_ollama",
18
+ "patch_litellm",
19
+ ]
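
A sketch of how these patchers might be applied at startup. Only the function names are shown above; the zero-argument call signature is an assumption, so check each patch module for the actual parameters before relying on this.

```python
# Hypothetical wiring sketch: the patch_* functions exist per the __init__
# above, but their signatures are not shown there, so the zero-argument
# calls below are an assumption rather than the documented API.
from cascade.patches import patch_openai, patch_anthropic

patch_openai()      # assumed: wraps the OpenAI client so calls emit receipts
patch_anthropic()   # assumed: same for the Anthropic client
```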