#!/usr/bin/env python3
"""
Download LanceDB data from Hugging Face LFS storage at startup.
This script downloads the Git LFS files that contain the 11,555 document database.
"""

import os
import subprocess
from pathlib import Path


def _verify_download(lancedb_path: Path) -> bool:
    """Print diagnostics about the downloaded data and report whether it is usable.

    Args:
        lancedb_path: Directory expected to contain the LanceDB tables.

    Returns:
        True when ``rajasthan_documents.lance/_versions`` exists under
        *lancedb_path*, False otherwise.
    """
    if not lancedb_path.exists():
        print(f"⚠️ lancedb_data NOT found at: {lancedb_path.absolute()}")
        print(f"📂 Current directory: {Path.cwd()}")
        print(f"📂 Contents: {list(Path.cwd().iterdir())[:10]}")
        return False

    print(f"📂 lancedb_data exists at: {lancedb_path.absolute()}")
    rajasthan_docs = lancedb_path / "rajasthan_documents.lance"
    if not rajasthan_docs.exists():
        return False

    version_dir = rajasthan_docs / "_versions"
    has_versions = version_dir.exists()
    print(f"📂 rajasthan_documents.lance exists: {rajasthan_docs.absolute()}")
    print(f"📂 _versions exists: {has_versions}")
    if has_versions:
        version_count = sum(1 for _ in version_dir.iterdir())
        print(f"📊 Found {version_count} version files")
    return has_versions


def download_lfs_files() -> bool:
    """Download Git LFS files from the Hugging Face Space repository.

    The data is considered present when
    ``lancedb_data/rajasthan_documents.lance/_versions`` exists relative to the
    current working directory; in that case nothing is downloaded.

    Returns:
        True if the data was already present or was downloaded and the
        ``_versions`` directory exists afterwards. False if the download
        raised, or if it completed but the expected data is still missing.
    """
    print("🔽 Checking for LanceDB data...")

    lancedb_path = Path("lancedb_data")
    rajasthan_docs = lancedb_path / "rajasthan_documents.lance"

    # A populated table has a "_versions" entry; without it the table path is
    # treated as an LFS pointer that still needs the real files.
    if (rajasthan_docs / "_versions").exists():
        print("✅ LanceDB data already downloaded")
        return True
    if rajasthan_docs.exists():
        print("⚠️ LanceDB data is LFS pointer - downloading actual files...")

    # Best-effort: any failure here (missing huggingface_hub, network error,
    # filesystem error during verification) is reported and turned into False
    # so that startup can continue.
    try:
        # Imported lazily so a missing package is handled like any other
        # download failure instead of breaking module import.
        from huggingface_hub import snapshot_download

        space_id = "ChAbhishek28/PensionBot"

        print(f"📥 Downloading LFS files from {space_id}...")

        # Download only the lancedb_data folder
        # NOTE(review): newer huggingface_hub releases deprecate
        # local_dir_use_symlinks - confirm against the pinned version.
        snapshot_download(
            repo_id=space_id,
            repo_type="space",
            allow_patterns="lancedb_data/**",
            local_dir=".",
            local_dir_use_symlinks=False,
        )

        print("✅ LFS files downloaded successfully")

        # Only report success when the expected data is really on disk.
        if _verify_download(lancedb_path):
            return True
    except Exception as e:
        print(f"❌ Failed to download LFS files: {e}")

    print("⚠️ Voice Bot will start with minimal documents")
    return False


if __name__ == "__main__":
    download_lfs_files()