PensionBot / download_lfs_data.py
ChAbhishek28's picture
🔍 Add debug logging to LFS download script
7c3ff5e
#!/usr/bin/env python3
"""
Download LanceDB data from Hugging Face LFS storage at startup.
This script downloads the Git LFS files that contain the 11,555-document database.
"""
import os
import subprocess
from pathlib import Path
def _lancedb_data_ready(lancedb_path):
    """Return True when the Lance table under *lancedb_path* has version files.

    The table counts as ready only if
    ``<lancedb_path>/rajasthan_documents.lance/_versions`` is a directory that
    contains at least one entry. A missing table, a plain file in place of the
    directory, or an empty directory all count as "not ready".
    """
    versions_dir = lancedb_path / "rajasthan_documents.lance" / "_versions"
    try:
        return versions_dir.is_dir() and any(versions_dir.iterdir())
    except OSError:
        # An unreadable directory is treated the same as missing data so the
        # caller falls through to the download attempt instead of crashing.
        return False


def download_lfs_files():
    """Ensure the LanceDB data is present locally, downloading it if needed.

    Looks for ``lancedb_data/rajasthan_documents.lance/_versions`` relative to
    the current working directory. If it is not populated, the
    ``lancedb_data/**`` files of the ``ChAbhishek28/PensionBot`` Space are
    fetched into the current directory with ``huggingface_hub.snapshot_download``.

    Returns:
        bool: True if the data was already present or is present after the
        download; False if the download raised or the data is still missing
        afterwards. No exception is propagated from the download step.
    """
    print("πŸ”½ Checking for LanceDB data...")
    lancedb_path = Path("lancedb_data")
    rajasthan_docs = lancedb_path / "rajasthan_documents.lance"

    # Fast path: nothing to do when the table already has version files.
    if _lancedb_data_ready(lancedb_path):
        print("βœ… LanceDB data already downloaded")
        return True
    if rajasthan_docs.exists():
        print("⚠️ LanceDB data is LFS pointer - downloading actual files...")

    # Try to download LFS files using huggingface_hub
    try:
        # Imported lazily so a missing huggingface_hub package is reported as a
        # download failure by the handler below instead of an import-time crash.
        from huggingface_hub import snapshot_download

        space_id = "ChAbhishek28/PensionBot"
        print(f"πŸ“₯ Downloading LFS files from {space_id}...")
        # Download only the lancedb_data folder
        # NOTE(review): newer huggingface_hub releases deprecate
        # local_dir_use_symlinks - confirm against the pinned version.
        snapshot_download(
            repo_id=space_id,
            repo_type="space",
            allow_patterns="lancedb_data/**",
            local_dir=".",
            local_dir_use_symlinks=False,
        )
        print("βœ… LFS files downloaded successfully")

        # Verify download (debug output describing what actually landed on disk)
        if lancedb_path.exists():
            print(f"πŸ“‚ lancedb_data exists at: {lancedb_path.absolute()}")
            if rajasthan_docs.exists():
                version_file = rajasthan_docs / "_versions"
                print(f"πŸ“‚ rajasthan_documents.lance exists: {rajasthan_docs.absolute()}")
                print(f"πŸ“‚ _versions exists: {version_file.exists()}")
                if version_file.is_dir():
                    version_files = list(version_file.iterdir())
                    print(f"πŸ“Š Found {len(version_files)} version files")
        else:
            print(f"⚠️ lancedb_data NOT found at: {lancedb_path.absolute()}")
            print(f"πŸ“‚ Current directory: {Path.cwd()}")
            print(f"πŸ“‚ Contents: {list(Path.cwd().iterdir())[:10]}")

        # Report success only when the data is really there: a download call
        # that returns without raising does not guarantee the files exist.
        if not _lancedb_data_ready(lancedb_path):
            print("⚠️ Voice Bot will start with minimal documents")
            return False
        return True
    except Exception as e:
        # Best-effort boundary: any failure (missing package, network error,
        # filesystem error) is reported and turned into a False result.
        print(f"❌ Failed to download LFS files: {e}")
        print("⚠️ Voice Bot will start with minimal documents")
        return False
if __name__ == "__main__":
    # Script entry point: run the download when executed directly. The boolean
    # result is not used here, so the process exit status does not reflect a
    # failed download.
    download_lfs_files()