PensionBot / analyze_database.py
ChAbhishek28's picture
Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs
67a99cd
#!/usr/bin/env python3
"""
Comprehensive analysis of the actual LanceDB database contents
"""
import sys
import os
import traceback
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
import lancedb
import pandas as pd
from pathlib import Path
def analyze_lancedb_contents(db_path: str = "./lancedb_data") -> int:
    """Print a per-table report of a LanceDB database and return its row total.

    For every table the report shows the row count, the column names and up
    to three sample rows. It ends with a summary: the combined row count, a
    size rating, the counts of the ``rajasthan_documents`` and ``documents``
    tables when they were read, and a voice-bot coverage verdict derived from
    the combined row count.

    Args:
        db_path: Path handed to ``lancedb.connect``. Defaults to
            ``"./lancedb_data"``, the location this script has always used,
            so existing zero-argument callers are unaffected.

    Returns:
        The sum of the row counts of every table that could be read, or ``0``
        when connecting to the database or producing the report fails (the
        error and its traceback are printed in that case).
    """
    print("🔍 LanceDB Database Analysis")
    print("=" * 60)

    try:
        db = lancedb.connect(db_path)
        table_names = db.table_names()
        print(f"📊 Found {len(table_names)} tables: {table_names}")
        print()

        # Row counts of the tables that were read successfully. The summary
        # reuses them, so no table is opened or counted a second time and a
        # table that already failed cannot abort the whole report.
        row_counts = {}
        for table_name in table_names:
            print(f"📋 Table: {table_name}")
            print("-" * 40)
            try:
                row_counts[table_name] = _report_table(db.open_table(table_name))
            except Exception as e:
                # One unreadable table must not stop the remaining tables.
                print(f" ❌ Error reading table {table_name}: {e}")
            print()

        total_documents = sum(row_counts.values())

        if total_documents > 100:
            size_label = "LARGE"
        elif total_documents > 10:
            size_label = "MEDIUM"
        else:
            size_label = "SMALL"

        print("=" * 60)
        print("🎯 SUMMARY:")
        print(f" Total Documents Across All Tables: {total_documents}")
        print(f" Database Size: {size_label}")

        # Call out the two tables the summary reports individually.
        highlighted = (
            ("rajasthan_documents", "Voice Bot Documents"),
            ("documents", "General Documents"),
        )
        for table_name, label in highlighted:
            if table_name in row_counts:
                print(f" {label}: {row_counts[table_name]} ({table_name} table)")

        print()
        print("🤖 Voice Bot Analysis:")
        if total_documents >= 1000:
            print(" ✅ YES - Voice bot has access to 1000+ documents!")
        elif total_documents >= 100:
            print(" ⚠️ PARTIAL - Voice bot has substantial documents but less than 1000")
        elif total_documents >= 10:
            print(" ⚠️ LIMITED - Voice bot has moderate document access")
        else:
            print(" ❌ MINIMAL - Voice bot has very limited document access")

        return total_documents

    except Exception as e:
        # Top-level boundary of the report: show what went wrong and fall
        # back to 0 so callers always receive an integer.
        print(f"❌ Error connecting to database: {e}")
        traceback.print_exc()
        return 0


def _report_table(table) -> int:
    """Print row count, column names and sample rows of one table; return the row count."""
    count = table.count_rows()
    print(f" 📊 Total rows: {count}")

    if count > 0:
        try:
            print(f" 📝 Columns: {[field.name for field in table.schema]}")
        except Exception as e:
            # Listing columns is best-effort, but report why it was skipped
            # instead of hiding the failure.
            print(f" ⚠️ Could not read schema: {e}")

        try:
            _print_sample_rows(table, count)
        except Exception as e:
            print(f" ⚠️ Could not read sample data: {e}")

    return count


def _print_sample_rows(table, count: int) -> None:
    """Print a short preview of up to three rows taken from the head of the table."""
    # Fields that are either printed separately or deliberately left out of
    # the generic "other fields" listing.
    skipped_fields = ('content', 'filename', 'vector', 'id')

    sample_size = min(3, count)
    sample_data = table.head(sample_size).to_pylist()
    print(f" 📄 Sample documents ({sample_size}/{count}):")

    for number, row in enumerate(sample_data, start=1):
        print(f" Document {number}:")

        if 'content' in row:
            text = str(row['content'])
            # Cap long content at 200 characters and mark the cut.
            preview = text[:200] + "..." if len(text) > 200 else text
            print(f" Content: {preview}")

        if 'filename' in row:
            print(f" Filename: {row['filename']}")

        # Remaining non-empty fields, each capped at 100 characters.
        for key, value in row.items():
            if key not in skipped_fields and value:
                print(f" {key}: {str(value)[:100]}")
        print()
# Script entry point: run the analysis and print the total it returned.
if __name__ == "__main__":
total = analyze_lancedb_contents()
print(f"\n🎯 Final Answer: Your voice bot has access to {total} documents")
# Handlers of the module-level `try` that encloses the third-party imports,
# the function definition and the entry point above.
except ImportError as e:
# An import inside the `try` failed: name the problem and the install command.
print(f"❌ Missing dependencies: {e}")
print("Please install: pip install lancedb pandas")
except Exception as e:
# Any other exception raised while the module body runs: print it with its traceback.
print(f"❌ Unexpected error: {e}")
traceback.print_exc()