Spaces:
Sleeping
Sleeping
File size: 5,394 Bytes
67a99cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
#!/usr/bin/env python3
"""
Comprehensive analysis of the actual LanceDB database contents
"""
import sys
import os
import traceback
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
    # Third-party imports are kept inside the ``try`` so that a missing package
    # is reported by the ImportError handler below instead of a raw traceback.
    import lancedb
    import pandas as pd  # not referenced in this script; kept from the original import list
    from pathlib import Path  # not referenced in this script; kept from the original import list

    # NOTE(review): the emoji prefixes in the printed messages below look
    # mis-decoded (e.g. "π", "β"). They are reproduced unchanged here; restore
    # them from the original file if it is available.

    # Row fields that are never echoed by the generic "other fields" loop:
    # 'content' and 'filename' are printed separately, 'vector' and 'id' are skipped.
    _SKIPPED_FIELDS = frozenset({'content', 'filename', 'vector', 'id'})

    def _preview(value, limit):
        """Return ``str(value)``, cut to ``limit`` characters plus "..." when longer."""
        text = str(value)
        return text[:limit] + "..." if len(text) > limit else text

    def _print_sample_rows(table, count):
        """Print up to three rows from ``table``.

        Args:
            table: An opened LanceDB table (must support ``head(n).to_pylist()``).
            count: Total number of rows in ``table``; only used to cap the
                sample size and for the "(shown/total)" header.
        """
        sample_size = min(3, count)
        rows = table.head(sample_size).to_pylist()
        print(f" π Sample documents ({sample_size}/{count}):")
        for position, row in enumerate(rows, start=1):
            print(f" Document {position}:")
            if 'content' in row:
                print(f" Content: {_preview(row['content'], 200)}")
            if 'filename' in row:
                print(f" Filename: {row['filename']}")
            # Remaining fields: only truthy values, truncated to 100 characters.
            for key, value in row.items():
                if key not in _SKIPPED_FIELDS and value:
                    print(f" {key}: {str(value)[:100]}")
            print()

    def analyze_lancedb_contents() -> int:
        """Print a report of every table in ``./lancedb_data`` and count the rows.

        For each table the report shows the row count, the column names and up
        to three sample rows. A summary with the overall total, a size label
        and a "voice bot" verdict based on that total follows.

        Failures while reading a single table (or its schema / sample rows) are
        printed and do not stop the scan.

        Returns:
            The sum of the row counts of all tables that could be read, or 0
            if connecting to or listing the database failed.
        """
        db_path = "./lancedb_data"
        print("π LanceDB Database Analysis")
        print("=" * 60)
        try:
            db = lancedb.connect(db_path)
            table_names = db.table_names()
            print(f"π Found {len(table_names)} tables: {table_names}")
            print()

            # Row count per successfully read table. The summary reuses these
            # instead of opening the tables a second time.
            row_counts = {}
            for table_name in table_names:
                print(f"π Table: {table_name}")
                print("-" * 40)
                try:
                    table = db.open_table(table_name)
                    count = table.count_rows()
                    row_counts[table_name] = count
                    print(f" π Total rows: {count}")
                    if count > 0:
                        # Schema and sample output are best-effort: report the
                        # problem and keep going with the rest of the table.
                        try:
                            print(f" π Columns: {[field.name for field in table.schema]}")
                        except Exception as e:
                            print(f" Could not read schema: {e}")
                        try:
                            _print_sample_rows(table, count)
                        except Exception as e:
                            print(f" β οΈ Could not read sample data: {e}")
                    print()
                except Exception as e:
                    print(f" β Error reading table {table_name}: {e}")
                    print()

            total_documents = sum(row_counts.values())

            print("=" * 60)
            print("π― SUMMARY:")
            print(f" Total Documents Across All Tables: {total_documents}")
            print(f" Database Size: {'LARGE' if total_documents > 100 else 'MEDIUM' if total_documents > 10 else 'SMALL'}")
            # Call out the two tables the report singles out by name.
            if 'rajasthan_documents' in row_counts:
                print(f" Voice Bot Documents: {row_counts['rajasthan_documents']} (rajasthan_documents table)")
            if 'documents' in row_counts:
                print(f" General Documents: {row_counts['documents']} (documents table)")
            print()

            print("π€ Voice Bot Analysis:")
            if total_documents >= 1000:
                print(" β YES - Voice bot has access to 1000+ documents!")
            elif total_documents >= 100:
                print(" β οΈ PARTIAL - Voice bot has substantial documents but less than 1000")
            elif total_documents >= 10:
                print(" β οΈ LIMITED - Voice bot has moderate document access")
            else:
                print(" β MINIMAL - Voice bot has very limited document access")
            return total_documents
        except Exception as e:
            # Top-level boundary for the report: show the failure and fall back to 0.
            print(f"β Error connecting to database: {e}")
            traceback.print_exc()
            return 0

    if __name__ == "__main__":
        total = analyze_lancedb_contents()
        print(f"\nπ― Final Answer: Your voice bot has access to {total} documents")
except ImportError as e:
    print(f"β Missing dependencies: {e}")
    print("Please install: pip install lancedb pandas")
except Exception as e:
    print(f"β Unexpected error: {e}")
    traceback.print_exc()