|
|
""" |
|
|
Test data validation fixes for MCP paper parsing and PDF processing.

This test verifies that malformed data (dicts supplied where lists or strings are expected) is handled correctly.
|
|
""" |
|
|
import sys |
|
|
from datetime import datetime |
|
|
from utils.schemas import Paper |
|
|
from utils.pdf_processor import PDFProcessor |
|
|
|
|
|
|
|
|
def test_paper_schema_validators():
    """Run the Paper schema against dict-shaped (malformed) field values.

    Three scenarios execute in order, each printing its progress to stdout:

    1. ``authors`` supplied as a dict.
    2. ``categories`` supplied as a dict.
    3. ``title``, ``authors``, ``abstract``, ``pdf_url`` and ``categories``
       all supplied as dicts.

    Returns:
        bool: True when every scenario builds a Paper and passes its type
        assertions; False as soon as any scenario raises an exception
        (including a failed assertion).
    """
    divider = "=" * 80
    print("\n" + divider)
    print("TEST 1: Paper Schema Validators")
    print(divider)

    # Scenario 1: authors arrives as a mapping rather than a list.
    print("\n1. Testing authors as dict (malformed data)...")
    try:
        authors_fields = {
            "arxiv_id": "test.001",
            "title": "Test Paper",
            "authors": {"author1": "John Doe", "author2": "Jane Smith"},
            "abstract": "Test abstract",
            "pdf_url": "https://arxiv.org/pdf/test.001.pdf",
            "published": datetime.now(),
            "categories": ["cs.AI"],
        }
        authors_case = Paper(**authors_fields)
        print(" β Paper created successfully")
        print(f" Authors type: {type(authors_case.authors)}")
        print(f" Authors value: {authors_case.authors}")
        assert isinstance(authors_case.authors, list), "Authors should be normalized to list"
        print(" β Authors correctly normalized to list")
    except Exception as exc:
        print(f" β Failed: {exc!s}")
        return False

    # Scenario 2: categories arrives as a mapping rather than a list.
    print("\n2. Testing categories as dict (malformed data)...")
    try:
        categories_fields = {
            "arxiv_id": "test.002",
            "title": "Test Paper 2",
            "authors": ["John Doe"],
            "abstract": "Test abstract",
            "pdf_url": "https://arxiv.org/pdf/test.002.pdf",
            "published": datetime.now(),
            "categories": {"cat1": "cs.AI", "cat2": "cs.LG"},
        }
        categories_case = Paper(**categories_fields)
        print(" β Paper created successfully")
        print(f" Categories type: {type(categories_case.categories)}")
        print(f" Categories value: {categories_case.categories}")
        assert isinstance(categories_case.categories, list), "Categories should be normalized to list"
        print(" β Categories correctly normalized to list")
    except Exception as exc:
        print(f" β Failed: {exc!s}")
        return False

    # Scenario 3: every text and list field is wrapped in a dict at once.
    print("\n3. Testing multiple fields malformed...")
    try:
        mixed_case = Paper(
            arxiv_id="test.003",
            title={"title": "Test Paper 3"},
            authors={"names": ["John Doe", "Jane Smith"]},
            abstract={"summary": "Test abstract"},
            pdf_url={"url": "https://arxiv.org/pdf/test.003.pdf"},
            published=datetime.now(),
            categories={"categories": ["cs.AI"]},
        )
        print(" β Paper created successfully")
        print(f" Title type: {type(mixed_case.title)}, value: {mixed_case.title}")
        print(f" Authors type: {type(mixed_case.authors)}, value: {mixed_case.authors}")
        # The abstract is sliced for display before its type is asserted below.
        print(f" Abstract type: {type(mixed_case.abstract)}, value: {mixed_case.abstract[:50]}...")
        print(f" PDF URL type: {type(mixed_case.pdf_url)}, value: {mixed_case.pdf_url}")
        print(f" Categories type: {type(mixed_case.categories)}, value: {mixed_case.categories}")

        # Checked in the same order as the fields were reported above.
        expectations = (
            ("title", str, "Title should be normalized to string"),
            ("authors", list, "Authors should be normalized to list"),
            ("abstract", str, "Abstract should be normalized to string"),
            ("pdf_url", str, "PDF URL should be normalized to string"),
            ("categories", list, "Categories should be normalized to list"),
        )
        for field_name, expected_type, failure_message in expectations:
            assert isinstance(getattr(mixed_case, field_name), expected_type), failure_message
        print(" β All fields correctly normalized")
    except Exception as exc:
        print(f" β Failed: {exc!s}")
        return False

    print("\n" + divider)
    print("β ALL PAPER SCHEMA VALIDATION TESTS PASSED")
    print(divider)
    return True
|
|
|
|
|
|
|
|
def test_pdf_processor_resilience():
    """Check chunk metadata produced from a Paper built with dict-shaped authors.

    A ``PDFProcessor(chunk_size=100, chunk_overlap=10)`` is created, a Paper
    is constructed with ``authors`` given as a dict, and
    ``processor.chunk_text`` is called on a repeated sample sentence. The
    first chunk's ``metadata['authors']`` must be a list. Progress is printed
    to stdout.

    Returns:
        bool: True when chunking succeeds and the assertion holds; False if
        anything inside the scenario raises (the traceback is printed first).
    """
    divider = "=" * 80
    print("\n" + divider)
    print("TEST 2: PDFProcessor Resilience")
    print(divider)

    # Built outside the try block, so a constructor failure propagates to the caller.
    processor = PDFProcessor(chunk_size=100, chunk_overlap=10)

    print("\n1. Testing PDF processor with validated Paper object...")
    try:
        paper_fields = {
            "arxiv_id": "test.004",
            "title": "Test Paper",
            "authors": {"author1": "John Doe"},
            "abstract": "Test abstract",
            "pdf_url": "https://arxiv.org/pdf/test.004.pdf",
            "published": datetime.now(),
            "categories": ["cs.AI"],
        }
        source_paper = Paper(**paper_fields)

        sample_text = "This is a test document. " * 100
        chunks = processor.chunk_text(sample_text, source_paper)
        print(f" β Created {len(chunks)} chunks successfully")

        # Looked up only after the count is reported, so an empty result
        # still prints the count before the lookup fails.
        first_chunk_authors = chunks[0].metadata['authors']
        print(f" First chunk metadata authors type: {type(first_chunk_authors)}")
        print(f" First chunk metadata authors: {first_chunk_authors}")

        assert isinstance(first_chunk_authors, list), "Chunk metadata authors should be list"
        print(" β Chunk metadata correctly contains list for authors")
    except Exception as exc:
        print(f" β Failed: {exc!s}")
        import traceback
        traceback.print_exc()
        return False

    print("\n" + divider)
    print("β PDF PROCESSOR RESILIENCE TESTS PASSED")
    print(divider)
    return True
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print an introduction, run both test functions,
    # summarize the results, and exit with 0 on success or 1 on any failure.
    divider = "=" * 80

    print("\n".join((
        "\n" + divider,
        "DATA VALIDATION FIX VERIFICATION TESTS",
        divider,
        "\nThese tests verify that the fixes for malformed MCP data work correctly:",
        "- Paper schema validators normalize dict fields to proper types",
        "- PDF processor handles validated Paper objects without errors",
        divider,
    )))

    # Both tests always run; the second is not skipped when the first fails.
    test1_pass = test_paper_schema_validators()
    test2_pass = test_pdf_processor_resilience()

    schema_verdict = 'β PASS' if test1_pass else 'β FAIL'
    processor_verdict = 'β PASS' if test2_pass else 'β FAIL'
    print("\n".join((
        "\n" + divider,
        "FINAL RESULTS",
        divider,
        f"Paper Schema Validators: {schema_verdict}",
        f"PDF Processor Resilience: {processor_verdict}",
        divider,
    )))

    if not (test1_pass and test2_pass):
        print("\nβ SOME TESTS FAILED - Please review the errors above")
        sys.exit(1)

    print("\nβ ALL TESTS PASSED - The data validation fixes are working correctly!")
    print("\nThe system should now handle malformed MCP responses gracefully.")
    sys.exit(0)
|
|
|