Spaces:

John-Jiang
/

starfish_data_ai

Paused

App Files Files Community

starfish_data_ai / tests /data_ingest /test_ingest.py

John-Jiang

init commit

5301c48 6 months ago

raw

history blame

6.26 kB

	import pytest
	import os
	from pathlib import Path
	from src.starfish.data_ingest.ingest import determine_parser, generate_input_data, process_file
	from src.starfish.data_ingest.parsers import (
	PDFParser,
	WordDocumentParser,
	PPTParser,
	TXTParser,
	ExcelParser,
	HTMLDocumentParser,
	YouTubeParser,
	WebParser,
	)
	from starfish.data_factory.factory import data_factory

	from starfish.data_ingest.formatter.template_format import QAGenerationPrompt
	from starfish.data_ingest.splitter.token_splitter import TokenTextSplitter
	from starfish.data_ingest.utils.util import async_read_file
	from starfish.llm.structured_llm import StructuredLLM
	import nest_asyncio

	from starfish.data_factory.factory import data_factory, resume_from_checkpoint
	from starfish.common.env_loader import load_env_file

	# Apply the nest_asyncio patch and run the project's env loader before the
	# environment lookup further down.
	nest_asyncio.apply()
	load_env_file()

	# Fixture directories, resolved relative to this test module.
	TEST_DATA_DIR = Path(__file__).parent.joinpath("test_data")
	INPUT_DIR = TEST_DATA_DIR.joinpath("input")
	OUTPUT_DIR = TEST_DATA_DIR.joinpath("output")

	# Sample input documents, keyed by file extension.
	TEST_FILES = dict(
	    pdf=INPUT_DIR.joinpath("ECE_598_PV_course_notes8_v2.pdf"),
	    docx=INPUT_DIR.joinpath("test.docx"),
	    pptx=INPUT_DIR.joinpath("test.pptx"),
	    txt=INPUT_DIR.joinpath("test.txt"),
	    xlsx=INPUT_DIR.joinpath("test.xlsx"),
	    html=INPUT_DIR.joinpath("test.html"),
	)

	# Sample remote sources, keyed by source kind.
	TEST_URLS = dict(
	    youtube="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
	    html="https://example.com",
	)

	# API key read from JINA_AI_API_KEY (empty string when unset) and the PDF
	# URL used by the WebParser test. The `gina_` spelling is kept because
	# tests below reference these names.
	gina_api_key = os.getenv("JINA_AI_API_KEY", "")
	gina_pdf_url = "https://arxiv.org/pdf/2303.08774.pdf"

	# @pytest.fixture(autouse=True)
	# def setup_teardown():
	# # Setup: Create output directory
	# OUTPUT_DIR.mkdir(exist_ok=True)
	# yield
	# # Teardown: Clean up output files
	# for file in OUTPUT_DIR.iterdir():
	# file.unlink()


	@pytest.mark.skip()
	def test_determine_parser_file_types():
	    """Check that determine_parser picks a parser class from a file path.

	    Only the PDF case is asserted. An unrecognised extension must raise
	    ValueError.
	    """
	    pdf_path = str(TEST_FILES["pdf"])
	    pdf_parser = determine_parser(pdf_path)
	    assert isinstance(pdf_parser, PDFParser)

	    # Currently disabled checks for the remaining fixture types:
	    #   docx -> WordDocumentParser, pptx -> PPTParser, txt -> TXTParser,
	    #   xlsx -> ExcelParser, html -> HTMLDocumentParser

	    unsupported_path = str(INPUT_DIR / "test.unsupported")
	    with pytest.raises(ValueError):
	        determine_parser(unsupported_path)


	@pytest.mark.skip()
	def test_process_file():
	    """Run process_file on the PDF fixture, writing into OUTPUT_DIR.

	    No assertion is made; the test only checks that the call completes.
	    """
	    pdf_path = str(TEST_FILES["pdf"])
	    process_file(pdf_path, OUTPUT_DIR)
	    # Alternative manual path, currently disabled:
	    #   parser = determine_parser(pdf_path)
	    #   parser.parse(pdf_path)


	@pytest.mark.skip(reason="not support UnstructuredParser to avoid too many package dependencies")
	def test_unstructured_parser():
	    """Parse the PDF fixture with UnstructuredParser and check the text.

	    NOTE(review): UnstructuredParser is not among the parsers imported at
	    the top of this file as visible here; the test is skipped, so the name
	    is never resolved. Confirm the import before un-skipping.
	    """
	    pdf_path = str(TEST_FILES["pdf"])
	    extracted = UnstructuredParser().parse(pdf_path)

	    assert extracted is not None
	    assert len(extracted) > 0  # something was extracted
	    assert isinstance(extracted, str)  # and it came back as a string


	@pytest.mark.asyncio
	@pytest.mark.skip()
	async def test_process_file_gina_ai():
	    """Parse the sample PDF URL with WebParser and save it as gina_ai.txt."""
	    target_path = OUTPUT_DIR / "gina_ai.txt"
	    web_parser = WebParser(gina_api_key)
	    fetched = await web_parser.parse_async(gina_pdf_url)
	    web_parser.save(content=fetched, output_path=target_path)


	@pytest.mark.asyncio
	@pytest.mark.skip()
	async def test_ingest_input_data():
	    """Generate Q/A pairs from the saved gina_ai.txt text via data_factory.

	    The saved text is turned into prompt messages with generate_input_data,
	    each message is sent through a StructuredLLM, and the test expects four
	    results back.
	    """
	    qa_schema = [{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}]

	    @data_factory(max_concurrency=10)
	    async def test_ingest_pdf(prompt_msg: str):
	        # The prompt template is just the message itself.
	        llm = StructuredLLM(
	            model_name="openai/gpt-4o-mini",
	            prompt="{{prompt_msg}}",
	            output_schema=qa_schema,
	            model_kwargs={"temperature": 0.7},
	        )
	        response = await llm.run(prompt_msg=prompt_msg)
	        return response.data

	    saved_text = await async_read_file(file_path=OUTPUT_DIR / "gina_ai.txt")
	    prompts = generate_input_data(saved_text, TokenTextSplitter(), QAGenerationPrompt())

	    qa_results = await test_ingest_pdf.run(prompt_msg=prompts)
	    assert len(qa_results) == 4


	@pytest.mark.asyncio
	async def test_tiktoken_spiter():
	    """Split the saved gina_ai.txt text and check the number of chunks.

	    The input file is the one written by test_process_file_gina_ai, which
	    is skipped in this module. When the file is absent this test is skipped
	    with an explicit message instead of erroring inside the file read.
	    """
	    source_path = OUTPUT_DIR / "gina_ai.txt"
	    if not source_path.exists():
	        pytest.skip(f"fixture file not found: {source_path}")

	    content = await async_read_file(file_path=source_path)
	    chunks = TokenTextSplitter().split_text(content)

	    # Expected chunk count for the saved text with the splitter's defaults.
	    assert len(chunks) == 195


	def test_determine_parser_urls():
	    """Check that determine_parser picks a parser class from a URL."""
	    # (TEST_URLS key, expected parser class), checked in this order.
	    expectations = (
	        ("youtube", YouTubeParser),
	        ("html", HTMLDocumentParser),
	    )
	    for url_key, parser_cls in expectations:
	        chosen = determine_parser(TEST_URLS[url_key])
	        assert isinstance(chosen, parser_cls)


	@pytest.mark.skip()
	def test_process_file_output():
	    """Check that process_file writes its output under the expected name.

	    NOTE(review): the PDF fixture is ECE_598_PV_course_notes8_v2.pdf, yet
	    the expected suffix below is "test.txt". Confirm process_file's naming
	    before un-skipping.
	    """
	    out_dir = str(OUTPUT_DIR)

	    # Default output name, PDF input.
	    pdf_result = process_file(str(TEST_FILES["pdf"]), out_dir)
	    assert os.path.exists(pdf_result)
	    assert pdf_result.endswith("test.txt")

	    # Caller-supplied output name, DOCX input.
	    custom_name = "custom_output.txt"
	    docx_result = process_file(str(TEST_FILES["docx"]), out_dir, custom_name)
	    assert os.path.exists(docx_result)
	    assert docx_result.endswith(custom_name)


	@pytest.mark.skip()
	def test_process_file_urls():
	    """Check process_file output names for URL inputs."""
	    out_dir = str(OUTPUT_DIR)

	    # YouTube URL: the file name is expected to carry the video id.
	    youtube_result = process_file(TEST_URLS["youtube"], out_dir)
	    assert os.path.exists(youtube_result)
	    assert "youtube_dQw4w9WgXcQ.txt" in youtube_result

	    # Plain HTML URL: the file name is expected to carry the host name.
	    html_result = process_file(TEST_URLS["html"], out_dir)
	    assert os.path.exists(html_result)
	    assert "example_com.txt" in html_result


	def test_process_file_nonexistent_file():
	    """Check that process_file raises FileNotFoundError for a missing input."""
	    missing_input = str(INPUT_DIR / "nonexistent.file")
	    out_dir = str(OUTPUT_DIR)
	    with pytest.raises(FileNotFoundError):
	        process_file(missing_input, out_dir)


	@pytest.mark.skip()
	def test_process_file_output_dir_creation():
	    """Check that process_file creates a missing output directory."""
	    fresh_dir = OUTPUT_DIR / "new_dir"
	    txt_input = str(TEST_FILES["txt"])

	    written_path = process_file(txt_input, str(fresh_dir))

	    assert os.path.exists(fresh_dir)
	    assert os.path.exists(written_path)