import logging
import re
from typing import Any, Dict, List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_text(text: str) -> str:
    """
    Clean and preprocess extracted text from a PDF.
    """
    # Collapse runs of spaces and tabs only, keeping newlines so that the
    # newline collapse below and chapter detection (which splits on '\n')
    # still have line breaks to work with.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()

    # Drop characters outside a conservative whitelist of word characters,
    # whitespace, and common punctuation.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)

    return text
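

# Example (hypothetical input, not part of the original module): clean_text
# collapses repeated spaces and blank lines and keeps ordinary punctuation, e.g.
#
#     clean_text("Intro   to\tNLP\n\n\nChapter 1.")
#     # -> 'Intro to NLP\nChapter 1.'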


def chunk_text(text: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks for processing.

    Args:
        text: Input text to chunk
        max_chunk_size: Maximum size of each chunk, in characters
        overlap: Number of characters to overlap between consecutive chunks

    Returns:
        List of text chunks
    """
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + max_chunk_size

        if end < len(text):
            # Prefer to end the chunk at a sentence boundary, but only if one
            # falls in the last 20% of the window so chunks stay reasonably full.
            sentence_endings = ['.', '!', '?']
            for ending in sentence_endings:
                last_ending = text.rfind(ending, start, end)
                if last_ending > start + max_chunk_size * 0.8:
                    end = last_ending + 1
                    break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step forward, keeping `overlap` characters of context from the
        # previous chunk; stop once the end of the text has been emitted.
        start = end - overlap
        if start >= len(text):
            break

    return chunks
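

# Example (hypothetical values, not from the original module): with
# max_chunk_size=1000 and overlap=100, a 2,500-character text with no sentence
# endings yields chunks covering roughly [0, 1000), [900, 1900), and
# [1800, 2500), so each chunk repeats about 100 characters of the previous one.
#
#     pieces = chunk_text("word " * 600, max_chunk_size=1000, overlap=100)
#     assert all(len(p) <= 1000 for p in pieces)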


def extract_chapters(text: str) -> Dict[str, str]:
    """
    Attempt to extract chapters from the text.
    """
    chapters = {}

    # Heading patterns matched case-insensitively against the start of each
    # line: "Chapter N" (Arabic or Roman numerals), numbered headings such as
    # "3. Title", and Roman-numeral headings such as "IV. Title".
    chapter_patterns = [
        r'Chapter\s+(\d+|[IVXLC]+)',
        r'(\d+)\.\s+[A-Z]',
        r'[IVXLC]+\.\s+[A-Z]',
    ]

    lines = text.split('\n')
    current_chapter = "Introduction"
    current_content = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        is_chapter_header = False
        for pattern in chapter_patterns:
            if re.match(pattern, line, re.IGNORECASE):
                # Close out the previous chapter before starting a new one.
                if current_content:
                    chapters[current_chapter] = '\n'.join(current_content)

                current_chapter = line
                current_content = []
                is_chapter_header = True
                break

        if not is_chapter_header:
            current_content.append(line)

    # Store whatever content follows the last detected heading.
    if current_content:
        chapters[current_chapter] = '\n'.join(current_content)

    return chapters
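

# Example (hypothetical input, not from the original module): text that still
# contains line breaks maps heading lines to the content that follows them,
# with anything before the first heading stored under "Introduction".
#
#     extract_chapters("Preface text\nChapter 1 The Start\nFirst page.")
#     # -> {'Introduction': 'Preface text', 'Chapter 1 The Start': 'First page.'}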


def get_text_statistics(text: str) -> Dict[str, Any]:
    """
    Get basic statistics about the text.
    """
    words = text.split()

    # Split into sentences on whitespace that follows ., !, or ?.
    sentences = [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+', text) if s.strip()]

    return {
        'total_characters': len(text),
        'total_words': len(words),
        'total_sentences': len(sentences),
        'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
        # Reading-time estimate assumes roughly 200 words per minute.
        'estimated_reading_time_minutes': len(words) / 200
    }
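

# Minimal end-to-end sketch (added for illustration; the sample string and the
# __main__ guard are not part of the original module): clean the raw text,
# split it into chapters, chunk it, and report basic statistics.
if __name__ == "__main__":
    raw = (
        "Chapter 1 Getting Started\n"
        "This is the first chapter. It explains the basics.\n"
        "Chapter 2 Going Further\n"
        "This is the second chapter. It builds on the first."
    )

    cleaned = clean_text(raw)
    chapters = extract_chapters(cleaned)
    chunks = chunk_text(cleaned, max_chunk_size=80, overlap=20)
    stats = get_text_statistics(cleaned)

    logger.info("Found %d chapters and %d chunks", len(chapters), len(chunks))
    logger.info("Statistics: %s", stats)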