# api/utils.py
import re
from typing import List, Dict, Any
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def clean_text(text: str) -> str:
"""
Clean and preprocess extracted text from PDF.
"""
    # Collapse runs of blank lines first, then runs of spaces/tabs, so that
    # line breaks survive normalization (chunking and chapter detection rely on them)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = text.strip()
    # Remove common PDF artifacts
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)
    return text
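
# Illustrative usage of clean_text (a sketch; the sample string is made up):
#     clean_text("Intro-\n\n\nduction   to  NLP\x0c")
#     # -> 'Intro-\nduction to NLP'  (newline runs collapsed, spaces squeezed,
#     #    trailing form feed stripped)
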
def chunk_text(text: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[str]:
"""
Split text into overlapping chunks for processing.
Args:
text: Input text to chunk
max_chunk_size: Maximum size of each chunk
overlap: Number of characters to overlap between chunks
Returns:
List of text chunks
"""
if len(text) <= max_chunk_size:
return [text]
chunks = []
start = 0
while start < len(text):
end = start + max_chunk_size
# Try to break at sentence boundaries
if end < len(text):
# Look for sentence endings
sentence_endings = ['.', '!', '?']
for ending in sentence_endings:
last_ending = text.rfind(ending, start, end)
if last_ending > start + max_chunk_size * 0.8: # Only break if we're at least 80% through
end = last_ending + 1
break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Stop once the end of the text has been reached; otherwise the final
        # overlap window would be emitted again as a duplicate tail chunk
        if end >= len(text):
            break
        # Move the start position back by the overlap, always advancing past
        # the previous start so the loop cannot stall
        start = max(end - overlap, start + 1)
    return chunks
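
# Illustrative usage of chunk_text (a sketch; `long_text` is a hypothetical
# variable holding cleaned PDF text):
#     chunks = chunk_text(long_text, max_chunk_size=1000, overlap=100)
#     # consecutive chunks share roughly 100 characters, and a chunk is cut at
#     # a sentence boundary once it is at least 80% of max_chunk_size long
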
def extract_chapters(text: str) -> Dict[str, str]:
"""
Attempt to extract chapters from the text.
"""
chapters = {}
    # Common chapter heading patterns; matching below uses re.IGNORECASE,
    # so a single "Chapter ..." pattern also covers "CHAPTER ..."
    chapter_patterns = [
        r'Chapter\s+(\d+|[IVXLC]+)',
        r'(\d+)\.\s+[A-Z]',
        r'[IVXLC]+\.\s+[A-Z]'
    ]
lines = text.split('\n')
current_chapter = "Introduction"
current_content = []
for line in lines:
line = line.strip()
if not line:
continue
# Check if this line is a chapter header
is_chapter_header = False
for pattern in chapter_patterns:
if re.match(pattern, line, re.IGNORECASE):
# Save previous chapter
if current_content:
chapters[current_chapter] = '\n'.join(current_content)
current_chapter = line
current_content = []
is_chapter_header = True
break
if not is_chapter_header:
current_content.append(line)
# Save the last chapter
if current_content:
chapters[current_chapter] = '\n'.join(current_content)
return chapters
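
# Illustrative usage of extract_chapters (a sketch; the sample text is made up):
#     extract_chapters("Chapter 1\nIt was a dark night.\nChapter 2\nMorning came.")
#     # -> {'Chapter 1': 'It was a dark night.', 'Chapter 2': 'Morning came.'}
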
def get_text_statistics(text: str) -> Dict[str, Any]:
"""
Get basic statistics about the text.
"""
words = text.split()
# Lightweight sentence split to avoid NLTK downloads
sentences = [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+', text) if s.strip()]
return {
'total_characters': len(text),
'total_words': len(words),
'total_sentences': len(sentences),
'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
        'estimated_reading_time_minutes': len(words) / 200  # assumes an average reading speed of ~200 words per minute
}
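

# Minimal self-check when the module is run directly. This is an illustrative
# sketch only; the sample string below is made up and not part of the API.
if __name__ == "__main__":
    sample = (
        "Chapter 1\n"
        "It was the best of times. It was the worst of times.\n"
        "Chapter 2\n"
        "A new day began, and the work continued.\n"
    )
    cleaned = clean_text(sample)
    logger.info("Chunks: %s", chunk_text(cleaned, max_chunk_size=60, overlap=10))
    logger.info("Chapter titles: %s", list(extract_chapters(cleaned).keys()))
    logger.info("Stats: %s", get_text_statistics(cleaned))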