import logging
import re
from typing import Any, Dict, List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_text(text: str) -> str:
    """
    Clean and preprocess extracted text from a PDF.
    """
    # Collapse runs of spaces and tabs only, keeping newlines so that the
    # newline collapse below and chapter detection (which splits on '\n')
    # still have line breaks to work with.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()

    # Drop characters outside a conservative whitelist of word characters,
    # whitespace, and common punctuation.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}]', '', text)

    return text
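

# Example (hypothetical input, not part of the original module): clean_text
# collapses repeated spaces and blank lines and keeps ordinary punctuation, e.g.
#
#     clean_text("Intro   to\tNLP\n\n\nChapter 1.")
#     # -> 'Intro to NLP\nChapter 1.'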


def chunk_text(text: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks for processing.

    Args:
        text: Input text to chunk
        max_chunk_size: Maximum size of each chunk, in characters
        overlap: Number of characters to overlap between consecutive chunks

    Returns:
        List of text chunks
    """
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + max_chunk_size

        if end < len(text):
            # Prefer to end the chunk at a sentence boundary, but only if one
            # falls in the last 20% of the window so chunks stay reasonably full.
            sentence_endings = ['.', '!', '?']
            for ending in sentence_endings:
                last_ending = text.rfind(ending, start, end)
                if last_ending > start + max_chunk_size * 0.8:
                    end = last_ending + 1
                    break

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step forward, keeping `overlap` characters of context from the
        # previous chunk; stop once the end of the text has been emitted.
        start = end - overlap
        if start >= len(text):
            break

    return chunks
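

# Example (hypothetical values, not from the original module): with
# max_chunk_size=1000 and overlap=100, a 2,500-character text with no sentence
# endings yields chunks covering roughly [0, 1000), [900, 1900), and
# [1800, 2500), so each chunk repeats about 100 characters of the previous one.
#
#     pieces = chunk_text("word " * 600, max_chunk_size=1000, overlap=100)
#     assert all(len(p) <= 1000 for p in pieces)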


def extract_chapters(text: str) -> Dict[str, str]:
    """
    Attempt to extract chapters from the text.
    """
    chapters = {}

    # Heading patterns matched case-insensitively against the start of each
    # line: "Chapter N" (Arabic or Roman numerals), numbered headings such as
    # "3. Title", and Roman-numeral headings such as "IV. Title".
    chapter_patterns = [
        r'Chapter\s+(\d+|[IVXLC]+)',
        r'(\d+)\.\s+[A-Z]',
        r'[IVXLC]+\.\s+[A-Z]',
    ]

    lines = text.split('\n')
    current_chapter = "Introduction"
    current_content = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        is_chapter_header = False
        for pattern in chapter_patterns:
            if re.match(pattern, line, re.IGNORECASE):
                # Close out the previous chapter before starting a new one.
                if current_content:
                    chapters[current_chapter] = '\n'.join(current_content)

                current_chapter = line
                current_content = []
                is_chapter_header = True
                break

        if not is_chapter_header:
            current_content.append(line)

    # Store whatever content follows the last detected heading.
    if current_content:
        chapters[current_chapter] = '\n'.join(current_content)

    return chapters
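

# Example (hypothetical input, not from the original module): text that still
# contains line breaks maps heading lines to the content that follows them,
# with anything before the first heading stored under "Introduction".
#
#     extract_chapters("Preface text\nChapter 1 The Start\nFirst page.")
#     # -> {'Introduction': 'Preface text', 'Chapter 1 The Start': 'First page.'}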


def get_text_statistics(text: str) -> Dict[str, Any]:
    """
    Get basic statistics about the text.
    """
    words = text.split()

    # Split into sentences on whitespace that follows ., !, or ?.
    sentences = [s.strip() for s in re.split(r'(?<=[\.\!\?])\s+', text) if s.strip()]

    return {
        'total_characters': len(text),
        'total_words': len(words),
        'total_sentences': len(sentences),
        'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
        # Reading-time estimate assumes roughly 200 words per minute.
        'estimated_reading_time_minutes': len(words) / 200
    }
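

# Minimal end-to-end sketch (added for illustration; the sample string and the
# __main__ guard are not part of the original module): clean the raw text,
# split it into chapters, chunk it, and report basic statistics.
if __name__ == "__main__":
    raw = (
        "Chapter 1 Getting Started\n"
        "This is the first chapter. It explains the basics.\n"
        "Chapter 2 Going Further\n"
        "This is the second chapter. It builds on the first."
    )

    cleaned = clean_text(raw)
    chapters = extract_chapters(cleaned)
    chunks = chunk_text(cleaned, max_chunk_size=80, overlap=20)
    stats = get_text_statistics(cleaned)

    logger.info("Found %d chapters and %d chunks", len(chapters), len(chunks))
    logger.info("Statistics: %s", stats)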