# File size: 5,561 Bytes
# 6880cd9
# (line-number gutter 1-172 from the file viewer omitted)
import PyPDF2
import pdfplumber
import io
from typing import Dict, Any, Optional
import logging
from .utils import clean_text, get_text_statistics
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class PDFProcessor:
    """
    Handles PDF text extraction and processing.

    Text is extracted with pdfplumber first and PyPDF2 as a fallback; page
    counts, metadata and validation are read through PyPDF2.
    """

    # Largest input (in bytes) that validate_pdf accepts; override on a
    # subclass or instance to change the limit.
    MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024

    # If pdfplumber yields fewer non-whitespace-trimmed characters than this,
    # PyPDF2 is tried as well.
    MIN_TEXT_LENGTH = 100

    def __init__(self):
        self.supported_formats = ['.pdf']

    def extract_text_from_pdf(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Extract text from PDF file bytes.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with the keys 'success', 'text', 'statistics',
            'pages' and 'message'. On any failure 'success' is False,
            'text' is empty, 'statistics' is {} and 'pages' is 0; the
            exception is logged rather than raised.
        """
        try:
            # Try pdfplumber first (better for complex layouts)
            text = self._extract_with_pdfplumber(pdf_file)

            if not text or len(text.strip()) < self.MIN_TEXT_LENGTH:
                # Fallback to PyPDF2, but never throw away a short-but-valid
                # pdfplumber result when PyPDF2 extracts less (or nothing).
                text = self._select_text(
                    text, self._extract_with_pypdf2(pdf_file)
                )

            # Whitespace-only output means nothing usable was extracted.
            if not text or not text.strip():
                raise ValueError("Could not extract text from PDF")

            # Clean the extracted text
            cleaned_text = clean_text(text)

            # Get text statistics
            stats = get_text_statistics(cleaned_text)

            return {
                'success': True,
                'text': cleaned_text,
                'statistics': stats,
                'pages': self._get_page_count(pdf_file),
                'message': 'Text extracted successfully'
            }
        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            return {
                'success': False,
                'text': '',
                'statistics': {},
                'pages': 0,
                'message': f'Error extracting text: {str(e)}'
            }

    @staticmethod
    def _select_text(primary: Optional[str], fallback: Optional[str]) -> str:
        """
        Pick the richer of two extraction results.

        Args:
            primary: Text from the preferred extractor (may be empty/None).
            fallback: Text from the fallback extractor (may be empty/None).

        Returns:
            The fallback text when it has at least as many characters
            (ignoring surrounding whitespace) as the primary text,
            otherwise the primary text. Never returns None.
        """
        primary = primary or ""
        fallback = fallback or ""
        if len(fallback.strip()) >= len(primary.strip()):
            return fallback
        return primary

    @staticmethod
    def _join_page_texts(pages) -> str:
        """
        Join the text of every page that yields any, one page per line.

        Args:
            pages: Iterable of page objects exposing extract_text().

        Returns:
            Page texts joined with '\\n'; pages whose extract_text() returns
            an empty or None value are skipped.
        """
        page_texts = (page.extract_text() for page in pages)
        return '\n'.join(text for text in page_texts if text)

    def _extract_with_pdfplumber(self, pdf_file: bytes) -> str:
        """
        Extract text using pdfplumber (better for complex layouts).

        Returns an empty string (and logs a warning) if extraction fails.
        """
        try:
            with pdfplumber.open(io.BytesIO(pdf_file)) as pdf:
                # Must be consumed while the document is still open.
                return self._join_page_texts(pdf.pages)
        except Exception as e:
            logger.warning("pdfplumber extraction failed: %s", e)
            return ""

    def _extract_with_pypdf2(self, pdf_file: bytes) -> str:
        """
        Extract text using PyPDF2 (fallback method).

        Returns an empty string (and logs a warning) if extraction fails.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            return self._join_page_texts(pdf_reader.pages)
        except Exception as e:
            logger.warning("PyPDF2 extraction failed: %s", e)
            return ""

    def _get_page_count(self, pdf_file: bytes) -> int:
        """
        Get the number of pages in the PDF.

        Best effort: returns 0 (and logs a warning) if the PDF cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            return len(pdf_reader.pages)
        except Exception as e:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit and left no trace of the failure.
            logger.warning("Could not determine PDF page count: %s", e)
            return 0

    @staticmethod
    def _build_metadata(metadata: Optional[Dict[str, Any]],
                        page_count: int) -> Dict[str, Any]:
        """
        Build the metadata result dictionary.

        Args:
            metadata: Mapping keyed by '/Title', '/Author', '/Subject',
                '/Creator' and '/Producer', or None when the PDF carries
                no document information.
            page_count: Value to report under 'pages'.

        Returns:
            Dictionary with 'title', 'author', 'subject', 'creator',
            'producer' and 'pages'; missing title/author default to
            'Unknown', the other text fields to ''.
        """
        info = metadata or {}
        return {
            'title': info.get('/Title', 'Unknown'),
            'author': info.get('/Author', 'Unknown'),
            'subject': info.get('/Subject', ''),
            'creator': info.get('/Creator', ''),
            'producer': info.get('/Producer', ''),
            'pages': page_count
        }

    def get_pdf_metadata(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Extract metadata from PDF file.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with 'title', 'author', 'subject', 'creator',
            'producer' and 'pages'. If the PDF cannot be read, the error is
            logged and default values with 'pages' set to 0 are returned.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            # metadata may be None for a PDF without document information;
            # _build_metadata treats that as "no fields set" so the page
            # count is still reported.
            return self._build_metadata(
                pdf_reader.metadata, len(pdf_reader.pages)
            )
        except Exception as e:
            logger.error("Error extracting PDF metadata: %s", e)
            return self._build_metadata(None, 0)

    def validate_pdf(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Validate PDF file and check if it can be processed.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with 'valid' and 'message'; when valid it also
            contains 'pages' and 'size_mb'. Never raises: any error while
            reading the PDF is reported as an invalid file.
        """
        bytes_per_mb = 1024 * 1024
        try:
            # Reject empty input up front instead of relying on the parser.
            if not pdf_file:
                return {
                    'valid': False,
                    'message': 'PDF appears to be empty or corrupted'
                }

            # Check file size
            file_size = len(pdf_file)
            max_size = self.MAX_FILE_SIZE_BYTES
            if file_size > max_size:
                return {
                    'valid': False,
                    'message': (
                        f'File too large. Maximum size is '
                        f'{max_size / bytes_per_mb:g}MB, '
                        f'got {file_size / bytes_per_mb:.1f}MB'
                    )
                }

            # Try to read PDF
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            page_count = len(pdf_reader.pages)
            if page_count == 0:
                return {
                    'valid': False,
                    'message': 'PDF appears to be empty or corrupted'
                }

            return {
                'valid': True,
                'message': 'PDF is valid',
                'pages': page_count,
                'size_mb': file_size / bytes_per_mb
            }
        except Exception as e:
            return {
                'valid': False,
                'message': f'Invalid PDF file: {str(e)}'
            }