File size: 5,561 Bytes
6880cd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import PyPDF2
import pdfplumber
import io
from typing import Dict, Any, Optional
import logging
from .utils import clean_text, get_text_statistics

logger = logging.getLogger(__name__)

class PDFProcessor:
    """
    Handles PDF text extraction and processing.

    Text is extracted with pdfplumber first and PyPDF2 as a fallback; PyPDF2
    is also used for page counts, document metadata and validation. The
    public methods take the PDF as raw bytes and report failures through
    the returned dictionary rather than raising.

    Attributes:
        supported_formats: File extensions this processor accepts.
        max_size_bytes: Largest input, in bytes, accepted by validate_pdf.
    """

    # Default size limit enforced by validate_pdf (50MB).
    DEFAULT_MAX_SIZE_BYTES = 50 * 1024 * 1024

    # pdfplumber output shorter than this (after stripping whitespace)
    # triggers the PyPDF2 fallback.
    MIN_PRIMARY_TEXT_LENGTH = 100

    def __init__(self, max_size_bytes: int = DEFAULT_MAX_SIZE_BYTES):
        """
        Create a processor.

        Args:
            max_size_bytes: Maximum accepted file size in bytes for
                validate_pdf. Defaults to 50MB.
        """
        self.supported_formats = ['.pdf']
        self.max_size_bytes = max_size_bytes

    def extract_text_from_pdf(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Extract text from PDF file bytes.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with the keys 'success', 'text', 'statistics',
            'pages' and 'message'. On failure 'success' is False, 'text' is
            empty, 'statistics' is {} and 'pages' is 0; the error is
            described in 'message'.
        """
        try:
            text = self._extract_raw_text(pdf_file)

            # Whitespace-only output means nothing usable was recovered, so
            # treat it the same as an empty result.
            if not text.strip():
                raise ValueError("Could not extract text from PDF")

            # Clean the extracted text
            cleaned_text = clean_text(text)

            # Get text statistics
            stats = get_text_statistics(cleaned_text)

            return {
                'success': True,
                'text': cleaned_text,
                'statistics': stats,
                'pages': self._get_page_count(pdf_file),
                'message': 'Text extracted successfully'
            }

        except Exception as e:
            logger.error("Error extracting text from PDF: %s", e)
            return {
                'success': False,
                'text': '',
                'statistics': {},
                'pages': 0,
                'message': f'Error extracting text: {str(e)}'
            }

    def _extract_raw_text(self, pdf_file: bytes) -> str:
        """
        Run the extractors and return the best uncleaned text.

        pdfplumber is tried first. If it yields fewer than
        MIN_PRIMARY_TEXT_LENGTH non-padding characters, PyPDF2 is tried as
        well and the result with more text is kept, so a short but valid
        pdfplumber result is not lost when PyPDF2 finds nothing.
        """
        primary = self._extract_with_pdfplumber(pdf_file)
        if len(primary.strip()) >= self.MIN_PRIMARY_TEXT_LENGTH:
            return primary

        fallback = self._extract_with_pypdf2(pdf_file)
        return self._choose_text(primary, fallback)

    @staticmethod
    def _choose_text(primary: str, fallback: str) -> str:
        """
        Return whichever text has more content once stripped.

        Ties go to ``primary``.
        """
        if len(fallback.strip()) > len(primary.strip()):
            return fallback
        return primary

    def _extract_with_pdfplumber(self, pdf_file: bytes) -> str:
        """
        Extract text using pdfplumber (better for complex layouts).

        Returns:
            Page texts joined with newlines, or "" if extraction fails.
        """
        try:
            with pdfplumber.open(io.BytesIO(pdf_file)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
        except Exception as e:
            logger.warning("pdfplumber extraction failed: %s", e)
            return ""

        # Pages without extractable text yield None or "", so skip them.
        return '\n'.join(text for text in page_texts if text)

    def _extract_with_pypdf2(self, pdf_file: bytes) -> str:
        """
        Extract text using PyPDF2 (fallback method).

        Returns:
            Page texts joined with newlines, or "" if extraction fails.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.warning("PyPDF2 extraction failed: %s", e)
            return ""

        # Pages without extractable text yield None or "", so skip them.
        return '\n'.join(text for text in page_texts if text)

    def _get_page_count(self, pdf_file: bytes) -> int:
        """
        Get the number of pages in the PDF.

        Returns:
            The page count, or 0 if the PDF cannot be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            return len(pdf_reader.pages)
        except Exception as e:
            # Best-effort: the count is informational, so report 0 rather
            # than fail the caller, but leave a trace of the cause.
            logger.warning("Could not determine PDF page count: %s", e)
            return 0

    def get_pdf_metadata(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Extract metadata from PDF file.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with the keys 'title', 'author', 'subject',
            'creator', 'producer' and 'pages'. Missing entries fall back to
            'Unknown' (title, author) or '' (the rest). If the PDF cannot be
            read at all, every field takes its fallback and 'pages' is 0.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            # A missing metadata object must not discard the page count, so
            # substitute an empty mapping and let the defaults apply.
            metadata = pdf_reader.metadata or {}

            return {
                'title': metadata.get('/Title', 'Unknown'),
                'author': metadata.get('/Author', 'Unknown'),
                'subject': metadata.get('/Subject', ''),
                'creator': metadata.get('/Creator', ''),
                'producer': metadata.get('/Producer', ''),
                'pages': len(pdf_reader.pages)
            }
        except Exception as e:
            logger.error("Error extracting PDF metadata: %s", e)
            return {
                'title': 'Unknown',
                'author': 'Unknown',
                'subject': '',
                'creator': '',
                'producer': '',
                'pages': 0
            }

    def validate_pdf(self, pdf_file: bytes) -> Dict[str, Any]:
        """
        Validate PDF file and check if it can be processed.

        Args:
            pdf_file: PDF file as bytes

        Returns:
            Dictionary with 'valid' and 'message'. When the file is valid
            it also contains 'pages' and 'size_mb'.
        """
        try:
            # Nothing to parse: answer directly instead of relying on the
            # reader to raise for empty input.
            if not pdf_file:
                return {
                    'valid': False,
                    'message': 'PDF appears to be empty or corrupted'
                }

            # Check file size before parsing
            file_size = len(pdf_file)

            if file_size > self.max_size_bytes:
                limit_mb = self.max_size_bytes / (1024 * 1024)
                return {
                    'valid': False,
                    'message': (
                        f'File too large. Maximum size is {limit_mb:g}MB, '
                        f'got {file_size / (1024 * 1024):.1f}MB'
                    )
                }

            # Try to read PDF
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            page_count = len(pdf_reader.pages)

            if page_count == 0:
                return {
                    'valid': False,
                    'message': 'PDF appears to be empty or corrupted'
                }

            return {
                'valid': True,
                'message': 'PDF is valid',
                'pages': page_count,
                'size_mb': file_size / (1024 * 1024)
            }

        except Exception as e:
            return {
                'valid': False,
                'message': f'Invalid PDF file: {str(e)}'
            }