File size: 9,153 Bytes
df68ff3
 
 
 
 
 
 
 
 
 
 
 
 
585eb54
df68ff3
 
 
 
 
 
 
 
 
 
f59f198
 
 
 
 
 
df68ff3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc67a5
df68ff3
 
 
 
 
 
efe5aeb
cdc67a5
efe5aeb
 
cdc67a5
 
efe5aeb
 
cdc67a5
 
efe5aeb
df68ff3
cdc67a5
f3dcd28
efe5aeb
 
 
 
 
df68ff3
 
 
f3dcd28
df68ff3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3dcd28
df68ff3
 
f3dcd28
 
 
df68ff3
 
 
 
efe5aeb
 
 
df68ff3
 
 
 
 
 
 
585eb54
 
 
 
 
df68ff3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc67a5
 
 
 
 
df68ff3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585eb54
 
 
 
 
 
df68ff3
f3dcd28
585eb54
df68ff3
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
"""
DeepSeek-OCR Gradio Interface for Hugging Face Spaces
------------------------------------------------------
Simplified Gradio app optimized for ZeroGPU deployment
"""

import gradio as gr
import torch
from PIL import Image
import tempfile
import os
from pathlib import Path
import spaces
import fitz  # PyMuPDF

# Module-level cache slots for the OCR model and its processor. Both start
# empty and are filled in by load_model() the first time it is called.
model, processor = None, None

def load_model():
    """Return the shared (model, processor) pair, loading it on first use.

    The pair is cached in the module-level ``model`` / ``processor`` globals;
    once ``model`` is set, later calls return the cached objects without
    touching the checkpoint again.

    Returns:
        tuple: ``(model, processor)``.
    """
    global model, processor

    # Fast path: an earlier call already populated the cache.
    if model is not None:
        return model, processor

    from transformers import AutoModelForCausalLM, AutoTokenizer
    try:
        # Package layout used on the Hugging Face Space
        from backend.process.image_process import DeepseekOCRProcessor
    except ImportError:
        # Package layout used for local deployment
        from process.image_process import DeepseekOCRProcessor

    checkpoint = "deepseek-ai/DeepSeek-OCR"
    model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
    }

    print("Loading DeepSeek-OCR model...")
    processor = DeepseekOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)
    print("Model loaded successfully!")

    return model, processor

@spaces.GPU(duration=120)
def perform_ocr(image, prompt_text):
    """
    Perform OCR on the uploaded image

    Args:
        image: PIL Image or file path. Both forms are normalised to RGB mode
            before being passed to the processor, so uploads in other modes
            (RGBA, palette, grayscale, ...) are handled the same way as
            images loaded from a path.
        prompt_text: Custom prompt for OCR task. ``None``, empty, or
            whitespace-only text falls back to "Free OCR.".

    Returns:
        str: Extracted text or analysis result. If any step raises, the
        exception is not propagated; a string starting with
        "Error during OCR processing:" is returned instead.
    """
    try:
        # Load model (cached after the first call)
        model, processor = load_model()

        # Handle image input: always end up with an RGB PIL image
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            if image.mode != "RGB":
                image = image.convert("RGB")
        else:
            raise ValueError("Invalid image input")

        # Prepare prompt; the "<image>" tag precedes the instruction text
        if not prompt_text or not prompt_text.strip():
            prompt = "<image>\nFree OCR."
        else:
            prompt = f"<image>\n{prompt_text}"

        # Process image
        inputs = processor.tokenize_with_images(
            images=[image],
            prompt=prompt,
            bos=True,
            eos=True,
            cropping=True
        )

        # Move tensor entries to the model's device; leave other values as-is
        inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

        # Generate (greedy decoding, no gradient tracking)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
                temperature=1.0,
                top_p=1.0,
                use_cache=True,
            )

        # Decode only the tokens after the prompt: the slice skips the first
        # input_ids-length positions of the generated sequence
        prompt_length = inputs['input_ids'].shape[1]
        result = processor.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        return result

    except Exception as e:
        # UI boundary: report the failure as text instead of raising
        return f"Error during OCR processing: {str(e)}"

@spaces.GPU(duration=180)
def process_pdf(pdf_file, prompt_text, page_limit=3):
    """
    Process PDF file (run OCR on the first few pages)

    Each processed page is rendered at 2x resolution, decoded in memory into
    an RGB PIL image, and passed to ``perform_ocr``. The PDF handle is always
    closed, including when a page fails part-way through.

    Args:
        pdf_file: Uploaded PDF file path (string)
        prompt_text: Custom prompt for OCR task
        page_limit: Maximum number of leading pages to process. Defaults to 3
            (kept small to avoid timeouts).

    Returns:
        str: Per-page results, each under a "--- Page N ---" header, followed
        by a note when the PDF has more pages than were processed. Validation
        problems and unexpected errors are returned as "❌ ..." messages
        rather than raised.
    """
    import io

    try:
        # Validate file upload
        if pdf_file is None or pdf_file == "":
            return "❌ Please upload a PDF file first."

        # pdf_file is a filepath string
        pdf_path = pdf_file

        # Check if file exists
        if not os.path.exists(pdf_path):
            return f"❌ File not found: {pdf_path}"

        results = []

        # Open PDF; the finally clause guarantees the handle is released
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)

            if total_pages == 0:
                return "❌ PDF file is empty (0 pages)."

            # Cap the number of pages (never negative, never past the end)
            max_pages = max(0, min(page_limit, total_pages))

            for page_num in range(max_pages):
                page = pdf_document[page_num]

                # Convert page to PNG bytes
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
                img_data = pix.tobytes("png")

                # Decode in memory instead of via a temporary file, so no
                # file is left on disk. convert() returns an independent
                # image, so the decoder can be closed immediately.
                with Image.open(io.BytesIO(img_data)) as decoded:
                    image = decoded.convert("RGB")

                # Perform OCR
                result = perform_ocr(image, prompt_text)
                results.append(f"--- Page {page_num + 1} ---\n{result}\n")
        finally:
            pdf_document.close()

        # Add note if PDF has more pages
        if max_pages < total_pages:
            results.append(f"\n(Only first {max_pages} pages processed. Full PDF has {total_pages} pages)")

        return "\n".join(results)

    except Exception as e:
        import traceback
        # Keep the full traceback in the server log; the user only sees the
        # short message below.
        traceback.print_exc()
        return f"❌ Error processing PDF: {str(e)}\n\nPlease make sure you uploaded a valid PDF file."

# Create Gradio Interface
# Three tabs (image OCR, PDF OCR, prompt cheat-sheet) plus header and footer
# Markdown. The button callbacks are perform_ocr and process_pdf.
with gr.Blocks(title="DeepSeek-OCR Studio", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 DeepSeek-OCR Studio

    Advanced OCR system supporting:
    - 📝 Multi-language text recognition (Chinese, English, etc.)
    - 📊 Table & chart extraction
    - 🎨 Professional drawing analysis (CAD, flowcharts)
    - 📄 PDF document processing & OCR
    - 📐 Layout analysis & Markdown conversion

    **Note**: Running on ZeroGPU - first request may take longer to load the model.
    """)

    with gr.Tab("Image OCR"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                image_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                image_btn = gr.Button("Extract Text", variant="primary")

            with gr.Column():
                image_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        image_btn.click(
            fn=perform_ocr,
            inputs=[image_input, image_prompt],
            outputs=image_output
        )

        # The example images are optional ("if available"): only list the
        # ones whose files are actually present, and skip the examples widget
        # entirely when none are.
        example_rows = [
            ["examples/sample1.png", "Free OCR."],
            ["examples/sample2.jpg", "Extract all text and tables."],
        ]
        available_examples = [row for row in example_rows if os.path.exists(row[0])]
        if available_examples:
            gr.Examples(
                examples=available_examples,
                inputs=[image_input, image_prompt],
                label="Example Images (if available)"
            )

    with gr.Tab("PDF OCR"):
        with gr.Row():
            with gr.Column():
                # type="filepath" hands process_pdf a path string
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="filepath"
                )
                pdf_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                pdf_btn = gr.Button("Process PDF (First 3 Pages)", variant="primary")

            with gr.Column():
                pdf_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        pdf_btn.click(
            fn=process_pdf,
            inputs=[pdf_input, pdf_prompt],
            outputs=pdf_output
        )

    with gr.Tab("Advanced Prompts"):
        gr.Markdown("""
        ### Prompt Examples

        **Basic OCR:**
        ```
        Free OCR.
        ```

        **Table Extraction:**
        ```
        Extract all tables and convert to markdown format.
        ```

        **Chart Analysis:**
        ```
        Analyze this chart and extract data in table format.
        ```

        **Multi-language:**
        ```
        Extract all text in multiple languages.
        ```

        **CAD Drawing:**
        ```
        Analyze this technical drawing and describe its components.
        ```
        """)

    gr.Markdown("""
    ---
    ### About
    - **Model**: [DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
    - **Project**: [DeepSeek-OCR-Web](https://github.com/fufankeji/DeepSeek-OCR-Web)
    - **GPU**: ZeroGPU (Hugging Face Spaces)

    ### Features
    - 🔍 **Image OCR**: Upload images for text extraction
    - 📄 **PDF OCR**: Extract text from PDF documents (first 3 pages)
    - 📊 **Table & Chart**: Extract tables and analyze charts
    - 🌍 **Multi-language**: Support for 100+ languages

    ### Note
    - Processing time: 30-120 seconds per image/page
    - PDF OCR limited to first 3 pages on ZeroGPU
    - For full functionality, deploy locally with GPU
    """)

if __name__ == "__main__":
    # Script entry point: enable the request queue (at most 20 pending
    # requests) and then start the app.
    demo.queue(max_size=20).launch()