# Source: Hugging Face Spaces - Andertseng / DeepSeek-OCR (Running on Zero)
# File size: 9,153 Bytes

"""
DeepSeek-OCR Gradio Interface for Hugging Face Spaces
------------------------------------------------------
Simplified Gradio app optimized for ZeroGPU deployment
"""

import gradio as gr
import torch
from PIL import Image
import tempfile
import os
from pathlib import Path
import spaces
import fitz  # PyMuPDF

# Module-level cache for the OCR model and its processor. Both start out as
# None and are filled in lazily on first use (ZeroGPU deployment).
model, processor = None, None

def load_model():
    """Return the shared ``(model, processor)`` pair, loading it on first call.

    The pair is cached in the module-level ``model`` / ``processor`` globals;
    the ``from_pretrained`` calls only run while ``model`` is still ``None``.
    """
    global model, processor

    # Fast path: an earlier call already populated the cache.
    if model is not None:
        return model, processor

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # The processor package is importable as ``backend.process`` on the
    # Hugging Face Space and as ``process`` in a local deployment.
    try:
        from backend.process.image_process import DeepseekOCRProcessor
    except ImportError:
        from process.image_process import DeepseekOCRProcessor

    checkpoint = "deepseek-ai/DeepSeek-OCR"
    model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
    }

    print("Loading DeepSeek-OCR model...")
    processor = DeepseekOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)
    print("Model loaded successfully!")

    return model, processor

@spaces.GPU(duration=120)
def perform_ocr(image, prompt_text):
    """
    Perform OCR on a single image and return the generated text

    Args:
        image: PIL Image, or a file path (str) pointing to an image
        prompt_text: Custom prompt for the OCR task; None, empty or
            whitespace-only falls back to "Free OCR."

    Returns:
        str: Decoded model output. Failures are not raised; they are
            returned as a string starting with "Error during OCR processing:"
            so the UI can display them.
    """
    try:
        # Validate and normalise the input first so that a missing or invalid
        # image fails fast, before the (slow) model load.
        if isinstance(image, str):
            # convert() returns a fully loaded copy, so the file handle can be
            # released right away.
            with Image.open(image) as source:
                image = source.convert("RGB")
        elif isinstance(image, Image.Image):
            # Uploaded PIL images may be RGBA / P / L; give them the same RGB
            # normalisation that path inputs get.
            if image.mode != "RGB":
                image = image.convert("RGB")
        else:
            raise ValueError("Invalid image input")

        # Prepare prompt (the "<image>" tag precedes the instruction text)
        if prompt_text and prompt_text.strip():
            prompt = f"<image>\n{prompt_text}"
        else:
            prompt = "<image>\nFree OCR."

        # Load model (cached after the first call)
        model, processor = load_model()

        # Process image
        inputs = processor.tokenize_with_images(
            images=[image],
            prompt=prompt,
            bos=True,
            eos=True,
            cropping=True
        )

        # Move tensors to the model's device; non-tensor entries pass through
        inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

        # Generate without tracking gradients
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
                temperature=1.0,
                top_p=1.0,
                use_cache=True,
            )

        # Decode only the tokens that follow the prompt: skip the first
        # input_ids.shape[1] positions of the returned sequence.
        prompt_length = inputs['input_ids'].shape[1]
        result = processor.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        return result

    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of raising.
        return f"Error during OCR processing: {str(e)}"

@spaces.GPU(duration=180)
def process_pdf(pdf_file, prompt_text):
    """
    Process PDF file (run OCR on the first few pages)

    Each of the first 3 pages is rendered to an image at 2x resolution and
    passed to perform_ocr; the per-page results are joined into one string.

    Args:
        pdf_file: Uploaded PDF file path (string)
        prompt_text: Custom prompt for OCR task

    Returns:
        str: Extracted text from the processed pages, each prefixed with a
            "--- Page N ---" header, or a message starting with "❌" when the
            input is missing/invalid or processing fails.
    """
    try:
        # Validate file upload (None and "" both mean nothing was uploaded)
        if not pdf_file:
            return "❌ Please upload a PDF file first."

        # pdf_file is a filepath string
        pdf_path = pdf_file

        # Check if file exists
        if not os.path.exists(pdf_path):
            return f"❌ File not found: {pdf_path}"

        results = []

        # The context manager closes the document on every exit path,
        # including early returns and exceptions raised while rendering.
        with fitz.open(pdf_path) as pdf_document:
            total_pages = len(pdf_document)

            if total_pages == 0:
                return "❌ PDF file is empty (0 pages)."

            # Process first 3 pages (to avoid timeout)
            max_pages = min(3, total_pages)

            for page_num in range(max_pages):
                page = pdf_document[page_num]

                # Render the page at 2x resolution without an alpha channel,
                # then wrap the raw RGB samples in a PIL image directly. This
                # avoids the temp-file round trip and its cleanup.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

                # Perform OCR
                result = perform_ocr(image, prompt_text)
                results.append(f"--- Page {page_num + 1} ---\n{result}\n")

        # Add note if PDF has more pages
        if max_pages < total_pages:
            results.append(f"\n(Only first {max_pages} pages processed. Full PDF has {total_pages} pages)")

        return "\n".join(results)

    except Exception as e:
        import traceback
        # Keep the full traceback in the server logs; the user only sees the
        # short message below.
        traceback.print_exc()
        return f"❌ Error processing PDF: {str(e)}\n\nPlease make sure you uploaded a valid PDF file."

# Create Gradio Interface
# Layout: an intro banner, three tabs (Image OCR, PDF OCR, Advanced Prompts)
# and an "About" footer. The two OCR tabs wire their buttons to perform_ocr
# and process_pdf respectively.
with gr.Blocks(title="DeepSeek-OCR Studio", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 DeepSeek-OCR Studio

    Advanced OCR system supporting:
    - 📝 Multi-language text recognition (Chinese, English, etc.)
    - 📊 Table & chart extraction
    - 🎨 Professional drawing analysis (CAD, flowcharts)
    - 📄 PDF document processing & OCR
    - 📐 Layout analysis & Markdown conversion

    **Note**: Running on ZeroGPU - first request may take longer to load the model.
    """)

    with gr.Tab("Image OCR"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                image_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                image_btn = gr.Button("Extract Text", variant="primary")

            with gr.Column():
                image_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        image_btn.click(
            fn=perform_ocr,
            inputs=[image_input, image_prompt],
            outputs=image_output
        )

        # Only offer the example images that are actually present on disk;
        # the examples are optional ("if available") and referencing missing
        # files may break building the UI.
        available_examples = [
            example
            for example in [
                ["examples/sample1.png", "Free OCR."],
                ["examples/sample2.jpg", "Extract all text and tables."],
            ]
            if os.path.exists(example[0])
        ]
        if available_examples:
            gr.Examples(
                examples=available_examples,
                inputs=[image_input, image_prompt],
                label="Example Images (if available)"
            )

    with gr.Tab("PDF OCR"):
        with gr.Row():
            with gr.Column():
                # type="filepath" hands process_pdf a path string
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="filepath"
                )
                pdf_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                pdf_btn = gr.Button("Process PDF (First 3 Pages)", variant="primary")

            with gr.Column():
                pdf_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        pdf_btn.click(
            fn=process_pdf,
            inputs=[pdf_input, pdf_prompt],
            outputs=pdf_output
        )

    with gr.Tab("Advanced Prompts"):
        gr.Markdown("""
        ### Prompt Examples

        **Basic OCR:**
        ```
        Free OCR.
        ```

        **Table Extraction:**
        ```
        Extract all tables and convert to markdown format.
        ```

        **Chart Analysis:**
        ```
        Analyze this chart and extract data in table format.
        ```

        **Multi-language:**
        ```
        Extract all text in multiple languages.
        ```

        **CAD Drawing:**
        ```
        Analyze this technical drawing and describe its components.
        ```
        """)

    gr.Markdown("""
    ---
    ### About
    - **Model**: [DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
    - **Project**: [DeepSeek-OCR-Web](https://github.com/fufankeji/DeepSeek-OCR-Web)
    - **GPU**: ZeroGPU (Hugging Face Spaces)

    ### Features
    - 🔍 **Image OCR**: Upload images for text extraction
    - 📄 **PDF OCR**: Extract text from PDF documents (first 3 pages)
    - 📊 **Table & Chart**: Extract tables and analyze charts
    - 🌍 **Multi-language**: Support for 100+ languages

    ### Note
    - Processing time: 30-120 seconds per image/page
    - PDF OCR limited to first 3 pages on ZeroGPU
    - For full functionality, deploy locally with GPU
    """)

if __name__ == "__main__":
    # Script entry point: enable the request queue (max_size=20), then
    # start the Gradio server.
    demo.queue(max_size=20).launch()