LightOnOCR

Paused

App Files Files Community

IFMedTechdemo committed 20 days ago

Commit

95d2834

verified ·

1 Parent(s): 60d9cea

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -105

app.py CHANGED Viewed

@@ -8,27 +8,23 @@ import torch
 import gradio as gr
 from PIL import Image
-from io import BytesIO
 import pypdfium2 as pdfium
 from transformers import (
     LightOnOCRForConditionalGeneration,
     LightOnOCRProcessor,
 )
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     attn_implementation = "sdpa"
     dtype = torch.bfloat16
-    print("Using sdpa for GPU")
 else:
     attn_implementation = "eager"
     dtype = torch.float32
-    print("Using eager attention for CPU")
-print(f"Loading LightOnOCR model on {device} with {attn_implementation} attention...")
 ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
     "lightonai/LightOnOCR-1B-1025",
     attn_implementation=attn_implementation,
@@ -40,10 +36,7 @@ processor = LightOnOCRProcessor.from_pretrained(
     "lightonai/LightOnOCR-1B-1025",
     trust_remote_code=True,
 )
-print("LightOnOCR model loaded successfully!")
-# -------- Clinical NER models (load ONCE) --------
-print("Loading clinical NER model...")
 ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
 ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
 ner_pipeline = pipeline(
@@ -52,11 +45,8 @@ ner_pipeline = pipeline(
     tokenizer=ner_tokenizer,
     aggregation_strategy="simple",
 )
-print("Clinical NER model loaded successfully!")
 def render_pdf_page(page, max_resolution=1540, scale=2.77):
-    """Render a PDF page to PIL Image."""
     width, height = page.get_size()
     pixel_width = width * scale
     pixel_height = height * scale
@@ -64,61 +54,58 @@ def render_pdf_page(page, max_resolution=1540, scale=2.77):
     target_scale = scale * resize_factor
     return page.render(scale=target_scale, rev_byteorder=True).to_pil()
 def process_pdf(pdf_path, page_num=1):
-    """Extract a specific page from PDF."""
     pdf = pdfium.PdfDocument(pdf_path)
     total_pages = len(pdf)
     page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
     page = pdf[page_idx]
     img = render_pdf_page(page)
     pdf.close()
     return img, total_pages, page_idx + 1
 def clean_output_text(text):
-    """Remove chat template artifacts from output."""
-    # Remove common chat template markers
     markers_to_remove = ["system", "user", "assistant"]
-    # Split by lines and filter
     lines = text.split('\n')
     cleaned_lines = []
     for line in lines:
         stripped = line.strip()
-        # Skip lines that are just template markers
         if stripped.lower() not in markers_to_remove:
             cleaned_lines.append(line)
-    # Join back and strip leading/trailing whitespace
     cleaned = '\n'.join(cleaned_lines).strip()
-    # Alternative approach: if there's an "assistant" marker, take everything after it
     if "assistant" in text.lower():
         parts = text.split("assistant", 1)
         if len(parts) > 1:
             cleaned = parts[1].strip()
     return cleaned
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2):
-    """Extract text from image using LightOnOCR model, and run clinical NER."""
-    # Prepare the chat format
     chat = [
         {
             "role": "user",
             "content": [
-                {"type": "image", "url": image},  # adjust to {"type": "image", "image": image} if LightOnOCR expects that
             ],
         }
     ]
-    # Tokenize
     inputs = processor.apply_chat_template(
         chat,
         add_generation_prompt=True,
@@ -126,7 +113,6 @@ def extract_text_from_image(image, temperature=0.2):
         return_dict=True,
         return_tensors="pt",
     )
     # Move inputs to device
     inputs = {
         k: (
@@ -138,7 +124,6 @@ def extract_text_from_image(image, temperature=0.2):
         )
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=2048,
@@ -146,19 +131,12 @@ def extract_text_from_image(image, temperature=0.2):
         use_cache=True,
         do_sample=temperature > 0,
     )
-    # Non-streaming generation
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
-    cleaned_text = clean_output_text(output_text)
-    print("\n this is cleaned_text",cleaned_text )
-    # Clinical NER on the full cleaned text
-    entities = ner_pipeline(cleaned_text)
-    print("\n this is entity",entities)
     medications = []
     for ent in entities:
         if ent["entity_group"] == "treatment":
@@ -167,28 +145,19 @@ def extract_text_from_image(image, temperature=0.2):
                 medications[-1] += word[2:]
             else:
                 medications.append(word)
     medications_str = ", ".join(set(medications)) if medications else "None detected"
-    yield cleaned_text, medications_str
 def process_input(file_input, temperature, page_num):
-    """Process uploaded file (image or PDF) and extract text with optional streaming."""
     if file_input is None:
-        # 6 outputs: [output_text, medications_output, raw_output, page_info, rendered_image, num_pages]
-        yield "Please upload an image or PDF first.", "", "", "", None, 1
         return
     image_to_process = None
     page_info = ""
     slider_value = page_num
     file_path = file_input if isinstance(file_input, str) else file_input.name
-    # Handle PDF files
     if file_path.lower().endswith(".pdf"):
         try:
             image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
@@ -199,7 +168,6 @@ def process_input(file_input, temperature, page_num):
             yield msg, "", msg, "", None, slider_value
             return
     else:
-        # Handle image files
         try:
             image_to_process = Image.open(file_path)
             page_info = "Processing image"
@@ -209,29 +177,18 @@ def process_input(file_input, temperature, page_num):
             return
     try:
-        # Extract text using LightOnOCR with optional streaming
-        for extracted_text, medications in extract_text_from_image(
             image_to_process, temperature
         ):
-            raw_md = extracted_text  # or you can keep a different raw version
-            # 6 outputs: markdown_text, medications, raw_output, page_info, image, slider
-            yield extracted_text, medications, raw_md, page_info, image_to_process, gr.update(
-                value=slider_value
-            )
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
-        # 6 outputs
-        yield error_msg, "", error_msg, page_info, image_to_process, gr.update(value=slider_value)
 def update_slider(file_input):
-    """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
     if file_path.lower().endswith('.pdf'):
         try:
             pdf = pdfium.PdfDocument(file_path)
@@ -243,6 +200,75 @@ def update_slider(file_input):
     else:
         return gr.update(maximum=1, value=1)
 # Create Gradio interface
 # with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo:
@@ -330,41 +356,6 @@ def update_slider(file_input):
 #     outputs=[output_text, medications_output, raw_output, page_info, rendered_image, num_pages]
 # )
-with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
-    file_input = gr.File(
-        label="🖼️ Upload Image or PDF",
-        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
-        type="filepath"
-    )
-    temperature = gr.Slider(
-        minimum=0.0,
-        maximum=1.0,
-        value=0.2,
-        step=0.05,
-        label="Temperature",
-        info="0.0 = deterministic, Higher = more varied"
-    )
-    medicines_output = gr.Textbox(
-        label="💊 Extracted Medicines/Drugs",
-        placeholder="Medicine/drug names will appear here...",
-        lines=2,
-        max_lines=5,
-        interactive=False,
-        show_copy_button=True
-    )
-    submit_btn = gr.Button("Extract Medicines", variant="primary")
-    submit_btn.click(
-        fn=process_input,  # already yields medicines as second output
-        inputs=[file_input, temperature, 1],  # fix page=1 or expose slider
-        outputs=[gr.update(), medicines_output, gr.update(), gr.update(), gr.update(), gr.update()]
-    )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from PIL import Image
+import numpy as np
+import cv2
 import pypdfium2 as pdfium
 from transformers import (
     LightOnOCRForConditionalGeneration,
     LightOnOCRProcessor,
 )
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     attn_implementation = "sdpa"
     dtype = torch.bfloat16
 else:
     attn_implementation = "eager"
     dtype = torch.float32
 ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
     "lightonai/LightOnOCR-1B-1025",
     attn_implementation=attn_implementation,
     "lightonai/LightOnOCR-1B-1025",
     trust_remote_code=True,
 )
 ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
 ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
 ner_pipeline = pipeline(
     tokenizer=ner_tokenizer,
     aggregation_strategy="simple",
 )
 def render_pdf_page(page, max_resolution=1540, scale=2.77):
     width, height = page.get_size()
     pixel_width = width * scale
     pixel_height = height * scale
     target_scale = scale * resize_factor
     return page.render(scale=target_scale, rev_byteorder=True).to_pil()
 def process_pdf(pdf_path, page_num=1):
     """Render one page of a PDF and report which page was used.
     Args:
         pdf_path: Path of the PDF, passed straight to ``pdfium.PdfDocument``.
         page_num: 1-based page number requested by the caller; values
             outside the document are clamped to the first/last page.
     Returns:
         A tuple ``(img, total_pages, page_number)`` where ``img`` is the
         result of ``render_pdf_page`` for the selected page,
         ``total_pages`` is ``len(pdf)`` and ``page_number`` is the 1-based
         page actually rendered after clamping.
     """
     pdf = pdfium.PdfDocument(pdf_path)
     try:
         total_pages = len(pdf)
         # Convert to a 0-based index and clamp into [0, total_pages - 1].
         page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
         img = render_pdf_page(pdf[page_idx])
     finally:
         # Release the document even when indexing or rendering raises.
         pdf.close()
     return img, total_pages, page_idx + 1
 def clean_output_text(text):
     """Strip chat-template role markers from decoded model output.
     A line is a role marker when, ignoring surrounding whitespace and case,
     it is exactly ``system``, ``user`` or ``assistant``.
     If an ``assistant`` marker line is present, everything after the first
     one is returned (stripped). Otherwise all marker lines are dropped and
     the remaining text is returned (stripped). The word "assistant"
     appearing inside ordinary text is left alone.
     Args:
         text: Decoded model output.
     Returns:
         The cleaned text with leading/trailing whitespace removed.
     """
     markers = {"system", "user", "assistant"}
     lines = text.split('\n')
     for idx, line in enumerate(lines):
         if line.strip().lower() == "assistant":
             # Only a standalone marker line delimits the reply; splitting on
             # the bare substring would truncate text that merely contains
             # the word "assistant".
             return '\n'.join(lines[idx + 1:]).strip()
     kept = [line for line in lines if line.strip().lower() not in markers]
     return '\n'.join(kept).strip()
+def preprocess_image_for_ocr(image):
+    """Convert PIL.Image to adaptive thresholded image for OCR."""
+    image_rgb = image.convert("RGB")
+    img_np = np.array(image_rgb)
+    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
+    adaptive_threshold = cv2.adaptiveThreshold(
+        gray,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        85,
+        11,
+    )
+    preprocessed_pil = Image.fromarray(adaptive_threshold)
+    return preprocessed_pil
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2):
+    """OCR + clinical NER, with preprocessing."""
+    processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
             "role": "user",
             "content": [
+                {"type": "image", "image": processed_img}
             ],
         }
     ]
     inputs = processor.apply_chat_template(
         chat,
         add_generation_prompt=True,
         return_dict=True,
         return_tensors="pt",
     )
     # Move inputs to device
     inputs = {
         k: (
         )
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=2048,
         use_cache=True,
         do_sample=temperature > 0,
     )
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
+    cleaned_text = clean_output_text(output_text)
+    entities = ner_pipeline(cleaned_text)
     medications = []
     for ent in entities:
         if ent["entity_group"] == "treatment":
                 medications[-1] += word[2:]
             else:
                 medications.append(word)
     medications_str = ", ".join(set(medications)) if medications else "None detected"
+    yield cleaned_text, medications_str, output_text, processed_img
 def process_input(file_input, temperature, page_num):
     if file_input is None:
+        yield "Please upload an image or PDF first.", "", "", "", "No file!", 1
         return
     image_to_process = None
     page_info = ""
     slider_value = page_num
     file_path = file_input if isinstance(file_input, str) else file_input.name
     if file_path.lower().endswith(".pdf"):
         try:
             image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
             yield msg, "", msg, "", None, slider_value
             return
     else:
         try:
             image_to_process = Image.open(file_path)
             page_info = "Processing image"
             return
     try:
+        for cleaned_text, medications, raw_md, processed_img in extract_text_from_image(
             image_to_process, temperature
         ):
+            yield cleaned_text, medications, raw_md, page_info, processed_img, slider_value
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
+        yield error_msg, "", error_msg, page_info, image_to_process, slider_value
 def update_slider(file_input):
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
     if file_path.lower().endswith('.pdf'):
         try:
             pdf = pdfium.PdfDocument(file_path)
     else:
         return gr.update(maximum=1, value=1)
+with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
+    file_input = gr.File(
+        label="🖼️ Upload Image or PDF",
+        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
+        type="filepath"
+    )
+    temperature = gr.Slider(
+        minimum=0.0,
+        maximum=1.0,
+        value=0.2,
+        step=0.05,
+        label="Temperature"
+    )
+    page_slider = gr.Slider(
+        minimum=1, maximum=20, value=1, step=1,
+        label="Page Number (PDF only)",
+        interactive=True
+    )
+    output_text = gr.Textbox(
+        label="📝 Extracted Text",
+        lines=4,
+        max_lines=10,
+        interactive=False,
+        show_copy_button=True
+    )
+    medicines_output = gr.Textbox(
+        label="💊 Extracted Medicines/Drugs",
+        placeholder="Medicine/drug names will appear here...",
+        lines=2,
+        max_lines=5,
+        interactive=False,
+        show_copy_button=True
+    )
+    raw_output = gr.Textbox(
+        label="Raw Model Output",
+        lines=2,
+        max_lines=5,
+        interactive=False
+    )
+    page_info = gr.Markdown(
+        value="",  # Info of PDF page
+        interactive=False
+    )
+    rendered_image = gr.Image(
+        label="Processed Image (Thresholded for OCR)",
+        interactive=False
+    )
+    num_pages = gr.Number(
+        value=1, label="Current Page (slider)", visible=False
+    )
+    submit_btn = gr.Button("Extract Medicines", variant="primary")
+    submit_btn.click(
+        fn=process_input,
+        inputs=[file_input, temperature, page_slider],
+        outputs=[output_text, medicines_output, raw_output, page_info, rendered_image, num_pages]
+    )
+    file_input.change(
+        fn=update_slider,
+        inputs=[file_input],
+        outputs=[page_slider]
+    )
+if __name__ == "__main__":
+    demo.launch()
 # Create Gradio interface
 # with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo:
 #     outputs=[output_text, medications_output, raw_output, page_info, rendered_image, num_pages]
 # )