LightOnOCR

Paused

App Files Files Community

IFMedTechdemo committed 25 days ago

Commit

83140b5

verified ·

1 Parent(s): 6f82d20

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -61

app.py CHANGED Viewed

@@ -16,6 +16,9 @@ from transformers import (
     TextIteratorStreamer,
 )
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Choose best attention implementation based on device
@@ -43,6 +46,10 @@ processor = LightOnOCRProcessor.from_pretrained(
 )
 print("Model loaded successfully!")
 def render_pdf_page(page, max_resolution=1540, scale=2.77):
     """Render a PDF page to PIL Image."""
@@ -69,35 +76,32 @@ def process_pdf(pdf_path, page_num=1):
 def clean_output_text(text):
     """Remove chat template artifacts from output."""
-    # Remove common chat template markers
     markers_to_remove = ["system", "user", "assistant"]
-    # Split by lines and filter
     lines = text.split('\n')
     cleaned_lines = []
     for line in lines:
         stripped = line.strip()
         # Skip lines that are just template markers
         if stripped.lower() not in markers_to_remove:
             cleaned_lines.append(line)
-    # Join back and strip leading/trailing whitespace
     cleaned = '\n'.join(cleaned_lines).strip()
-    # Alternative approach: if there's an "assistant" marker, take everything after it
     if "assistant" in text.lower():
         parts = text.split("assistant", 1)
         if len(parts) > 1:
             cleaned = parts[1].strip()
     return cleaned
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2, stream=False):
     """Extract text from image using LightOnOCR model."""
-    # Prepare the chat format
     chat = [
         {
             "role": "user",
@@ -106,8 +110,6 @@ def extract_text_from_image(image, temperature=0.2, stream=False):
             ],
         }
     ]
-    # Apply chat template and tokenize
     inputs = processor.apply_chat_template(
         chat,
         add_generation_prompt=True,
@@ -115,15 +117,12 @@ def extract_text_from_image(image, temperature=0.2, stream=False):
         return_dict=True,
         return_tensors="pt"
     )
-    # Move inputs to device AND convert to the correct dtype
     inputs = {
         k: v.to(device=device, dtype=dtype) if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
         else v.to(device) if isinstance(v, torch.Tensor)
         else v
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=2048,
@@ -131,54 +130,38 @@ def extract_text_from_image(image, temperature=0.2, stream=False):
         use_cache=True,
         do_sample=temperature > 0,
     )
     if stream:
-        # Setup streamer for streaming generation
         streamer = TextIteratorStreamer(
             processor.tokenizer,
             skip_prompt=True,
             skip_special_tokens=True
         )
         generation_kwargs["streamer"] = streamer
-        # Run generation in a separate thread
         thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
-        # Yield chunks as they arrive
         full_text = ""
         for new_text in streamer:
             full_text += new_text
-            # Clean the accumulated text
             cleaned_text = clean_output_text(full_text)
             yield cleaned_text
         thread.join()
     else:
         # Non-streaming generation
         with torch.no_grad():
             outputs = model.generate(**generation_kwargs)
-        # Decode the output
         output_text = processor.decode(outputs[0], skip_special_tokens=True)
-        # Clean the output
         cleaned_text = clean_output_text(output_text)
         yield cleaned_text
 def process_input(file_input, temperature, page_num, enable_streaming):
-    """Process uploaded file (image or PDF) and extract text with optional streaming."""
     if file_input is None:
         yield "Please upload an image or PDF first.", "", "", None, gr.update()
         return
     image_to_process = None
     page_info = ""
     file_path = file_input if isinstance(file_input, str) else file_input.name
     # Handle PDF files
     if file_path.lower().endswith('.pdf'):
         try:
@@ -195,24 +178,20 @@ def process_input(file_input, temperature, page_num, enable_streaming):
         except Exception as e:
             yield f"Error opening image: {str(e)}", "", "", None, gr.update()
             return
     try:
-        # Extract text using LightOnOCR with optional streaming
         for extracted_text in extract_text_from_image(image_to_process, temperature, stream=enable_streaming):
-            yield extracted_text, extracted_text, page_info, image_to_process, gr.update()
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
         yield error_msg, error_msg, page_info, image_to_process, gr.update()
 def update_slider(file_input):
     """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
     if file_path.lower().endswith('.pdf'):
         try:
             pdf = pdfium.PdfDocument(file_path)
@@ -224,25 +203,23 @@ def update_slider(file_input):
     else:
         return gr.update(maximum=1, value=1)
-# Create Gradio interface
-with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"""
-# 📖 Image/PDF to Text Extraction with LightOnOCR
 **💡 How to use:**
 1. Upload an image or PDF
-2. For PDFs: select which page to extract (1-20)
 3. Adjust temperature if needed
-4. Click "Extract Text"
-**Note:** The Markdown rendering for tables may not always be perfect. Check the raw output for complex tables!
 **Model:** LightOnOCR-1B-1025 by LightOn AI
 **Device:** {device.upper()}
 **Attention:** {attn_implementation}
 """)
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
@@ -282,43 +259,37 @@ with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft(
                 value=True,
                 info="Show text progressively as it's generated"
             )
-            submit_btn = gr.Button("Extract Text", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             output_text = gr.Markdown(
-                label="📄 Extracted Text (Rendered)",
-                value="*Extracted text will appear here...*"
             )
     with gr.Row():
         with gr.Column():
             raw_output = gr.Textbox(
-                label="Raw Markdown Output",
-                placeholder="Raw text will appear here...",
                 lines=20,
                 max_lines=30,
                 show_copy_button=True
             )
     # Event handlers
     submit_btn.click(
         fn=process_input,
         inputs=[file_input, temperature, num_pages, enable_streaming],
         outputs=[output_text, raw_output, page_info, rendered_image, num_pages]
     )
     file_input.change(
         fn=update_slider,
         inputs=[file_input],
         outputs=[num_pages]
     )
     clear_btn.click(
-        fn=lambda: (None, "*Extracted text will appear here...*", "", "", None, 1),
         outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages]
     )
 if __name__ == "__main__":
-    demo.launch()

     TextIteratorStreamer,
 )
+# ---- CLINICAL NER IMPORTS ----
+import spacy
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Choose best attention implementation based on device
 )
 print("Model loaded successfully!")
+# ---- LOAD CLINICAL NER MODEL (BC5CDR) ----
+print("Loading clinical NER model (bc5cdr)...")
+nlp_ner = spacy.load("en_ner_bc5cdr_md")
+print("Clinical NER loaded.")
 def render_pdf_page(page, max_resolution=1540, scale=2.77):
     """Render a PDF page to PIL Image."""
 def clean_output_text(text):
     """Remove chat template artifacts from output.

     If the decoded text still contains the prompt, everything up to and
     including the first line that consists solely of the ``assistant`` role
     marker is dropped. Any remaining lines that are nothing but a role
     marker ("system", "user", "assistant", compared case-insensitively) are
     then removed, and the result is stripped of leading/trailing whitespace.

     The word "assistant" appearing inside a longer line is ordinary content
     and is left untouched; the previous substring-based split would discard
     everything before such an occurrence.

     Args:
         text: Decoded model output, possibly including template markers.

     Returns:
         The cleaned text.
     """
     markers_to_remove = {"system", "user", "assistant"}
     lines = text.split('\n')
     # Only a line that is exactly the lowercase role marker counts as the
     # start of the answer; this mirrors the case-sensitive split the
     # original code performed, without matching inside real content.
     for index, line in enumerate(lines):
         if line.strip() == "assistant":
             lines = lines[index + 1:]
             break
     # Apply the marker-line filter on every path so output is cleaned the
     # same way whether or not the prompt was present in the input.
     cleaned_lines = [
         line for line in lines
         if line.strip().lower() not in markers_to_remove
     ]
     return '\n'.join(cleaned_lines).strip()
+def extract_medication_names(text):
+    """Extract medication names using clinical NER (spacy: bc5cdr CHEMICAL)."""
+    doc = nlp_ner(text)
+    meds = [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]
+    meds_unique = list(dict.fromkeys(meds))
+    return meds_unique
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2, stream=False):
     """Extract text from image using LightOnOCR model."""
     chat = [
         {
             "role": "user",
             ],
         }
     ]
     inputs = processor.apply_chat_template(
         chat,
         add_generation_prompt=True,
         return_dict=True,
         return_tensors="pt"
     )
     inputs = {
         k: v.to(device=device, dtype=dtype) if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
         else v.to(device) if isinstance(v, torch.Tensor)
         else v
         for k, v in inputs.items()
     }
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=2048,
         use_cache=True,
         do_sample=temperature > 0,
     )
     if stream:
+        # Streaming generation
         streamer = TextIteratorStreamer(
             processor.tokenizer,
             skip_prompt=True,
             skip_special_tokens=True
         )
         generation_kwargs["streamer"] = streamer
         thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         full_text = ""
         for new_text in streamer:
             full_text += new_text
             cleaned_text = clean_output_text(full_text)
             yield cleaned_text
         thread.join()
     else:
         # Non-streaming generation
         with torch.no_grad():
             outputs = model.generate(**generation_kwargs)
         output_text = processor.decode(outputs[0], skip_special_tokens=True)
         cleaned_text = clean_output_text(output_text)
         yield cleaned_text
 def process_input(file_input, temperature, page_num, enable_streaming):
+    """Process uploaded file (image or PDF) and extract medication names via OCR+NER."""
     if file_input is None:
         yield "Please upload an image or PDF first.", "", "", None, gr.update()
         return
     image_to_process = None
     page_info = ""
     file_path = file_input if isinstance(file_input, str) else file_input.name
     # Handle PDF files
     if file_path.lower().endswith('.pdf'):
         try:
         except Exception as e:
             yield f"Error opening image: {str(e)}", "", "", None, gr.update()
             return
     try:
         for extracted_text in extract_text_from_image(image_to_process, temperature, stream=enable_streaming):
+            meds = extract_medication_names(extracted_text)
+            meds_str = "\n".join(meds) if meds else "No medications found."
+            yield meds_str, meds_str, page_info, image_to_process, gr.update()
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
         yield error_msg, error_msg, page_info, image_to_process, gr.update()
 def update_slider(file_input):
     """Update page slider based on PDF page count."""
     if file_input is None:
         return gr.update(maximum=20, value=1)
     file_path = file_input if isinstance(file_input, str) else file_input.name
     if file_path.lower().endswith('.pdf'):
         try:
             pdf = pdfium.PdfDocument(file_path)
     else:
         return gr.update(maximum=1, value=1)
+# ----- GRADIO UI -----
+with gr.Blocks(title="📖 Image/PDF OCR + Clinical NER", theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"""
+# 📖 Medication Extraction from Image/PDF with LightOnOCR + Clinical NER
 **💡 How to use:**
 1. Upload an image or PDF
+2. For PDFs: select which page to extract
 3. Adjust temperature if needed
+4. Click "Extract Medications"
+**Output:** Only medication names found in text (via NER)
 **Model:** LightOnOCR-1B-1025 by LightOn AI
 **Device:** {device.upper()}
 **Attention:** {attn_implementation}
 """)
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
                 value=True,
                 info="Show text progressively as it's generated"
             )
+            submit_btn = gr.Button("Extract Medications", variant="primary")
             clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=2):
             output_text = gr.Markdown(
+                label="🩺 Extracted Medication Names",
+                value="*Medication names will appear here...*"
             )
     with gr.Row():
         with gr.Column():
             raw_output = gr.Textbox(
+                label="Extracted Medication Names (Raw)",
+                placeholder="Medication list will appear here...",
                 lines=20,
                 max_lines=30,
                 show_copy_button=True
             )
     # Event handlers
     submit_btn.click(
         fn=process_input,
         inputs=[file_input, temperature, num_pages, enable_streaming],
         outputs=[output_text, raw_output, page_info, rendered_image, num_pages]
     )
     file_input.change(
         fn=update_slider,
         inputs=[file_input],
         outputs=[num_pages]
     )
     clear_btn.click(
+        fn=lambda: (None, "*Medication names will appear here...*", "", "", None, 1),
         outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages]
     )
 if __name__ == "__main__":
+    demo.launch()