Spaces:

AuditEdge
/

optimised-ocr

Running

App Files Files Community

Mallisetty Siva Mahesh committed on Feb 24

Commit

c04d620

1 Parent(s): caa039b

added msme and cinllpin

Browse files

Files changed (2) hide show

app.py +111 -55
utils.py +122 -40

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ from fastapi import FastAPI, HTTPException, Request
 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 from dotenv import load_dotenv
 import urllib.parse
-from utils import doc_processing
 # Load .env file
 load_dotenv()
@@ -143,59 +143,91 @@ for dir_path in process_dirs.values():
 logging.basicConfig(level=logging.INFO)
-# Perform Inference with optional S3 upload
 def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
     model_dirs = {
         "pan_file": pan_model,
         "gst_file": gst_model,
         "cheque_file": cheque_model,
     }
     try:
         inference_results = {}
         for doc_type, file_path in file_paths.items():
-            if doc_type in model_dirs:
-                print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
-                processed_file_p = file_path.split("&&")[0]
-                unprocessed_file_path = file_path.split("&&")[1]
-                images_path = [processed_file_p]
-                inference_batch = prepare_batch_for_inference(images_path)
-                context = model_dirs[doc_type]
-                processor = globals()[f"processor_{doc_type.split('_')[0]}"]
-                name = doc_type.split("_")[0]
-                attachemnt_num = {
-                    "pan_file": 2,
-                    "gst_file": 4,
-                    "msme_file": 5,
-                    "cin_llpin_file": 6,
-                    "cheque_file": 8,
-                }[doc_type]
-                if upload_to_s3:
-                    client = s3_client()
-                    bucket_name = "edgekycdocs"
-                    folder_name = f"{name}docs"
-                    file_name = unprocessed_file_path.split("/")[-1]
                     response = client.upload_file(
                         unprocessed_file_path, bucket_name, folder_name, file_name
                     )
                     print("The file has been uploaded to S3 bucket", response)
                     attachment_url = response["url"]
-                else:
                     attachment_url = None
-                result = handle(inference_batch, context, processor, name)
-                result["attachment_url"] = attachment_url
-                result["detect"] = True
-                inference_results[f"attachment_{attachemnt_num}"] = result
-            else:
-                print(f"Model directory not found for {doc_type}. Skipping.")
         return inference_results
-    except:
         return {"status": "error", "message": "Text extraction failed."}
@@ -234,21 +266,31 @@ async def aadhar_ocr(
     print("file_paths", file_paths)
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
-        print("variables required", name, id_type, doc_type, f_path)
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
-        print("response after preprocessing", response)
-        files[key] = response["output_p"] + "&&" + f_path
-        # files["unprocessed_file_path"] = f_path
-        print("response", response)
     # Perform inference
     result = perform_inference(files, upload_to_s3)
@@ -307,16 +349,30 @@ async def document_ocr_s3(request: Request):
     logging.info(f"Downloaded files: {list(file_paths.keys())}")
     files = {}
-    for key, value in file_paths.items():
-        name = value.split("/")[-1].split(".")[0]
-        id_type = key.split("_")[0]
-        doc_type = value.split("/")[-1].split(".")[-1]
-        f_path = value
-        preprocessing = doc_processing(name, id_type, doc_type, f_path)
-        response = preprocessing.process()
-        files[key] = response["output_p"] + "&&" + f_path
     result = perform_inference(files, upload_to_s3)

 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
 from dotenv import load_dotenv
 import urllib.parse
+from utils import doc_processing, extract_document_number_from_file
 # Load .env file
 load_dotenv()
 logging.basicConfig(level=logging.INFO)
 def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
+    """
+    Build the per-document result dict for every entry in file_paths.
+
+    Each value in file_paths is split on "&&": the first field is either an
+    extracted document number or a processed image path, the second field is
+    the original file path. Keys are document types such as "pan_file".
+
+    When upload_to_s3 is true the original file is uploaded and the returned
+    URL is stored as "attachment_url"; a failed upload is printed and leaves
+    "attachment_url" as None.
+
+    Returns a dict keyed "attachment_<n>". Any exception raised while
+    processing is caught and replaced by
+    {"status": "error", "message": "Text extraction failed."}.
+    """
     model_dirs = {
         "pan_file": pan_model,
         "gst_file": gst_model,
         "cheque_file": cheque_model,
     }
     try:
         inference_results = {}
         for doc_type, file_path in file_paths.items():
+            processed_file_p = file_path.split("&&")[
+                0
+            ]  # Extracted document number or processed image
+            unprocessed_file_path = file_path.split("&&")[1]  # Original file path
+            print(f"Processing {doc_type}: {processed_file_p}")
+            # Determine the attachment number based on the document type
+            attachment_num = {
+                "pan_file": 2,
+                "gst_file": 4,
+                "msme_file": 5,
+                "cin_llpin_file": 6,
+                "cheque_file": 8,
+            }.get(doc_type, None)
+            # NOTE(review): "aadhar_file" has no entry in the map above, so it is
+            # skipped here and never reaches the OCR branch below that lists it.
+            if attachment_num is None:
+                print(f"Skipping {doc_type}, not recognized.")
+                continue
+            # Upload file to S3 if required
+            if upload_to_s3:
+                client = s3_client()
+                bucket_name = "edgekycdocs"
+                folder_name = f"{doc_type.split('_')[0]}docs"
+                file_name = unprocessed_file_path.split("/")[-1].replace(" ", "_")
+                try:
                     response = client.upload_file(
                         unprocessed_file_path, bucket_name, folder_name, file_name
                     )
                     print("The file has been uploaded to S3 bucket", response)
                     attachment_url = response["url"]
+                    print(f"File uploaded to S3: {attachment_url}")
+                except Exception as e:
+                    print(f"Failed to upload {file_name} to S3: {e}")
                     attachment_url = None
+            else:
+                attachment_url = None
+            # OCR-based types (MSME, CIN/LLPIN, Aadhaar): the first "&&" field already
+            # holds the extracted number, so it is returned without running a model.
+            if doc_type in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+                result = {
+                    "attachment_num": processed_file_p,  # Extracted document number (not the numeric attachment index)
+                    "attachment_url": attachment_url,
+                    "attachment_status": 200,
+                    "detect": True,
+                }
+            else:
+                # If the document needs ML model inference (PAN, GST, Cheque)
+                if doc_type in model_dirs:
+                    print(
+                        f"Running ML inference for {doc_type} using {model_dirs[doc_type]}"
+                    )
+                    images_path = [processed_file_p]
+                    inference_batch = prepare_batch_for_inference(images_path)
+                    context = model_dirs[doc_type]
+                    # Looks up the module-level processor named after the key prefix,
+                    # e.g. "pan_file" -> processor_pan.
+                    processor = globals()[f"processor_{doc_type.split('_')[0]}"]
+                    name = doc_type.split("_")[0]
+                    result = handle(inference_batch, context, processor, name)
+                    result["attachment_url"] = attachment_url
+                    result["detect"] = True
+                else:
+                    print(f"No model found for {doc_type}, skipping inference.")
+                    continue
+            inference_results[f"attachment_{attachment_num}"] = result
         return inference_results
+    except Exception as e:
+        print(f"Error in perform_inference: {e}")
         return {"status": "error", "message": "Text extraction failed."}
     print("file_paths", file_paths)
     files = {}
+    for key, f_path in file_paths.items():
+        name = os.path.splitext(os.path.basename(f_path))[0]
+        # Determine id_type: for cin_llpin_file, explicitly set id_type to "cin_llpin"
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        doc_type = os.path.splitext(f_path)[-1].lstrip(".")
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+            print("files", files[key])
+        else:
+            # For other files, use existing preprocessing.
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
     # Perform inference
     result = perform_inference(files, upload_to_s3)
     logging.info(f"Downloaded files: {list(file_paths.keys())}")
     files = {}
+    for key, f_path in file_paths.items():
+        name = f_path.split("/")[-1].split(".")[0]
+        if key == "cin_llpin_file":
+            id_type = "cin_llpin"
+        else:
+            id_type = key.split("_")[0]
+        # id_type = key.split("_")[0]
+        doc_type = f_path.split("/")[-1].split(".")[-1]
+        # For MSME and CIN/LLPIN files, extract document number via OCR and regex
+        if key in ["msme_file", "cin_llpin_file", "aadhar_file"]:
+            extracted_number = extract_document_number_from_file(f_path, id_type)
+            if not extracted_number:
+                logging.error(f"Failed to extract document number from {f_path}")
+                raise HTTPException(
+                    status_code=400, detail=f"Invalid document format in {key}"
+                )
+            files[key] = extracted_number + "&&" + f_path
+        else:
+            # For other documents, use the existing ML model preprocessing
+            preprocessing = doc_processing(name, id_type, doc_type, f_path)
+            response = preprocessing.process()
+            files[key] = response["output_p"] + "&&" + f_path
     result = perform_inference(files, upload_to_s3)

utils.py CHANGED Viewed

@@ -1,71 +1,75 @@
 import fitz
 from PIL import Image
 class doc_processing:
     def __init__(self, name, id_type, doc_type, f_path):
         self.name = name
         self.id_type = id_type
         self.doc_type = doc_type
         self.f_path = f_path
         # self.o_path = o_path
     def pdf_to_image_scale(self):
         pdf_document = fitz.open(self.f_path)
         if self.id_type == "gst":
             page_num = 2
         else:
             page_num = 0
         page = pdf_document.load_page(page_num)
         pix = page.get_pixmap()  # Render page as a pixmap (image)
         # Convert pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         original_width, original_height = image.size
-        print("original_width",original_width)
-        print("original_height",original_height)
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
-        print("new_width",new_width)
-        print("new_height",new_height)
-        # new_width =
-        # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
         image.save(output_path)
-        return  {"success":200,"output_p":output_path}
     def scale_img(self):
-        print("path of file",self.f_path)
         image = Image.open(self.f_path).convert("RGB")
         original_width, original_height = image.size
-        print("original_width",original_width)
-        print("original_height",original_height)
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
-        print("new_width",new_width)
-        print("new_height",new_height)
-        # new_width =
-        # new_height =
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
-        output_path = "processed_images/{}/{}.jpeg".format(self.id_type,self.name)
         image.save(output_path)
-        return {"success":200,"output_p":output_path}
     def process(self):
         if self.doc_type == "pdf":
@@ -76,12 +80,95 @@ class doc_processing:
         return response
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
 # }
@@ -89,7 +176,7 @@ class doc_processing:
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
-#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "test_Images_folder/gst/e.pdf"
 # }
@@ -102,11 +189,6 @@ class doc_processing:
 #     preprocessing = doc_processing(name,id_type,doc_type,f_path)
 #     response = preprocessing.process()
 #     print("response",response)
-    # id_type, doc_type, f_path

 import fitz
 from PIL import Image
+import re
+import io
+import os
+import logging
+import shutil
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from google.cloud import vision
+from pdf2image import convert_from_path
 class doc_processing:
     def __init__(self, name, id_type, doc_type, f_path):
+        # name: base name used for the output JPEG file.
+        # id_type: document kind; selects the output sub-folder and, for "gst",
+        #          the PDF page that is rendered.
+        # doc_type: file extension; process() compares it against "pdf".
+        # f_path: path of the input file to read.
         self.name = name
         self.id_type = id_type
         self.doc_type = doc_type
         self.f_path = f_path
         # self.o_path = o_path
     def pdf_to_image_scale(self):
+        # Renders one page of the PDF at self.f_path to an RGB image and saves it as
+        # processed_images/<id_type>/<name>.jpeg.
+        # Returns {"success": 200, "output_p": <saved path>}.
+        # NOTE(review): pdf_document is never closed in this method.
         pdf_document = fitz.open(self.f_path)
+        # GST documents use page index 2; every other type uses page index 0.
+        # NOTE(review): a GST PDF with fewer than three pages presumably fails in
+        # load_page — confirm.
         if self.id_type == "gst":
             page_num = 2
         else:
             page_num = 0
         page = pdf_document.load_page(page_num)
         pix = page.get_pixmap()  # Render page as a pixmap (image)
         # Convert pixmap to PIL Image
         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         original_width, original_height = image.size
+        print("original_width", original_width)
+        print("original_height", original_height)
+        # NOTE(review): (1000 / x) * x is ~1000 for any x, so the target size is
+        # always about 1000x1000 regardless of the original aspect ratio.
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
+        print("new_width", new_width)
+        print("new_height", new_height)
+        # new_width =
+        # new_height =
+        # NOTE(review): the value returned by resize() is discarded; if resize()
+        # returns a new image rather than resizing in place, the file saved below
+        # is the unscaled render — confirm.
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
+        return {"success": 200, "output_p": output_path}
     def scale_img(self):
+        # Opens the image at self.f_path, converts it to RGB and saves it as
+        # processed_images/<id_type>/<name>.jpeg.
+        # Returns {"success": 200, "output_p": <saved path>}.
+        print("path of file", self.f_path)
         image = Image.open(self.f_path).convert("RGB")
         original_width, original_height = image.size
+        print("original_width", original_width)
+        print("original_height", original_height)
+        # NOTE(review): (1000 / x) * x is ~1000 for any x, so the target size is
+        # always about 1000x1000 regardless of the original aspect ratio.
         new_width = (1000 / original_width) * original_width
         new_height = (1000 / original_height) * original_height
+        print("new_width", new_width)
+        print("new_height", new_height)
+        # new_width =
+        # new_height =
+        # NOTE(review): the value returned by resize() is discarded; if resize()
+        # returns a new image rather than resizing in place, the file saved below
+        # is the unscaled image — confirm.
         image.resize((int(new_width), int(new_height)), Image.Resampling.LANCZOS)
+        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
         image.save(output_path)
+        return {"success": 200, "output_p": output_path}
     def process(self):
         if self.doc_type == "pdf":
         return response
+from google.cloud import vision
+# Module-level Vision client, constructed once when utils is imported and used
+# by run_google_vision() below.
+vision_client = vision.ImageAnnotatorClient()
+def extract_document_number(ocr_text: str, id_type: str) -> str:
+    """
+    Searches the OCR text for a document number using the regex for id_type.
+
+    For id_type "cin_llpin" the CIN pattern is tried first and the LLPIN
+    pattern second. For any other id_type that is a key of `patterns`
+    ("cin", "msme", "llpin", "pan", "aadhaar") only that pattern is searched.
+
+    Returns the first matching substring, or None when nothing matches or
+    id_type is not recognised (despite the `-> str` annotation).
+    """
+    # NOTE(review): the "pan" and "aadhaar" patterns are anchored with ^...$ and
+    # compiled without re.MULTILINE, so .search() only succeeds when the whole
+    # OCR text is the number itself — confirm this is intended.
+    # NOTE(review): the key is spelled "aadhaar", while the callers shown in
+    # app.py derive id_type "aadhar" from the "aadhar_file" key; that value is
+    # not in `patterns` and falls through to `return None` — verify.
+    patterns = {
+        "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
+        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
+        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
+        "pan": re.compile(r"^[A-Z]{3}[PCHFTBALJGT][A-Z][\d]{4}[A-Z]$"),
+        "aadhaar": re.compile(r"^\d{12}$"),
+    }
+    if id_type == "cin_llpin":
+        # Try CIN first
+        match = patterns["cin"].search(ocr_text)
+        if match:
+            return match.group(0)
+        # If CIN not found, try LLPIN
+        match = patterns["llpin"].search(ocr_text)
+        if match:
+            return match.group(0)
+    elif id_type in patterns:
+        match = patterns[id_type].search(ocr_text)
+        if match:
+            return match.group(0)
+    return None
+def run_google_vision(file_content: bytes) -> str:
+    """
+    Uses Google Vision OCR to extract text from binary file content.
+
+    Returns the description of the first text annotation, or an empty string
+    when the response carries no text annotations.
+    """
+    image = vision.Image(content=file_content)
+    # NOTE(review): response.error is not inspected here, so an API-level failure
+    # would presumably look the same as "no text found" — confirm.
+    response = vision_client.text_detection(image=image)
+    texts = response.text_annotations
+    if texts:
+        # The first annotation contains the complete detected text
+        return texts[0].description
+    return ""
+def extract_text_from_file(file_path: str) -> str:
+    """
+    Reads the file from file_path. If it's a PDF, converts only the first page to an image,
+    then runs OCR using Google Vision.
+
+    Returns the OCR text. If rendering the PDF page raises, the error is logged
+    and an empty string is returned without calling OCR. Errors from opening a
+    non-PDF file are not caught here.
+    """
+    # PDF detection is by file extension only (case-insensitive).
+    if file_path.lower().endswith(".pdf"):
+        try:
+            # Open the PDF file using PyMuPDF (fitz)
+            # NOTE(review): pdf_document is never closed in this function.
+            pdf_document = fitz.open(file_path)
+            page = pdf_document.load_page(0)  # Load the first page
+            pix = page.get_pixmap()  # Render page as an image
+            # Convert pixmap to PIL Image
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # Convert image to bytes for OCR
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr, format="JPEG")
+            file_content = img_byte_arr.getvalue()
+        except Exception as e:
+            logging.error(f"Error converting PDF to image: {e}")
+            return ""
+    else:
+        # Non-PDF files are sent to OCR as raw bytes, unmodified.
+        with open(file_path, "rb") as f:
+            file_content = f.read()
+    return run_google_vision(file_content)
+def extract_document_number_from_file(file_path: str, id_type: str) -> str:
+    """
+    Extracts the document number for id_type from the file at file_path.
+
+    Runs extract_text_from_file() on the path and passes the resulting text to
+    extract_document_number(). Returns whatever that call returns, which is
+    None when no number is found (despite the `-> str` annotation).
+    """
+    ocr_text = extract_text_from_file(file_path)
+    return extract_document_number(ocr_text, id_type)
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
 # }
 # files = {
 #     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
+#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
 #     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
 #     "gst_file": "test_Images_folder/gst/e.pdf"
 # }
 #     preprocessing = doc_processing(name,id_type,doc_type,f_path)
 #     response = preprocessing.process()
 #     print("response",response)
+# id_type, doc_type, f_path