zain1133604 committed on
Commit
8a7566d
·
1 Parent(s): 46e5a8a

first commit

Files changed (6)
  1. Dockerfile +42 -0
  2. __pycache__/app.cpython-311.pyc +0 -0
  3. app.py +243 -0
  4. requirements.txt +49 -0
  5. segformer_b1.onnx +3 -0
  6. yolov8n.pt +3 -0
Dockerfile ADDED
@@ -0,0 +1,42 @@
+ # ==============================================
+ # 🚀 SMART CCTV — Dockerfile
+ # ==============================================
+
+ # ---- Base Image ----
+ # (CUDA-enabled PyTorch runtime image for GPU support; change to "python:3.11-slim" if CPU only)
+ FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime
+
+ # ---- System Setup ----
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # Install system-level deps (for OpenCV, ffmpeg, etc.)
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     libgl1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # ---- Working Directory ----
+ WORKDIR /app
+
+ # ---- Copy Requirements ----
+ COPY requirements.txt .
+
+ # ---- Install Python Dependencies ----
+ RUN pip install --upgrade pip \
+     && pip install --no-cache-dir -r requirements.txt
+
+ # ---- Copy App Code ----
+ COPY . .
+
+ # ---- Environment Variables ----
+ ENV TRANSFORMERS_CACHE=/tmp/hf_cache
+ RUN mkdir -p /tmp/hf_cache
+
+ # ---- Expose Port ----
+ EXPOSE 7860
+
+ # ---- Command to Run the App ----
+ # (You can change --host 0.0.0.0 --port 7860 to whatever you like)
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "120"]
__pycache__/app.cpython-311.pyc ADDED
Binary file (11.8 kB).
 
app.py ADDED
@@ -0,0 +1,243 @@
+ import torch
+ import cv2
+ import numpy as np
+ import os
+ import tempfile
+ import shutil
+ from ultralytics import YOLO
+ # --- UPDATED IMPORTS for SegFormer PyTorch model ---
+ from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
+ from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor
+ from fastapi import FastAPI, UploadFile, File
+ from fastapi.responses import FileResponse, JSONResponse
+ from starlette.background import BackgroundTask
+
+ # --- Configuration ---
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"⚙️ Using device: {DEVICE}")
+
+ # Set a writable directory for model caches on Hugging Face Spaces
+ os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
+ os.makedirs('/tmp/hf_cache', exist_ok=True)
+
+ # --- 1️⃣ Load Models ---
+
+ # Semantic Segmentation (SegFormer PyTorch) <-- CHANGE HERE
+ SEG_MODEL_NAME = "nvidia/segformer-b1-finetuned-ade-512-512"
+ seg_processor = SegformerImageProcessor.from_pretrained(SEG_MODEL_NAME)
+ seg_model = SegformerForSemanticSegmentation.from_pretrained(
+     SEG_MODEL_NAME
+ ).to(DEVICE).eval()
+
+ # Object Detection + Tracking (YOLOv8 + ByteTrack)
+ detector = YOLO("yolov8n.pt")
+
+ # Behavior Recognition (VideoMAE)
+ act_processor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+ act_model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics").to(DEVICE).eval()
+ ACTION_CLIP_LEN = 16
+
+ # This dictionary stores the last predicted label for each ID
+ id_last_labels = {}
+
+ # -------------------------------
+ # 2️⃣ FastAPI Setup
+ # -------------------------------
+ app = FastAPI(title="Smart CCTV Video Processor")
+
+ # -------------------------------
+ # 3️⃣ Utility Functions
+ # -------------------------------
+
+ def cleanup_task(dir_path):
+     """Utility to create a BackgroundTask for deleting the temp directory."""
+     # Ignore errors (like PermissionError on Windows) during cleanup
+     return BackgroundTask(shutil.rmtree, dir_path, ignore_errors=True)
+
+ # --- REMOVED: preprocess_segformer() and run_segformer_onnx() ---
+ # They are replaced by the functions below using the HuggingFace SegFormer pipeline.
+
+ def run_segformer_pytorch(frame_rgb: np.ndarray, original_shape) -> np.ndarray:
+     """
+     Runs PyTorch SegFormer inference and returns the human mask.
+     This replaces the ONNX code.
+     """
+     # 1. Preprocess
+     # The processor handles resizing, normalization, and tensor conversion automatically
+     inputs = seg_processor(images=frame_rgb, return_tensors="pt").to(DEVICE)
+
+     # 2. Inference
+     with torch.no_grad():
+         seg_logits = seg_model(**inputs).logits
+
+     # 3. Post-process
+     # Resize logits to original image size
+     seg_logits = torch.nn.functional.interpolate(
+         seg_logits,
+         size=(original_shape[0], original_shape[1]),  # H, W
+         mode='bilinear',
+         align_corners=False
+     )
+
+     pred_mask = torch.argmax(seg_logits, dim=1).squeeze().cpu().numpy()
+
+     # ADE20K class 12 = person
+     human_mask = (pred_mask == 12).astype(np.uint8)
+
+     return human_mask
+
+ def run_action_recognition(clip_buffer: list) -> str:
+     """Runs VideoMAE on a list of RGB frames (clip_buffer)."""
+     # Resize all frames to 224x224 (VideoMAE input requirement)
+     clip_resized = [cv2.resize(f, (224, 224), interpolation=cv2.INTER_LINEAR) for f in clip_buffer]
+
+     # Convert to tensor input for VideoMAE
+     inputs = act_processor(clip_resized, return_tensors="pt").to(DEVICE)
+
+     with torch.no_grad():
+         outputs = act_model(**inputs)
+     pred = outputs.logits.argmax(-1).item()
+     label = act_model.config.id2label.get(pred, "Unknown Action")
+     return label
+
+ # -------------------------------
+ # 4️⃣ Main API Endpoint
+ # -------------------------------
+
+ @app.get("/")
+ async def root():
+     return {"message": "Smart CCTV Backend: Upload a video to /process-video/"}
+
+ @app.post("/process-video/")
+ async def process_video_endpoint(file: UploadFile = File(...)):
+     global id_last_labels
+
+     if not file.content_type.startswith('video/'):
+         await file.close()
+         return JSONResponse(status_code=400, content={"error": "Invalid file type. Only video files are supported."})
+
+     # MANUAL TEMP DIR SETUP: Fixes PermissionError by controlling cleanup
+     tmpdir = tempfile.mkdtemp()
+     input_path = os.path.join(tmpdir, file.filename)
+     output_path = os.path.join(tmpdir, "smart_cctv_output.mp4")
+
+     cap, out = None, None
+     try:
+         # Save the uploaded file to the temporary directory
+         with open(input_path, "wb") as buffer:
+             # IMPORTANT: Stream the file content to the buffer
+             while content := await file.read(1024 * 1024):  # Read in chunks (1MB)
+                 buffer.write(content)
+
+         await file.close()
+
+         cap = cv2.VideoCapture(input_path)
+
+         if not cap.isOpened():
+             return JSONResponse(status_code=500, content={"error": "Could not open video file."})
+
+         # Video properties
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+         if width <= 0 or height <= 0:
+             return JSONResponse(status_code=500, content={"error": "Invalid video dimensions."})
+
+         # Output Video Setup
+         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+         # Buffers for Per-ID Action Clips
+         id_clip_buffers = {}
+         id_last_labels.clear()
+         action_clip_len = ACTION_CLIP_LEN
+
+         # Process Each Frame
+         frame_count = 0
+         while cap.isOpened():
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             original_shape = frame.shape
+
+             # --- Segmentation (PyTorch) --- <-- CHANGE HERE
+             human_mask = run_segformer_pytorch(frame_rgb, original_shape)
+
+             # --- Detection + Tracking (YOLO + ByteTrack) ---
+             results = detector.track(frame, persist=True, tracker="bytetrack.yaml", verbose=False)
+             boxes = results[0].boxes.xyxy.cpu().numpy() if results[0].boxes is not None else []
+             ids = results[0].boxes.id.cpu().numpy() if results[0].boxes.id is not None else []
+
+             # --- For each tracked person ---
+             for box, track_id in zip(boxes, ids):
+                 x1, y1, x2, y2 = map(int, box)
+                 track_id = int(track_id)
+
+                 # Crop with a slight buffer
+                 buffer_px = 5
+                 y1 = max(0, y1 - buffer_px)
+                 y2 = min(height, y2 + buffer_px)
+                 x1 = max(0, x1 - buffer_px)
+                 x2 = min(width, x2 + buffer_px)
+
+                 person_crop = frame_rgb[y1:y2, x1:x2]
+
+                 if person_crop.size == 0:
+                     continue
+
+                 # Maintain a clip buffer per person ID
+                 id_clip_buffers.setdefault(track_id, [])
+                 id_clip_buffers[track_id].append(person_crop)
+
+                 # Keep only latest N frames
+                 if len(id_clip_buffers[track_id]) > action_clip_len:
+                     id_clip_buffers[track_id] = id_clip_buffers[track_id][-action_clip_len:]
+
+                 # --- Action Recognition when clip ready ---
+                 label = id_last_labels.get(track_id, "Analyzing...")
+
+                 if len(id_clip_buffers[track_id]) == action_clip_len:
+                     # Run inference every N frames to save time/resources
+                     if frame_count % (action_clip_len // 2) == 0:
+                         label = run_action_recognition(id_clip_buffers[track_id])
+                         id_last_labels[track_id] = label
+
+                 # --- Draw box + label ---
+                 color = (0, 255, 0)
+                 cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
+                 cv2.putText(frame, f"ID {track_id}: {label}", (x1, y1 - 10),
+                             cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
+
+             # --- Segmentation Overlay ---
+             # NOTE: We no longer need to resize the mask here as the PyTorch function handles interpolation
+             mask_colored = cv2.applyColorMap((human_mask * 255).astype(np.uint8), cv2.COLORMAP_JET)
+
+             # Blend safely
+             overlay = cv2.addWeighted(frame, 0.7, mask_colored, 0.3, 0)
+
+             out.write(overlay)
+             frame_count += 1
+
+         # --- Cleanup (Before Return) ---
+         cap.release()
+         out.release()
+
+         # --- Return the processed video file ---
+         return FileResponse(
+             path=output_path,
+             media_type='video/mp4',
+             filename=f"processed_{os.path.basename(input_path)}",
+             # Use background task to delete the temporary folder AFTER the response is sent
+             background=cleanup_task(tmpdir)
+         )
+
+     except Exception as e:
+         # Ensure cleanup on failure
+         if cap: cap.release()
+         if out: out.release()
+         # Clean up the manually created temp directory
+         shutil.rmtree(tmpdir, ignore_errors=True)
+         # Re-raise the exception for FastAPI/Uvicorn to handle
+         raise e
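For reference, a minimal client sketch for the /process-video/ endpoint above. It assumes the service is reachable on the port exposed in the Dockerfile (7860) and uses the requests package pinned in requirements.txt; the host URL and the local file name sample.mp4 are placeholders, not part of the commit.

import requests

# Assumed local URL; adjust host/port to wherever the container is running.
BASE_URL = "http://localhost:7860"

# Upload a video as multipart form data; the form field name "file"
# matches the UploadFile parameter of process_video_endpoint in app.py.
with open("sample.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/process-video/",
        files={"file": ("sample.mp4", f, "video/mp4")},
        timeout=600,  # processing a whole video can take several minutes
    )

resp.raise_for_status()

# The endpoint returns the annotated MP4 written by cv2.VideoWriter.
with open("processed_sample.mp4", "wb") as out_file:
    out_file.write(resp.content)
print("Saved processed_sample.mp4")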
requirements.txt ADDED
@@ -0,0 +1,49 @@
+ # ==========================
+ # 🧠 CORE DEEP LEARNING
+ # ==========================
+ # The +cu121 builds below are hosted on the PyTorch CUDA wheel index, not PyPI
+ --extra-index-url https://download.pytorch.org/whl/cu121
+ torch==2.5.1+cu121
+ torchvision==0.20.1+cu121
+ torchaudio==2.5.1+cu121
+
+ # ==========================
+ # 🤖 TRANSFORMERS + HF HUB
+ # ==========================
+ transformers==4.48.3
+ huggingface-hub==0.34.4
+ timm==0.8.13.dev0
+ safetensors==0.6.2
+
+ # ==========================
+ # 🧩 YOLOv8 + ByteTrack
+ # ==========================
+ ultralytics==8.3.183
+ lap==0.5.12
+ lapx==0.5.11.post1
+ shapely==2.1.2
+
+ # ==========================
+ # 🎥 VISION + UTILITIES
+ # ==========================
+ opencv-python==4.12.0.88
+ numpy==2.1.2
+ tqdm==4.67.1
+ pillow==11.0.0
+ scipy==1.16.1
+ imageio==2.37.0
+
+ # ==========================
+ # 🌐 FASTAPI BACKEND
+ # ==========================
+ fastapi==0.116.1
+ uvicorn==0.35.0
+ starlette==0.47.2
+ python-multipart==0.0.20
+ pydantic==2.11.7
+ pydantic-core==2.33.2
+ typing-extensions==4.14.1
+
+ # ==========================
+ # ⚙️ MISC (Safe to include)
+ # ==========================
+ requests==2.32.5
+ orjson==3.11.1
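A quick sanity-check sketch for an environment built from these pins; it mirrors the DEVICE selection in app.py and only prints versions, assuming the packages installed cleanly.

import torch
import transformers
import ultralytics
import cv2

# Same fallback logic as app.py: use CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("torch:", torch.__version__, "| device:", device)
print("transformers:", transformers.__version__)
print("ultralytics:", ultralytics.__version__)
print("opencv:", cv2.__version__)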
segformer_b1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5df6ae360ad1ab7f3085f3a510522f25ae1b6c4a2f4479a40852cb5be64305b4
+ size 55144053
yolov8n.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f59b3d833e2ff32e194b5bb8e08d211dc7c5bdf144b90d2c8412c47ccfc83b36
+ size 6549796