# Video subtitle burner: transcribes a video's audio with Whisper and
# renders the transcript as centred captions onto the video frames.
import whisper
import os
import shutil
import cv2
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm

# Caption rendering defaults; also used to estimate per-character pixel width.
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
class VideoTranscriber:
    """Transcribe a video with Whisper and burn the transcript into the
    frames as centred captions.

    Typical usage:
        vt = VideoTranscriber("base", "input.mp4")
        vt.extract_audio()
        vt.transcribe_video()
        vt.create_video("output.mp4")
    """

    def __init__(self, model_path, video_path):
        # model_path: Whisper model name or checkpoint path (e.g. "base").
        # video_path: path to the input video file.
        self.model = whisper.load_model(model_path)
        self.video_path = video_path
        self.audio_path = ''
        # Each entry is [caption text, first frame index, last frame index].
        self.text_array = []
        self.fps = 0
        self.char_width = 0

    def transcribe_video(self):
        """Run Whisper on the extracted audio and fill self.text_array with
        wrapped caption lines and their frame ranges.

        Needs the audio produced by extract_audio(); if that has not been
        called yet it is invoked automatically (the original crashed with an
        empty self.audio_path in that case).
        """
        print('Transcribing video')
        if not self.audio_path:
            # Robustness fix: extract the audio on demand instead of handing
            # whisper an empty path.
            self.extract_audio()
        result = self.model.transcribe(self.audio_path)
        # Estimate average character width (pixels) from the first segment's
        # text so captions can be wrapped to fit the cropped frame width.
        text = result["segments"][0]["text"]
        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # NOTE(review): hard-coded 16:9 here, while extract_frames() uses the
        # real aspect ratio — kept as-is to preserve the original wrap widths.
        asp = 16 / 9
        ret, frame = cap.read()
        # Width of the centre crop that extract_frames() will keep.
        width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
        # Keep a 10% margin so captions never touch the frame edges.
        width = width - (width * 0.1)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        self.char_width = int(textsize[0] / len(text))
        for segment in tqdm(result["segments"]):
            text = segment["text"]
            end = segment["end"]
            start = segment["start"]
            total_frames = int((end - start) * self.fps)
            start = start * self.fps
            total_chars = len(text)
            words = text.split(" ")
            i = 0
            while i < len(words):
                words[i] = words[i].strip()
                if words[i] == "":
                    i += 1
                    continue
                length_in_pixels = (len(words[i]) + 1) * self.char_width
                remaining_pixels = width - length_in_pixels
                line = words[i]
                # Greedily append words until the line would overflow; the
                # word that does not fit starts the next line.
                while remaining_pixels > 0:
                    i += 1
                    if i >= len(words):
                        break
                    length_in_pixels = (len(words[i]) + 1) * self.char_width
                    remaining_pixels -= length_in_pixels
                    if remaining_pixels < 0:
                        continue
                    else:
                        line += " " + words[i]
                # Allocate frames proportionally to the line's share of the
                # segment's characters; +15 is a fixed display-delay offset.
                line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
                start = int(len(line) / total_chars * total_frames) + int(start)
                self.text_array.append(line_array)
        cap.release()
        print('Transcription complete')

    def extract_audio(self):
        """Write the video's audio track to audio.mp3 next to the video and
        remember its path in self.audio_path."""
        print('Extracting audio')
        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('Audio extracted')

    def extract_frames(self, output_folder):
        """Centre-crop each frame, overlay the caption active at that frame
        index (per self.text_array), and save numbered JPEGs to output_folder."""
        print('Extracting frames')
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = width / height
        N_frames = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
            for entry in self.text_array:
                if entry[1] <= N_frames <= entry[2]:
                    text = entry[0]
                    # Bug fix: measure and draw with the SAME font settings.
                    # The original measured at scale 0.8 but drew at 0.75, so
                    # the caption was centred for the wrong rendered width.
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = int((frame.shape[1] - text_size[0]) / 2)
                    text_y = int(height / 2)
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
            N_frames += 1
        cap.release()
        print('Frames extracted')

    def create_video(self, output_video_path):
        """Render captioned frames, mux in the original audio, write the
        result to output_video_path, then delete the temporary frames folder
        and the extracted audio file."""
        print('Creating video')
        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
        os.makedirs(image_folder, exist_ok=True)
        self.extract_frames(image_folder)
        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        # Numeric sort: "10.jpg" must come after "9.jpg", not after "1.jpg".
        images.sort(key=lambda x: int(x.split(".")[0]))
        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_video_path)
        # Clean up intermediates created by extract_frames/extract_audio.
        shutil.rmtree(image_folder)
        os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
# Example usage — fill in the paths and run the pipeline:
#   transcriber = VideoTranscriber(model_path, video_path)
#   transcriber.extract_audio()
#   transcriber.transcribe_video()
#   transcriber.create_video(output_video_path)
model_path = "base"  # Whisper model name (tiny/base/small/medium/large)
# video_path = "test_videos/videoplayback.mp4"
output_video_path = "output.mp4"
# output_audio_path = "test_videos/audio.mp3"