# Video subtitle burner: transcribes a video's audio with Whisper and
# renders the transcript as centred captions onto the video frames.
import whisper
import os
import shutil
import cv2
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm

# Caption rendering defaults; also used to estimate per-character pixel width.
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
class VideoTranscriber:
    """Transcribe a video with Whisper and burn the transcript into the
    frames as centred captions.

    Typical usage:
        vt = VideoTranscriber("base", "input.mp4")
        vt.extract_audio()
        vt.transcribe_video()
        vt.create_video("output.mp4")
    """

    def __init__(self, model_path, video_path):
        # model_path: Whisper model name or checkpoint path (e.g. "base").
        # video_path: path to the input video file.
        self.model = whisper.load_model(model_path)
        self.video_path = video_path
        self.audio_path = ''
        # Each entry is [caption text, first frame index, last frame index].
        self.text_array = []
        self.fps = 0
        self.char_width = 0

    def transcribe_video(self):
        """Run Whisper on the extracted audio and fill self.text_array with
        wrapped caption lines and their frame ranges.

        Needs the audio produced by extract_audio(); if that has not been
        called yet it is invoked automatically (the original crashed with an
        empty self.audio_path in that case).
        """
        print('Transcribing video')
        if not self.audio_path:
            # Robustness fix: extract the audio on demand instead of handing
            # whisper an empty path.
            self.extract_audio()
        result = self.model.transcribe(self.audio_path)
        # Estimate average character width (pixels) from the first segment's
        # text so captions can be wrapped to fit the cropped frame width.
        text = result["segments"][0]["text"]
        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # NOTE(review): hard-coded 16:9 here, while extract_frames() uses the
        # real aspect ratio — kept as-is to preserve the original wrap widths.
        asp = 16 / 9
        ret, frame = cap.read()
        # Width of the centre crop that extract_frames() will keep.
        width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
        # Keep a 10% margin so captions never touch the frame edges.
        width = width - (width * 0.1)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        self.char_width = int(textsize[0] / len(text))
        for segment in tqdm(result["segments"]):
            text = segment["text"]
            end = segment["end"]
            start = segment["start"]
            total_frames = int((end - start) * self.fps)
            start = start * self.fps
            total_chars = len(text)
            words = text.split(" ")
            i = 0
            while i < len(words):
                words[i] = words[i].strip()
                if words[i] == "":
                    i += 1
                    continue
                length_in_pixels = (len(words[i]) + 1) * self.char_width
                remaining_pixels = width - length_in_pixels
                line = words[i]
                # Greedily append words until the line would overflow; the
                # word that does not fit starts the next line.
                while remaining_pixels > 0:
                    i += 1
                    if i >= len(words):
                        break
                    length_in_pixels = (len(words[i]) + 1) * self.char_width
                    remaining_pixels -= length_in_pixels
                    if remaining_pixels < 0:
                        continue
                    else:
                        line += " " + words[i]
                # Allocate frames proportionally to the line's share of the
                # segment's characters; +15 is a fixed display-delay offset.
                line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
                start = int(len(line) / total_chars * total_frames) + int(start)
                self.text_array.append(line_array)
        cap.release()
        print('Transcription complete')

    def extract_audio(self):
        """Write the video's audio track to audio.mp3 next to the video and
        remember its path in self.audio_path."""
        print('Extracting audio')
        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('Audio extracted')

    def extract_frames(self, output_folder):
        """Centre-crop each frame, overlay the caption active at that frame
        index (per self.text_array), and save numbered JPEGs to output_folder."""
        print('Extracting frames')
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = width / height
        N_frames = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
            for entry in self.text_array:
                if entry[1] <= N_frames <= entry[2]:
                    text = entry[0]
                    # Bug fix: measure and draw with the SAME font settings.
                    # The original measured at scale 0.8 but drew at 0.75, so
                    # the caption was centred for the wrong rendered width.
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = int((frame.shape[1] - text_size[0]) / 2)
                    text_y = int(height / 2)
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
            N_frames += 1
        cap.release()
        print('Frames extracted')

    def create_video(self, output_video_path):
        """Render captioned frames, mux in the original audio, write the
        result to output_video_path, then delete the temporary frames folder
        and the extracted audio file."""
        print('Creating video')
        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
        os.makedirs(image_folder, exist_ok=True)
        self.extract_frames(image_folder)
        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        # Numeric sort: "10.jpg" must come after "9.jpg", not after "1.jpg".
        images.sort(key=lambda x: int(x.split(".")[0]))
        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_video_path)
        # Clean up intermediates created by extract_frames/extract_audio.
        shutil.rmtree(image_folder)
        os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
# Example usage — fill in the paths and run the pipeline:
#   transcriber = VideoTranscriber(model_path, video_path)
#   transcriber.extract_audio()
#   transcriber.transcribe_video()
#   transcriber.create_video(output_video_path)
model_path = "base"  # Whisper model name (tiny/base/small/medium/large)
# video_path = "test_videos/videoplayback.mp4"
output_video_path = "output.mp4"
# output_audio_path = "test_videos/audio.mp3"