🖥️🤖 Транскрибация видео и создание субтитров с помощью Whisper, FFmpeg и Python
Следуя нашему пошаговому руководству, вы сможете автоматически транскрибировать аудио и добавлять субтитры к своим видео всего за несколько минут.
Необходимые инструменты: Python, Whisper, FFmpeg.
Настройка рабочего пространства
Создадим рабочую папку
mkdir open-ai-whisper-ffmpeg
Перейдем в папку проекта и создадим виртуальное окружение:
cd open-ai-whisper-ffmpeg
python3 -m venv .venv
source .venv/bin/activate
Установим необходимые пакеты для OpenAI Whisper:
pip install git+https://github.com/m-bain/whisperx.git
Транскрибация видео
Сначала создадим новый файл Python — main.py.
touch main.py
И добавим в него код:
from datetime import timedelta
import os
import whisperx
def transcribe_video(input_video):
    """Transcribe ``input_video`` with WhisperX and write ``subtitles.srt``.

    The audio is transcribed as English with the ``large-v2`` model on the
    CPU, the resulting segments are passed through ``whisperx.align``, and
    every segment with non-blank text becomes one numbered SRT cue. An
    existing ``subtitles.srt`` in the working directory is overwritten.

    Args:
        input_video: Path of the media file handed to ``whisperx.load_audio``.

    Returns:
        The name of the subtitle file that was written (``"subtitles.srt"``).
    """
    device = "cpu"
    compute_type = "float32"
    batch_size = 32

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    transcription = model.transcribe(audio, batch_size=batch_size, language="en")

    align_model, align_metadata = whisperx.load_align_model(
        language_code=transcription["language"], device=device
    )
    aligned = whisperx.align(
        transcription["segments"],
        align_model,
        align_metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    segments = aligned["segments"]

    def format_timestamp(seconds):
        """Return ``seconds`` as an SRT ``HH:MM:SS,mmm`` timestamp."""
        # Integer milliseconds keep the fractional part of the segment time
        # and avoid str(timedelta), which changes shape for long durations.
        total_ms = timedelta(seconds=max(0.0, float(seconds))) // timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1_000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    srt_filename = "subtitles.srt"
    cue_number = 0
    # Mode "w" truncates a file left over from a previous run, so the file is
    # opened once instead of being deleted and re-opened for every segment.
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        for segment in segments:
            text = segment["text"].strip()
            if not text:
                # A cue without text would break the blank-line cue separator.
                continue
            cue_number += 1
            print(text)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            srt_file.write(f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n")
    return srt_filename
def main():
    """Transcribe ``input.mp4`` from the working directory into subtitles."""
    transcribe_video("input.mp4")


# Script entry point: the transcription starts as soon as the file is run.
main()
Давайте рассмотрим, что мы делаем в приведенном выше коде. В этих строках мы импортируем необходимые пакеты для работы: whisperx для загрузки модели Whisper, os для получения пути к файлу субтитров и timedelta для форматирования текста временных меток.
from datetime import timedelta
import os
import whisperx
Здесь мы определили функцию, которая принимает входное видео, загружает модель Whisper large-v2, указывает тип вычислений и настраивает модель на использование CPU вместо GPU.
После этого функция загружает аудио в модель, затем транскрибирует и возвращает текст с временными метками.
def transcribe_video(input_video):
    """Transcribe ``input_video`` as English speech and align the segments.

    Only the first half of the function is shown in this listing; it ends
    once ``segments`` holds the output of ``whisperx.align``.
    """
    device = "cpu"
    compute_type = "float32"
    batch_size = 32

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    transcription = model.transcribe(audio, batch_size=batch_size, language="en")

    align_model, align_metadata = whisperx.load_align_model(
        language_code=transcription["language"], device=device
    )
    aligned = whisperx.align(
        transcription["segments"],
        align_model,
        align_metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    segments = aligned["segments"]
После этого функция проходит по всем полученным моделью результатам, преобразует их в формат .srt и добавляет каждый элемент слова в файл subtitles.srt.
# if srt file exists, delete it
if os.path.exists("subtitles.srt"):
os.remove("subtitles.srt")
for index, segment in enumerate(segments):
startTime = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
endTime = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
text = segment['text']
print(text)
segment = f"{index + 1}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n"
srtFilename = os.path.join(f"subtitles.srt")
with open(srtFilename, 'a', encoding='utf-8') as srtFile:
srtFile.write(segment)
return srtFilename
Добавление субтитров к видео
Далее мы загружаем subtitles.srt в видео с помощью FFmpeg. В итоге получаем следующий скрипт:
from datetime import timedelta
import os
import whisperx
import subprocess
def transcribe_video(input_video):
    """Transcribe ``input_video`` with WhisperX and write ``subtitles.srt``.

    The audio is transcribed as English with the ``large-v2`` model on the
    CPU, the resulting segments are passed through ``whisperx.align``, and
    every segment with non-blank text becomes one numbered SRT cue. An
    existing ``subtitles.srt`` in the working directory is overwritten.

    Args:
        input_video: Path of the media file handed to ``whisperx.load_audio``.

    Returns:
        The name of the subtitle file that was written (``"subtitles.srt"``).
    """
    device = "cpu"
    compute_type = "float32"
    batch_size = 32

    model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(input_video)
    transcription = model.transcribe(audio, batch_size=batch_size, language="en")

    align_model, align_metadata = whisperx.load_align_model(
        language_code=transcription["language"], device=device
    )
    aligned = whisperx.align(
        transcription["segments"],
        align_model,
        align_metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    segments = aligned["segments"]

    def format_timestamp(seconds):
        """Return ``seconds`` as an SRT ``HH:MM:SS,mmm`` timestamp."""
        # Integer milliseconds keep the fractional part of the segment time
        # and avoid str(timedelta), which changes shape for long durations.
        total_ms = timedelta(seconds=max(0.0, float(seconds))) // timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1_000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    srt_filename = "subtitles.srt"
    cue_number = 0
    # Mode "w" truncates a file left over from a previous run, so the file is
    # opened once instead of being deleted and re-opened for every segment.
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        for segment in segments:
            text = segment["text"].strip()
            if not text:
                # A cue without text would break the blank-line cue separator.
                continue
            cue_number += 1
            print(text)
            start_time = format_timestamp(segment["start"])
            end_time = format_timestamp(segment["end"])
            srt_file.write(f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n")
    return srt_filename
def add_srt_to_video(input_video, output_file):
    """Render ``subtitles.srt`` onto ``input_video`` and save ``output_file``.

    FFmpeg is started with an argument list instead of a shell string, so
    paths containing spaces or shell metacharacters reach it unchanged. The
    audio stream is copied and an existing ``output_file`` is overwritten.

    Args:
        input_video: Path of the video passed to FFmpeg as input.
        output_file: Path of the video FFmpeg should write.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    subtitles_file = 'subtitles.srt'
    force_style = (
        "FontName=Arial,FontSize=10,PrimaryColour=&HFFFFFF,"
        "OutlineColour=&H000000,BorderStyle=3,Outline=1,Shadow=1,"
        "Alignment=2,MarginV=10"
    )
    ffmpeg_command = [
        "ffmpeg",
        "-y",  # global option, placed first so it is not a trailing option
        "-i", input_video,
        # The single quotes belong to FFmpeg's filter syntax and stay in place.
        "-vf", f"subtitles={subtitles_file}:force_style='{force_style}'",
        "-c:a", "copy",
        output_file,
    ]
    # check=True surfaces an FFmpeg failure instead of silently ignoring it.
    subprocess.run(ffmpeg_command, check=True)
def main():
    """Transcribe ``input.mp4`` and produce ``output.mp4`` with subtitles."""
    input_video_path = "input.mp4"
    output_file = "output.mp4"

    transcribe_video(input_video_path)
    add_srt_to_video(input_video_path, output_file)


# Script entry point: the pipeline starts as soon as the file is run.
main()