First commit
- .gitignore +2 -0
- README.md +45 -14
- agent.py +82 -0
- app.py +85 -0
- configs.py +26 -0
- embeder.py +39 -0
- prompt.py +13 -0
- rag.py +273 -0
- requirements.txt +8 -0
- tools.py +122 -0
- transcriber.py +134 -0
- utils.py +181 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
.env
data/*

README.md
CHANGED
@@ -1,14 +1,45 @@
# Chatbot for Video Question Answering Demo

An AI chatbot that answers questions about video content. The project combines a multi-modal LLM with a multi-modal RAG pipeline to process video frames, transcribe audio, and retrieve the information needed to answer questions about a video accurately.

## Requirements

- Python 3.12+
- [uv](https://docs.astral.sh/uv/) for package and project management
- [FFmpeg](https://ffmpeg.org/) installed and available in PATH
- [Google Gemini API key](https://aistudio.google.com/apikey) for the LLM functionality

## Installation

1. Clone this repository
```bash
git clone [repository-url]
cd VideoChatbot
```

2. Install dependencies using uv
```bash
uv sync
```

3. Create a `.env` file in the project root with your API key
```
GEMINI_API_KEY=your_api_key_here
```

## Usage

1. Start the application
```bash
python app.py
```

2. Access the UI through your browser (typically at http://127.0.0.1:7860)

3. Upload a video file or provide a YouTube URL and ask questions about it

4. The system will process the video (extract frames, transcribe audio), index the content, and then answer your questions

## Notes

This project is designed as a demo and may require additional configuration for production use. Video processing and indexing can take time depending on the video's length and complexity. For better performance and accuracy, use larger LLMs, embedding models, transcription models, and vector databases.

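The model choices mentioned in the note above all live in `configs.py`, so swapping components is an edit to the `Settings` defaults. A sketch of such a swap (the replacement model names are illustrative assumptions, not tested recommendations):

```python
# configs.py -- illustrative edits only; these model names are assumptions
CHATBOT_MODEL: str = 'gemini-2.5-pro'                                   # a larger LLM
TEXT_EMBEDDING_MODEL: str = 'sentence-transformers/all-mpnet-base-v2'   # 768-dim embeddings
IMAGE_EMBEDDING_MODEL: str = 'facebook/dinov2-base'                     # 768-dim embeddings
```

One caveat: the LanceDB schemas in `rag.py` hardcode 384-dimensional vectors to match the default MiniLM and DINOv2-small models, so swapping either embedding model also means updating the `pa.list_(pa.float32(), 384)` fields to the new dimensionality.
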
agent.py
ADDED
@@ -0,0 +1,82 @@
import re
from typing import Generator

from smolagents import ToolCallingAgent, OpenAIServerModel, ActionStep
from PIL import Image

import tools
from configs import settings
from prompt import video_to_text_prompt
from rag import VideoRAG


class VideoChatbot:
    def __init__(
        self,
        model: str = 'gemini-2.0-flash',
        api_base: str = None,
        api_key: str = None
    ):
        self.video_rag = VideoRAG(
            video_frame_rate=settings.VIDEO_EXTRACTION_FRAME_RATE,
            audio_segment_length=settings.AUDIO_SEGMENT_LENGTH,
        )
        self.agent = ToolCallingAgent(
            tools=[
                tools.download_video,
                *tools.create_video_rag_tools(self.video_rag)
            ],
            model=OpenAIServerModel(
                model_id=model,
                api_base=api_base,
                api_key=api_key
            ),
            step_callbacks=[self._step_callback],
        )

    def chat(self, message: str, attachments: list[str] = None) -> Generator:
        """Chats with the bot, including handling attachments (images and videos).

        Args:
            message: The text message to send to the bot.
            attachments: A list of file paths for images or videos to include in the chat.

        Returns:
            A generator yielding step objects representing the bot's responses and actions.
        """
        images = []
        for filepath in attachments or []:
            if filepath.endswith(('.jpg', '.jpeg', '.png')):
                images.append(Image.open(filepath))
            if filepath.endswith('.mp4'):
                message = video_to_text_prompt(filepath) + message

        for step in self.agent.run(
            message,
            stream=True,
            reset=False,
            images=images,
        ):
            yield step

    def clear(self):
        """Clears the chatbot message history and context."""
        self.agent.state.clear()
        self.agent.memory.reset()
        self.agent.monitor.reset()
        self.video_rag.clear()

    def _step_callback(self, step: ActionStep, agent: ToolCallingAgent):
        if step.observations:
            if step.observations_images is None:
                # defensive fix: observations_images may start out as None
                step.observations_images = []
            image_index = 0
            for image_path in re.findall(r'<observation_image>(.*?)</observation_image>', step.observations):
                try:
                    image = Image.open(image_path)
                    step.observations_images.append(image)
                    step.observations = step.observations.replace(image_path, str(image_index))
                    image_index += 1
                except Exception as e:
                    print(f'Error loading image {image_path}: {e}')

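For orientation, a minimal sketch of driving `VideoChatbot` outside the Gradio UI (the video path is hypothetical, and `GEMINI_API_KEY` must be set, mirroring how `app.py` constructs the bot):

```python
import os

from agent import VideoChatbot
from configs import settings

bot = VideoChatbot(
    model=settings.CHATBOT_MODEL,
    api_base=settings.MODEL_BASE_API,
    api_key=os.environ['GEMINI_API_KEY'],
)
# steps stream in as tool calls, action steps, and finally a FinalAnswerStep
for step in bot.chat('What happens in this video?', attachments=['data/clip.mp4']):
    print(type(step).__name__)
```
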
app.py
ADDED
@@ -0,0 +1,85 @@
import os
import shutil

import gradio as gr
from smolagents import ChatMessageToolCall, ActionStep, FinalAnswerStep

from agent import VideoChatbot
from configs import settings


bot = VideoChatbot(
    model=settings.CHATBOT_MODEL,
    api_base=settings.MODEL_BASE_API,
    api_key=os.environ['GEMINI_API_KEY']
)


def chat(message: dict, history: list[dict]):
    # move the uploaded files to the data directory
    message['files'] = [shutil.copy(file, settings.DATA_DIR) for file in message['files']]

    # add the input message to the history
    history.extend([{'role': 'user', 'content': {'path': file}} for file in message['files']])
    history.append({'role': 'user', 'content': message['text']})
    yield history, ''

    for step in bot.chat(message['text'], message['files']):
        match step:
            case ChatMessageToolCall():
                if step.function.name == 'download_video':
                    history.append({
                        'role': 'assistant',
                        'content': f'📥 Downloading video from {step.function.arguments["url"]}'
                    })
                elif step.function.name == 'add_video':
                    history.append({
                        'role': 'assistant',
                        'content': f'🎥 Processing and adding video `{step.function.arguments["filename"]}` '
                                   f'to the knowledge base. This may take a while...'
                    })
                elif step.function.name == 'search_in_video':
                    filename = os.path.basename(bot.video_rag.videos[step.function.arguments["video_id"]]['video_path'])
                    history.append({
                        'role': 'assistant',
                        'content': f'🔍 Searching in video `{filename}` '
                                   f'for query: *{step.function.arguments.get("text_query", step.function.arguments.get("image_query", ""))}*'
                    })
                elif step.function.name == 'final_answer':
                    continue
                yield history, ''
            case ActionStep():
                yield history, ''
            case FinalAnswerStep():
                history.append({'role': 'assistant', 'content': step.output})
                yield history, ''


def clear_chat(history):
    bot.clear()  # reset the agent memory and the RAG index, not just the displayed history
    history.clear()
    return history, gr.update(value='')


def main():
    with gr.Blocks() as demo:
        gr.Markdown('# Video Chatbot Demo')
        gr.Markdown('This demo showcases a video chatbot that can process and search videos using '
                    'RAG (Retrieval-Augmented Generation). You can upload videos/images or link to YouTube videos, '
                    'ask questions, and get answers based on the video content.')
        chatbot = gr.Chatbot(type='messages', label='Video Chatbot', height=800, resizable=True)
        textbox = gr.MultimodalTextbox(
            sources=['upload'],
            file_types=['image', '.mp4'],
            show_label=False,
            placeholder='Type a message or upload an image/video...',
        )
        textbox.submit(chat, [textbox, chatbot], [chatbot, textbox])
        clear = gr.Button('Clear Chat')
        clear.click(clear_chat, [chatbot], [chatbot, textbox])

    demo.launch(debug=True)


if __name__ == '__main__':
    main()

configs.py
ADDED
@@ -0,0 +1,26 @@
from dataclasses import dataclass

from dotenv import load_dotenv

load_dotenv()


@dataclass
class Settings:
    DATA_DIR: str = 'data'
    FFMPEG_PATH: str = 'ffmpeg'
    MAX_VIDEO_RESOLUTION: int = 360
    MAX_VIDEO_FPS: float = 30
    VIDEO_EXTENSION: str = 'mp4'
    VIDEO_EXTRACTION_FRAME_RATE: float = 1.0
    AUDIO_SEGMENT_LENGTH: int = 300
    CHATBOT_MODEL: str = 'gemini-2.0-flash'
    MODEL_BASE_API: str = 'https://generativelanguage.googleapis.com/v1beta/'
    TEXT_EMBEDDING_MODEL: str = 'sentence-transformers/all-MiniLM-L6-v2'
    IMAGE_EMBEDDING_MODEL: str = 'facebook/dinov2-small'


settings = Settings()

embeder.py
ADDED
@@ -0,0 +1,39 @@
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from PIL import Image


class MultimodalEmbedder:
    """A multimodal embedder that supports text and image embeddings."""

    def __init__(
        self,
        text_model: str = 'sentence-transformers/all-MiniLM-L6-v2',
        image_model: str = 'facebook/dinov2-small'
    ):
        self.text_model = SentenceTransformer(text_model)
        self.image_model = pipeline(
            'image-feature-extraction',
            model=image_model,
            device=0 if torch.cuda.is_available() else -1,
            pool=True
        )

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of texts."""
        return self.text_model.encode(
            texts,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            show_progress_bar=True
        ).tolist()

    def embed_images(self, images: list[str | Image.Image]) -> list[list[float]]:
        """Embed a list of images, which can be file paths or PIL Image objects."""
        images = [Image.open(img) if isinstance(img, str) else img for img in images]
        images = [img.convert('RGB') for img in images]

        embeddings = self.image_model(images)

        # with pool=True the pipeline returns each pooled vector wrapped in a batch dimension; unwrap it
        return [emb[0] for emb in embeddings]

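A quick sanity check of the embedder (a sketch; the first run downloads both Hugging Face models):

```python
from embeder import MultimodalEmbedder

embedder = MultimodalEmbedder()
vectors = embedder.embed_texts(['a cat on a sofa', 'a rocket launch'])
print(len(vectors), len(vectors[0]))  # 2 384 -- all-MiniLM-L6-v2 produces 384-dim vectors
```

Both defaults, all-MiniLM-L6-v2 and dinov2-small, emit 384-dimensional vectors, which is exactly the width the LanceDB schemas in `rag.py` expect.
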
prompt.py
ADDED
@@ -0,0 +1,13 @@
import os


def video_to_text_prompt(video_path: str, metadata: dict = None) -> str:
    """Generate a text prompt to represent a video file with its metadata."""
    metadata = metadata or {}
    return f'''<video>
Filename: {os.path.basename(video_path)}
Metadata:
{'\n'.join(f'- {key}: {value}' for key, value in metadata.items())}
</video>
'''

rag.py
ADDED
@@ -0,0 +1,273 @@
import os.path
import uuid

import lancedb
import pyarrow as pa
from PIL import Image
from scipy.spatial import distance
from tqdm import tqdm

import utils
from configs import settings
from embeder import MultimodalEmbedder
from transcriber import AudioTranscriber


class VideoRAG:
    """Video RAG (Retrieval-Augmented Generation) system for processing and searching video content."""

    def __init__(self, video_frame_rate: float = 1, audio_segment_length: int = 300):
        self.video_frame_rate = video_frame_rate
        self.audio_segment_length = audio_segment_length

        print('Loading embedding and audio transcription models...')
        self.embedder = MultimodalEmbedder(
            text_model=settings.TEXT_EMBEDDING_MODEL,
            image_model=settings.IMAGE_EMBEDDING_MODEL,
        )
        self.transcriber = AudioTranscriber()

        # init DB and tables
        self._init_db()

    def _init_db(self):
        print('Initializing LanceDB...')
        self.db = lancedb.connect(f'{settings.DATA_DIR}/vectordb')
        self.frames_table = self.db.create_table('frames', mode='overwrite', schema=pa.schema([
            pa.field('vector', pa.list_(pa.float32(), 384)),
            pa.field('video_id', pa.string()),
            pa.field('frame_index', pa.int32()),
            pa.field('frame_path', pa.string()),
        ]))
        self.transcripts_table = self.db.create_table('transcripts', mode='overwrite', schema=pa.schema([
            pa.field('vector', pa.list_(pa.float32(), 384)),
            pa.field('video_id', pa.string()),
            pa.field('segment_index', pa.int32()),
            pa.field('start', pa.float64()),
            pa.field('end', pa.float64()),
            pa.field('text', pa.string()),
        ]))

        # save video metadata
        self.videos = {}

    def is_video_exists(self, video_id: str) -> bool:
        """Check if a video exists in the RAG system by video ID.

        Args:
            video_id (str): The ID of the video to check.

        Returns:
            bool: True if the video exists, False otherwise.
        """
        return video_id in self.videos

    def get_video(self, video_id: str) -> dict:
        """Retrieve video metadata by video ID.

        Args:
            video_id (str): The ID of the video to retrieve.

        Returns:
            dict: A dictionary containing video metadata, including video path, frame directory, frame rate, and transcript segments.
        """
        if video_id not in self.videos:
            raise ValueError(f'Video with ID {video_id} not found.')
        return self.videos[video_id]

    def add_video(self, video_path: str) -> str:
        """Add a video to the RAG system by processing its frames and transcripts.

        Args:
            video_path (str): The path to the video file to be added.

        Returns:
            str: A unique video ID generated for the added video.
        """
        # create a unique video ID
        video_id = uuid.uuid4().hex[:8]

        print(f'Adding video "{video_path}" with ID {video_id} to the RAG system...')

        print('Extracting video frames')
        # process video frames
        frame_paths = utils.extract_video_frames(video_path, output_dir=f'{video_path}_frames',
                                                 frame_rate=self.video_frame_rate)
        print(f'Computing embeddings for {len(frame_paths)} frames...')
        # calculate embeddings for frames
        frame_embeddings = self.embedder.embed_images(frame_paths)
        # keep only significant frames to reduce the number of stored frames
        frame_indexes = get_significant_frames(frame_embeddings, threshold=0.6)
        # add frames to the database
        self.frames_table.add(
            [{
                'vector': frame_embeddings[i],
                'video_id': video_id,
                'frame_index': i,
                'frame_path': frame_paths[i],
            } for i in frame_indexes]
        )
        print(f'Added {len(frame_indexes)} significant frames to the database.')

        print('Extracting audio from video')
        # transcribe video to text
        audio_path = utils.extract_audio(video_path)
        print('Splitting and transcribing audio...')
        segments = []
        for i, segment_path in tqdm(enumerate(utils.split_media_file(
            audio_path,
            output_dir=f'{video_path}_audio_segments',
            segment_length=self.audio_segment_length
        )), desc='Transcribing audio'):
            for segment in self.transcriber.transcribe(segment_path)['segments']:
                segment['start'] += i * self.audio_segment_length
                segment['end'] += i * self.audio_segment_length
                segments.append(segment)
        segments = sorted(segments, key=lambda s: s['start'])

        print(f'Computing embeddings for {len(segments)} transcript segments...')
        # calculate embeddings for transcripts
        transcript_embeddings = self.embedder.embed_texts([s['text'] for s in segments])
        # add transcripts to the database
        self.transcripts_table.add(
            [{
                'vector': transcript_embeddings[i],
                'video_id': video_id,
                'segment_index': i,
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'],
            } for i, segment in enumerate(segments)],
        )
        print(f'Added {len(segments)} transcript segments to the database.')

        # add video metadata to the database
        self.videos[video_id] = {
            'video_path': video_path,
            'frame_dir': f'{video_path}_frames',
            'video_frame_rate': self.video_frame_rate,
            'transcript_segments': segments,
        }

        print(f'Video "{video_path}" added with ID {video_id}.')
        return video_id

    def search(self, video_id: str, text: str = None, image: str | Image.Image = None, limit: int = 10) -> list[dict]:
        """Search for relevant video frames or transcripts based on text or image input.

        Args:
            video_id (str): The ID of the video to search in.
            text (str, optional): The text query to search for in the video transcripts.
            image (str | Image.Image, optional): The image query to search for in the video frames. If a string is provided, it should be the path to the image file.
            limit (int, optional): The maximum number of results to return. Defaults to 10.

        Returns:
            list[dict]: A list of dictionaries containing the search results, each with start and end times, distance, frame paths, and transcript segments.
        """
        video_metadata = self.get_video(video_id)

        # search for transcripts based on text
        timespans = []
        if text is not None:
            text_embedding = self.embedder.embed_texts([text])[0]
            query = (self.transcripts_table
                     .search(text_embedding)
                     .where(f"video_id = '{video_id}'")
                     .limit(limit))
            for result in query.to_list():
                timespans.append({
                    'start': result['start'],
                    'end': result['end'],
                    'distance': distance.cosine(text_embedding, result['vector']),
                })

        # search for frames based on image
        if image is not None:
            image_embedding = self.embedder.embed_images([image])[0]
            query = (self.frames_table
                     .search(image_embedding)
                     .where(f"video_id = '{video_id}'")
                     .limit(limit))
            for result in query.to_list():
                start = result['frame_index'] / self.video_frame_rate
                timespans.append({
                    'start': start,
                    'end': start + 1,
                    # recompute the cosine distance; the raw distance LanceDB returns here is too large to compare
                    'distance': distance.cosine(image_embedding, result['vector']),
                })

        # merge nearby timespans
        timespans = merge_searched_timespans(timespans, threshold=5)
        # sort timespans by distance
        timespans = sorted(timespans, key=lambda x: x['distance'])
        # limit to k results
        timespans = timespans[:limit]

        for timespan in timespans:
            # extend timespans to at least 5 seconds
            duration = timespan['end'] - timespan['start']
            if duration < 5:
                timespan['start'] = max(0, timespan['start'] - (5 - duration) / 2)
                timespan['end'] = timespan['start'] + 5
            # add frame paths
            timespan['frame_paths'] = []
            for frame_index in range(
                int(timespan['start'] * self.video_frame_rate),
                int(timespan['end'] * self.video_frame_rate)
            ):
                timespan['frame_paths'].append(os.path.join(video_metadata['frame_dir'], f'{frame_index + 1}.jpg'))
            # add transcript segments
            timespan['transcript_segments'] = []
            for segment in video_metadata['transcript_segments']:
                if utils.span_iou((segment['start'], segment['end']),
                                  (timespan['start'], timespan['end'])) > 0:
                    timespan['transcript_segments'].append(segment)

        return timespans

    def clear(self):
        """Clear the RAG system by dropping all tables and resetting video metadata."""
        self._init_db()


def get_significant_frames(frame_embeddings: list[list[float]], threshold: float = 0.8) -> list[int]:
    """Select significant frames by comparing embeddings."""
    selected_frames = []
    current_frame = 0
    for i, embedding in enumerate(frame_embeddings):
        similarity = 1 - distance.cosine(frame_embeddings[current_frame], embedding)
        if similarity < threshold:
            selected_frames.append(current_frame)
            current_frame = i

    selected_frames.append(current_frame)

    return selected_frames


def merge_searched_timespans(timespans: list[dict], threshold: float) -> list[dict]:
    """Merge timespans if the gap between them is less than or equal to the threshold."""
    if not timespans:
        return []

    # sort spans by start time
    sorted_spans = sorted(timespans, key=lambda s: s['start'])

    merged_spans = []
    current_span = sorted_spans[0].copy()

    for next_span in sorted_spans[1:]:
        gap = next_span['start'] - current_span['end']
        if gap <= threshold:
            # extend the current span's end if needed
            current_span['end'] = max(current_span['end'], next_span['end'])
            current_span['distance'] = min(current_span['distance'], next_span['distance'])
        else:
            # no merge: push the current span and start a new one
            merged_spans.append(current_span)
            current_span = next_span.copy()

    # add the last span
    merged_spans.append(current_span)
    return merged_spans

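The module-level helpers are easy to verify in isolation. A hand-worked example of the span merging (importing `rag` pulls in its heavy dependencies, but no models load until `VideoRAG` is instantiated):

```python
from rag import merge_searched_timespans

spans = [
    {'start': 0.0, 'end': 4.0, 'distance': 0.3},
    {'start': 7.0, 'end': 12.0, 'distance': 0.1},   # 3 s gap -> merged with the first
    {'start': 30.0, 'end': 31.0, 'distance': 0.5},  # 18 s gap -> kept separate
]
print(merge_searched_timespans(spans, threshold=5))
# [{'start': 0.0, 'end': 12.0, 'distance': 0.1}, {'start': 30.0, 'end': 31.0, 'distance': 0.5}]
```

Merged spans keep the smaller (better) distance of their members, which is why the first result carries 0.1.
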
requirements.txt
ADDED
@@ -0,0 +1,9 @@
google-genai>=1.22.0
lancedb>=0.24.0
pillow>=10.4.0
python-dotenv>=1.0.0
sentence-transformers>=4.1.0
smolagents[openai]>=1.19.0
tqdm>=4.67.1
transformers>=4.53.0
yt-dlp>=2025.6.25

tools.py
ADDED
@@ -0,0 +1,122 @@
import os

from smolagents import tool, Tool

import utils
from configs import settings
from prompt import video_to_text_prompt
from rag import VideoRAG


@tool
def download_video(url: str) -> str:
    """
    Download a video from YouTube or other supported platforms.

    Args:
        url (str): The URL of the video.

    Returns:
        str: The video information, including the filename.
    """
    try:
        filepath, info = utils.download_video(
            url,
            output_dir=settings.DATA_DIR,
            max_resolution=settings.MAX_VIDEO_RESOLUTION,
            max_fps=settings.MAX_VIDEO_FPS,
            extension=settings.VIDEO_EXTENSION
        )
    except Exception as e:
        return f'Error downloading video: {e.__class__.__name__}: {e}'

    return video_to_text_prompt(
        filepath,
        metadata={
            'URL': url,
            'Title': info.get('title', 'N/A'),
            'Channel': info.get('channel', 'N/A'),
            'Duration': info.get('duration', 'N/A'),
        }
    )


def create_video_rag_tools(video_rag: VideoRAG) -> list[Tool]:

    @tool
    def add_video(filename: str) -> str:
        """
        Add a video file to the RAG knowledge-base for further search and analysis.

        Args:
            filename (str): The video filename to add.

        Returns:
            str: The video ID if added successfully, or an error message.
        """
        try:
            video_id = video_rag.add_video(os.path.join(settings.DATA_DIR, filename))
            return f'Video added with ID: {video_id}'
        except Exception as e:
            return f'Error adding video: {e.__class__.__name__}: {e}'

    @tool
    def search_in_video(video_id: str, text_query: str = None, image_query: str = None) -> str:
        """
        Search for relevant video frames and transcripts based on a text or image query. Allows searching within a specific video added to the RAG knowledge-base.
        At least one of `text_query` or `image_query` must be provided.

        Args:
            video_id (str): The ID of the video to search in. This should be the ID returned by `add_video`.
            text_query (str, optional): The text query to search for in the video transcripts.
            image_query (str, optional): The image query to search for in the video frames. This is the filename of the image.

        Returns:
            str: A message indicating the search results or an error message if the video is not found.
        """
        if not video_rag.is_video_exists(video_id):
            return f'Video with ID "{video_id}" not found in the knowledge-base. Please add the video first using the `add_video` tool.'
        if not text_query and not image_query:
            return 'Please provide at least one of `text_query` or `image_query` to search in the video.'

        try:
            results = video_rag.search(
                video_id=video_id,
                text=text_query,
                image=image_query,
                limit=5
            )
        except Exception as e:
            return f'Error searching in video: {e.__class__.__name__}: {e}'

        if not results:
            return f'No results found for the given query in video ID {video_id}.'

        # build the output message
        output = f'Search results for video ID {video_id}:\n'
        for result in results:
            # include timespans, transcript segments, and frame paths in the output
            timespan_text = f'{utils.seconds_to_hms(int(result["start"]))} - {utils.seconds_to_hms(int(result["end"]))}'
            transcript_texts = []
            for segment in result['transcript_segments']:
                transcript_texts.append(
                    f'- {utils.seconds_to_hms(int(segment["start"]), drop_hours=True)}'
                    f'-{utils.seconds_to_hms(int(segment["end"]), drop_hours=True)}: {segment["text"]}')
            observation_image_texts = []
            for frame_path in result['frame_paths'][::5]:  # take every 5th frame for brevity
                observation_image_texts.append(f'<observation_image>{frame_path}</observation_image>')

            output += f'''<video_segment>
Timespan: {timespan_text}
Transcript:
{'\n'.join(transcript_texts)}
Frame images: {' '.join(observation_image_texts)}
</video_segment>\n'''

        return output

    return [add_video, search_in_video]

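Note the contract here: `search_in_video` wraps each returned frame path in `<observation_image>…</observation_image>` tags, and `VideoChatbot._step_callback` in `agent.py` scans tool observations for exactly those tags, loading each path as a PIL image and replacing it with an image index so the model can reference the attached frames.
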
transcriber.py
ADDED
@@ -0,0 +1,134 @@
import time
from typing import Any

from google import genai
from google.genai import types


class AudioTranscriber:
    """A class to transcribe audio files."""

    SYSTEM_INSTRUCTION = '''You are an advanced audio transcription model. Your task is to accurately transcribe provided audio input into a structured JSON format.

**Output Format Specification:**

Your response MUST be a valid JSON object with the following structure:

```json
{
    "segments": [
        {
            "text": "The transcribed text for the segment.",
            "start": "The start time of the segment in seconds.",
            "end": "The end time of the segment in seconds.",
            "speaker": "The speaker ID for the segment."
        }
    ],
    "language": "The language of the transcribed text in ISO 639-1 format."
}
```

**Detailed Instructions and Rules:**

1. Segments:
    - A "segment" is defined as a continuous section of speech from a single speaker that may include multiple sentences or phrases.
    - Each segment object MUST contain `text`, `start`, `end`, and `speaker` fields.
    - `text`: The verbatim transcription of the speech within that segment.
    - `start`: The precise start time of the segment in seconds, represented as a floating-point number (e.g., 0.0, 5.25).
    - `end`: The precise end time of the segment in seconds, represented as a floating-point number (e.g., 4.9, 10.12).
    - `speaker`: An integer representing the speaker ID.
        + Speaker IDs start at `0` for the first detected speaker.
        + The speaker ID MUST increment by 1 each time a new, distinct speaker is identified in the audio. Do not reuse speaker IDs within the same transcription.
        + If the same speaker talks again after another speaker, they retain their original speaker ID.
        + **Segment Splitting Rule**: A segment for the same speaker should only be split if there is a period of silence lasting more than 5 seconds. Otherwise, continuous speech from the same speaker, even with short pauses, should remain within a single segment.

2. Language:
    - `language`: A two-letter ISO 639-1 code representing the primary language of the transcribed text (e.g., "en" for English, "es" for Spanish, "fr" for French).
    - If multiple languages are detected in the audio, you MUST select and output only the ISO 639-1 code for the primary language used throughout the audio.
'''

    RESPONSE_SCHEMA = {
        'type': 'object',
        'properties': {
            'segments': {
                'type': 'array',
                'description': 'A list of transcribed segments from the audio file.',
                'items': {
                    'type': 'object',
                    'properties': {
                        'text': {
                            'type': 'string',
                            'description': 'The transcribed text for the segment.'
                        },
                        'start': {
                            'type': 'number',
                            'description': 'The start time of the segment in seconds.'
                        },
                        'end': {
                            'type': 'number',
                            'description': 'The end time of the segment in seconds.'
                        },
                        'speaker': {
                            'type': 'integer',
                            'description': 'The speaker ID for the segment.'
                        }
                    },
                    'required': ['text', 'start', 'end', 'speaker'],
                    'propertyOrdering': ['text', 'start', 'end', 'speaker']
                },
            },
            'language': {
                'type': 'string',
                'description': 'The language of the transcribed text in ISO 639-1 format.',
            }
        },
        'required': ['segments', 'language'],
        'propertyOrdering': ['segments', 'language']
    }

    def __init__(self, model: str = 'gemini-2.0-flash', api_key: str = None):
        self.model = model
        self.client = genai.Client(api_key=api_key)

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe an audio file from the given path.

        Args:
            audio_path (str): The path to the audio file to be transcribed.

        Returns:
            dict[str, Any]: The transcription result.
            ```{
                "segments": [
                    {
                        "text": "Transcribed text",
                        "start": 0.0,
                        "end": 5.0,
                        "speaker": 0
                    }
                ],
                "language": "en"
            }```
        """
        uploaded_file = self.client.files.upload(file=audio_path)
        while uploaded_file.state != 'ACTIVE':
            time.sleep(1)
            uploaded_file = self.client.files.get(name=uploaded_file.name)
            if uploaded_file.state == 'FAILED':
                raise ValueError('Failed to upload the audio file')

        response = self.client.models.generate_content(
            model=self.model,
            contents=uploaded_file,
            config=types.GenerateContentConfig(
                system_instruction=self.SYSTEM_INSTRUCTION,
                temperature=0.2,
                response_mime_type='application/json',
                response_schema=self.RESPONSE_SCHEMA,
            )
        )

        if response.parsed is None:
            raise ValueError('Failed to transcribe the audio file')

        return response.parsed  # type: ignore

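Usage is a single call; a sketch (the audio path is hypothetical, and with `api_key=None` the `genai.Client` is expected to pick up the key from the environment):

```python
from transcriber import AudioTranscriber

transcriber = AudioTranscriber()
result = transcriber.transcribe('data/segment_000.m4a')  # hypothetical file
for seg in result['segments']:
    print(f"[{seg['start']:.1f}-{seg['end']:.1f}] speaker {seg['speaker']}: {seg['text']}")
```
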
utils.py
ADDED
@@ -0,0 +1,181 @@
import glob
import os.path
import subprocess

from yt_dlp import YoutubeDL

from configs import settings


def download_video(
    url: str,
    output_dir: str = None,
    max_resolution: int = 1080,
    max_fps: float = 60,
    extension: str = 'mp4'
) -> tuple[str, dict]:
    """Download a video from YouTube or other supported sites. Returns the file path and video metadata.

    Args:
        url (str): The URL of the video.
        output_dir (str, optional): Directory to save the downloaded video. Defaults to current directory.
        max_resolution (int, optional): Maximum resolution of the video to download. Defaults to 1080.
        max_fps (float, optional): Maximum frames per second of the video to download. Defaults to 60.
        extension (str, optional): File extension for the downloaded video. Defaults to 'mp4'.

    Returns:
        tuple[str, dict]: A tuple containing the path to the downloaded video file and its metadata.
    """
    ydl_opts = {
        'format': f'bestvideo[height<={max_resolution}][fps<={max_fps}][ext={extension}]+'
                  f'bestaudio/best[height<={max_resolution}][fps<={max_fps}][ext={extension}]/best',
        'merge_output_format': extension,
        'outtmpl': f'{output_dir or "."}/%(title)s.%(ext)s',
        'noplaylist': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        # extract_info with download=True already downloads the file
        info = ydl.extract_info(url, download=True)
        # prepare_filename renders the full outtmpl, which already includes output_dir
        output_path = ydl.prepare_filename(info)

    return output_path, info


def extract_video_frames(video_path: str, output_dir: str, frame_rate: float = 1, extension: str = 'jpg') -> list[str]:
    """Extract frames from a video file at a specified frame rate.

    Args:
        video_path (str): Path to the video file.
        output_dir (str): Directory to save the extracted frames.
        frame_rate (float, optional): Frame rate for extraction. Defaults to 1 frame per second.
        extension (str, optional): File extension for the extracted frames. Defaults to 'jpg'.

    Returns:
        list[str]: A sorted list of paths to the extracted frame images.
    """
    os.makedirs(output_dir, exist_ok=True)

    subprocess.run(
        [
            settings.FFMPEG_PATH,
            # '-v', 'quiet',
            '-i', video_path,
            '-vf', f'fps={frame_rate}',
            '-y',
            f'{output_dir or "."}/%d.{extension}'
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    # get all extracted frames, sorted by frame number
    results = sorted(glob.glob(f'{output_dir or "."}/*.{extension}'),
                     key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    if not results:
        raise FileNotFoundError(f'No frames found in "{output_dir}" for video "{video_path}"')

    return results


def extract_audio(video_path: str, output_dir: str = None, extension: str = 'm4a') -> str:
    """Extract audio from a video file and save it as an audio file (M4A by default).

    Args:
        video_path (str): Path to the video file.
        output_dir (str, optional): Directory to save the extracted audio. Defaults to the same directory as the video.
        extension (str, optional): File extension for the extracted audio. Defaults to 'm4a'.

    Returns:
        str: Path to the extracted audio file.
    """
    if output_dir is None:
        output_dir = os.path.dirname(video_path)

    audio_path = os.path.join(output_dir, f'{os.path.splitext(os.path.basename(video_path))[0]}.{extension}')

    subprocess.run(
        [
            settings.FFMPEG_PATH,
            '-i', video_path,
            '-q:a', '0',
            '-map', 'a',
            '-y',
            audio_path
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    if not os.path.exists(audio_path):
        raise FileNotFoundError(f'Audio extraction failed: "{audio_path}" does not exist.')

    return audio_path


def split_media_file(file_path: str, output_dir: str, segment_length: int = 60) -> list[str]:
    """Split a media file into segments of a specified length in seconds.

    Args:
        file_path (str): Path to the media file to be split.
        output_dir (str): Directory to save the split segments.
        segment_length (int, optional): Length of each segment in seconds. Defaults to 60 seconds.

    Returns:
        list[str]: A sorted list of paths to the split media segments.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    # strip the leading dot so the segment pattern has a single separator
    extension = os.path.splitext(file_path)[1].lstrip('.')
    segment_pattern = os.path.join(output_dir, f'{base_name}_%03d.{extension}')

    subprocess.run(
        [
            settings.FFMPEG_PATH,
            '-i', file_path,
            '-c', 'copy',
            '-map', '0',
            '-segment_time', str(segment_length),
            '-f', 'segment',
            '-y',
            segment_pattern
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    return sorted(glob.glob(f'{output_dir}/{base_name}_*.{extension}'))


def span_iou(span1: tuple[float, float], span2: tuple[float, float]) -> float:
    """Calculate the Intersection over Union (IoU) of two spans."""
    start1, end1 = span1
    start2, end2 = span2

    intersection_start = max(start1, start2)
    intersection_end = min(end1, end2)

    if intersection_start >= intersection_end:
        return 0.0  # no overlap

    intersection_length = intersection_end - intersection_start
    union_length = (end1 - start1) + (end2 - start2) - intersection_length

    return intersection_length / union_length if union_length > 0 else 0.0


def seconds_to_hms(total_seconds: int, drop_hours: bool = False) -> str:
    """Convert a number of seconds to a string formatted as HH:MM:SS."""
    # ensure we're working with non-negative integers
    if total_seconds < 0:
        raise ValueError('total_seconds must be non-negative')

    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    if drop_hours and hours == 0:
        return f'{minutes:02d}:{seconds:02d}'

    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'
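
`span_iou` is the predicate `rag.search` uses to attach transcript segments to a retrieved timespan; a worked example:

```python
from utils import span_iou

# spans (0, 10) and (5, 20): 5 s intersection over a 20 s union -> 0.25
print(span_iou((0, 10), (5, 20)))  # 0.25
print(span_iou((0, 5), (10, 20)))  # 0.0 -- disjoint spans
```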