Y Phung Nguyen committed on
Commit
5096447
·
1 Parent(s): 0cd2df1

Update MCP ASR & TTS

Browse files
Files changed (2) hide show
  1. agent.py +161 -13
  2. voice.py +63 -18
agent.py CHANGED
@@ -16,10 +16,14 @@ from pathlib import Path
16
 
17
  # MCP imports
18
  try:
 
 
 
 
19
  from mcp import types as mcp_types
20
- from mcp.server import Server, NotificationOptions
21
- from mcp.types import Tool, TextContent, ImageContent, EmbeddedResource
22
  from mcp.server.models import InitializationOptions
 
23
  except ImportError:
24
  print("Error: MCP SDK not installed. Install with: pip install mcp", file=sys.stderr)
25
  sys.exit(1)
@@ -60,8 +64,8 @@ GEMINI_MAX_FILES = int(os.environ.get("GEMINI_MAX_FILES", "10"))
60
  GEMINI_MAX_TOTAL_FILE_SIZE = int(os.environ.get("GEMINI_MAX_TOTAL_FILE_SIZE", "50")) # MB
61
  GEMINI_TEMPERATURE = float(os.environ.get("GEMINI_TEMPERATURE", "0.2"))
62
 
63
- # Create MCP server
64
- app = Server("gemini-mcp-server")
65
 
66
  def decode_base64_file(content: str, mime_type: str = None) -> bytes:
67
  """Decode base64 encoded file content"""
@@ -117,7 +121,7 @@ def prepare_gemini_files(files: list) -> list:
117
 
118
  return gemini_parts
119
 
120
- @app.list_tools()
121
  async def list_tools() -> list[Tool]:
122
  """List available tools"""
123
  try:
@@ -159,6 +163,46 @@ async def list_tools() -> list[Tool]:
159
  },
160
  "required": ["user_prompt"]
161
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  )
163
  ]
164
  return tools
@@ -166,7 +210,7 @@ async def list_tools() -> list[Tool]:
166
  logger.error(f"Error in list_tools(): {e}")
167
  raise
168
 
169
- @app.call_tool()
170
  async def call_tool(name: str, arguments: dict) -> Sequence[TextContent | ImageContent | EmbeddedResource]:
171
  """Handle tool calls"""
172
  logger.info(f"🔵 MCP tool call received: {name}")
@@ -277,6 +321,111 @@ async def call_tool(name: str, arguments: dict) -> Sequence[TextContent | ImageC
277
  except Exception as e:
278
  logger.error(f"Error in generate_content: {e}")
279
  return [TextContent(type="text", text=f"Error: {str(e)}")]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  else:
281
  return [TextContent(type="text", text=f"Unknown tool: {name}")]
282
 
@@ -289,21 +438,20 @@ async def main():
289
  logger.info(f"Default Lite Model: {GEMINI_MODEL_LITE}")
290
  logger.info("=" * 60)
291
 
292
- # Use stdio_server from mcp.server.stdio
293
- from mcp.server.stdio import stdio_server
294
-
295
  # Keep logging enabled for debugging
296
  original_root_level = logging.getLogger("root").level
297
  logging.getLogger("root").setLevel(logging.INFO)
298
 
299
  try:
 
 
300
  async with stdio_server() as streams:
301
  # Prepare server capabilities for initialization
302
  try:
303
- if hasattr(app, "get_capabilities"):
304
  notification_options = NotificationOptions()
305
  experimental_capabilities: dict[str, dict[str, Any]] = {}
306
- server_capabilities = app.get_capabilities(
307
  notification_options=notification_options,
308
  experimental_capabilities=experimental_capabilities,
309
  )
@@ -322,13 +470,13 @@ async def main():
322
  logger.info("MCP server ready")
323
  try:
324
  # Run the server - it will automatically handle the initialization handshake
325
- await app.run(
326
  read_stream=streams[0],
327
  write_stream=streams[1],
328
  initialization_options=init_options,
329
  )
330
  except Exception as run_error:
331
- logger.error(f"Error in app.run(): {run_error}")
332
  raise
333
  except Exception as e:
334
  logging.getLogger("root").setLevel(original_root_level)
 
16
 
17
  # MCP imports
18
  try:
19
+ from mcp.server import Server
20
+ from mcp.types import Tool, TextContent
21
+ import mcp.server.stdio
22
+ # Additional imports needed for server functionality
23
  from mcp import types as mcp_types
24
+ from mcp.types import ImageContent, EmbeddedResource
 
25
  from mcp.server.models import InitializationOptions
26
+ from mcp.server import NotificationOptions
27
  except ImportError:
28
  print("Error: MCP SDK not installed. Install with: pip install mcp", file=sys.stderr)
29
  sys.exit(1)
 
64
  GEMINI_MAX_TOTAL_FILE_SIZE = int(os.environ.get("GEMINI_MAX_TOTAL_FILE_SIZE", "50")) # MB
65
  GEMINI_TEMPERATURE = float(os.environ.get("GEMINI_TEMPERATURE", "0.2"))
66
 
67
+ # Initialize MCP server
68
+ server = Server("gemini-mcp-server")
69
 
70
  def decode_base64_file(content: str, mime_type: str = None) -> bytes:
71
  """Decode base64 encoded file content"""
 
121
 
122
  return gemini_parts
123
 
124
+ @server.list_tools()
125
  async def list_tools() -> list[Tool]:
126
  """List available tools"""
127
  try:
 
163
  },
164
  "required": ["user_prompt"]
165
  }
166
+ ),
167
+ Tool(
168
+ name="transcribe_audio",
169
+ description="Transcribe audio file to text using Gemini AI. Supports various audio formats (WAV, MP3, M4A, etc.).",
170
+ inputSchema={
171
+ "type": "object",
172
+ "properties": {
173
+ "audio_path": {
174
+ "type": "string",
175
+ "description": "Path to audio file to transcribe (required)"
176
+ },
177
+ "language": {
178
+ "type": "string",
179
+ "description": "Language code (optional, defaults to auto-detect)"
180
+ }
181
+ },
182
+ "required": ["audio_path"]
183
+ }
184
+ ),
185
+ Tool(
186
+ name="text_to_speech",
187
+ description="Convert text to speech audio using Gemini AI. Returns path to generated audio file.",
188
+ inputSchema={
189
+ "type": "object",
190
+ "properties": {
191
+ "text": {
192
+ "type": "string",
193
+ "description": "Text to convert to speech (required)"
194
+ },
195
+ "language": {
196
+ "type": "string",
197
+ "description": "Language code (optional, defaults to 'en')"
198
+ },
199
+ "voice": {
200
+ "type": "string",
201
+ "description": "Voice selection (optional)"
202
+ }
203
+ },
204
+ "required": ["text"]
205
+ }
206
  )
207
  ]
208
  return tools
 
210
  logger.error(f"Error in list_tools(): {e}")
211
  raise
212
 
213
+ @server.call_tool()
214
  async def call_tool(name: str, arguments: dict) -> Sequence[TextContent | ImageContent | EmbeddedResource]:
215
  """Handle tool calls"""
216
  logger.info(f"🔵 MCP tool call received: {name}")
 
321
  except Exception as e:
322
  logger.error(f"Error in generate_content: {e}")
323
  return [TextContent(type="text", text=f"Error: {str(e)}")]
324
+ elif name == "transcribe_audio":
325
+ try:
326
+ audio_path = arguments.get("audio_path")
327
+ if not audio_path:
328
+ logger.error("❌ audio_path is required but missing")
329
+ return [TextContent(type="text", text="Error: audio_path is required")]
330
+
331
+ language = arguments.get("language", "auto")
332
+
333
+ # Check if file exists
334
+ if not os.path.exists(audio_path):
335
+ logger.error(f"❌ Audio file not found: {audio_path}")
336
+ return [TextContent(type="text", text=f"Error: Audio file not found: {audio_path}")]
337
+
338
+ # Use Gemini to transcribe audio
339
+ system_prompt = "You are a professional transcription service. Provide accurate, well-formatted transcripts."
340
+ user_prompt = "Please transcribe this audio file. Include speaker identification if multiple speakers are present, and format it with proper punctuation and paragraphs, remove mumble, ignore non-verbal noises."
341
+
342
+ files = [{"path": os.path.abspath(audio_path)}]
343
+
344
+ try:
345
+ generation_config = {
346
+ "temperature": 0.2,
347
+ "max_output_tokens": GEMINI_MAX_OUTPUT_TOKENS
348
+ }
349
+
350
+ timeout_seconds = min(GEMINI_TIMEOUT / 1000.0, 20.0)
351
+ logger.info(f"🔵 Transcribing audio with Gemini API, timeout={timeout_seconds}s...")
352
+
353
+ gemini_contents = [f"{system_prompt}\n\n{user_prompt}"]
354
+ file_parts = prepare_gemini_files(files)
355
+ for file_part in file_parts:
356
+ gemini_contents.append({
357
+ "inline_data": {
358
+ "mime_type": file_part["mime_type"],
359
+ "data": base64.b64encode(file_part["data"]).decode('utf-8')
360
+ }
361
+ })
362
+
363
+ def transcribe_sync():
364
+ return gemini_client.models.generate_content(
365
+ model=GEMINI_MODEL_LITE,
366
+ contents=gemini_contents,
367
+ config=generation_config,
368
+ )
369
+
370
+ response = await asyncio.wait_for(
371
+ asyncio.to_thread(transcribe_sync),
372
+ timeout=timeout_seconds
373
+ )
374
+
375
+ logger.info(f"✅ Audio transcription completed successfully")
376
+
377
+ if response and hasattr(response, 'text') and response.text:
378
+ return [TextContent(type="text", text=response.text.strip())]
379
+ elif response and hasattr(response, 'candidates') and response.candidates:
380
+ text_parts = []
381
+ for candidate in response.candidates:
382
+ if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
383
+ for part in candidate.content.parts:
384
+ if hasattr(part, 'text'):
385
+ text_parts.append(part.text)
386
+ if text_parts:
387
+ text = ''.join(text_parts).strip()
388
+ return [TextContent(type="text", text=text)]
389
+ else:
390
+ return [TextContent(type="text", text="Error: No text in transcription response")]
391
+ else:
392
+ return [TextContent(type="text", text="Error: No response from transcription")]
393
+
394
+ except asyncio.TimeoutError:
395
+ error_msg = f"Audio transcription timed out"
396
+ logger.error(f"❌ {error_msg}")
397
+ return [TextContent(type="text", text=f"Error: {error_msg}")]
398
+ except Exception as e:
399
+ logger.error(f"❌ Error transcribing audio: {type(e).__name__}: {e}")
400
+ import traceback
401
+ logger.debug(f"Full traceback: {traceback.format_exc()}")
402
+ return [TextContent(type="text", text=f"Error: {str(e)}")]
403
+
404
+ except Exception as e:
405
+ logger.error(f"Error in transcribe_audio: {e}")
406
+ return [TextContent(type="text", text=f"Error: {str(e)}")]
407
+ elif name == "text_to_speech":
408
+ try:
409
+ text = arguments.get("text")
410
+ if not text:
411
+ logger.error("❌ text is required but missing")
412
+ return [TextContent(type="text", text="Error: text is required")]
413
+
414
+ language = arguments.get("language", "en")
415
+
416
+ # Note: Gemini API doesn't directly support TTS audio generation
417
+ # This tool is provided for MCP protocol compliance, but the client
418
+ # should use local TTS models (like maya1) for actual audio generation
419
+ logger.info(f"🔵 TTS request received for text: {text[:50]}...")
420
+ logger.info("ℹ️ Gemini API doesn't support direct TTS. Client should use local TTS model.")
421
+
422
+ # Return a signal that client should handle TTS locally
423
+ # The client will interpret this and use its local TTS model
424
+ return [TextContent(type="text", text="USE_LOCAL_TTS")]
425
+
426
+ except Exception as e:
427
+ logger.error(f"Error in text_to_speech: {e}")
428
+ return [TextContent(type="text", text=f"Error: {str(e)}")]
429
  else:
430
  return [TextContent(type="text", text=f"Unknown tool: {name}")]
431
 
 
438
  logger.info(f"Default Lite Model: {GEMINI_MODEL_LITE}")
439
  logger.info("=" * 60)
440
 
 
 
 
441
  # Keep logging enabled for debugging
442
  original_root_level = logging.getLogger("root").level
443
  logging.getLogger("root").setLevel(logging.INFO)
444
 
445
  try:
446
+ # Use stdio_server from mcp.server.stdio
447
+ from mcp.server.stdio import stdio_server
448
  async with stdio_server() as streams:
449
  # Prepare server capabilities for initialization
450
  try:
451
+ if hasattr(server, "get_capabilities"):
452
  notification_options = NotificationOptions()
453
  experimental_capabilities: dict[str, dict[str, Any]] = {}
454
+ server_capabilities = server.get_capabilities(
455
  notification_options=notification_options,
456
  experimental_capabilities=experimental_capabilities,
457
  )
 
470
  logger.info("MCP server ready")
471
  try:
472
  # Run the server - it will automatically handle the initialization handshake
473
+ await server.run(
474
  read_stream=streams[0],
475
  write_stream=streams[1],
476
  initialization_options=init_options,
477
  )
478
  except Exception as run_error:
479
+ logger.error(f"Error in server.run(): {run_error}")
480
  raise
481
  except Exception as e:
482
  logging.getLogger("root").setLevel(original_root_level)
voice.py CHANGED
@@ -15,26 +15,57 @@ except ImportError:
15
 
16
 
17
  async def transcribe_audio_gemini(audio_path: str) -> str:
18
- """Transcribe audio using Gemini MCP"""
19
  if not MCP_AVAILABLE:
20
  return ""
21
 
22
  try:
23
- audio_path_abs = os.path.abspath(audio_path)
24
- files = [{"path": audio_path_abs}]
 
 
 
 
 
 
 
 
 
 
25
 
26
- system_prompt = "You are a professional transcription service. Provide accurate, well-formatted transcripts."
27
- user_prompt = "Please transcribe this audio file. Include speaker identification if multiple speakers are present, and format it with proper punctuation and paragraphs, remove mumble, ignore non-verbal noises."
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- result = await call_agent(
30
- user_prompt=user_prompt,
31
- system_prompt=system_prompt,
32
- files=files,
33
- model=config.GEMINI_MODEL_LITE,
34
- temperature=0.2
35
  )
36
 
37
- return result.strip()
 
 
 
 
 
 
 
 
 
38
  except Exception as e:
39
  logger.error(f"Gemini transcription error: {e}")
40
  return ""
@@ -83,24 +114,33 @@ def transcribe_audio(audio):
83
 
84
 
85
  async def generate_speech_mcp(text: str) -> str:
86
- """Generate speech using MCP TTS tool"""
87
  if not MCP_AVAILABLE:
88
  return None
89
 
90
  try:
91
  session = await get_mcp_session()
92
  if session is None:
 
93
  return None
94
 
95
  tools = await get_cached_mcp_tools()
96
  tts_tool = None
97
  for tool in tools:
98
- tool_name_lower = tool.name.lower()
99
- if "tts" in tool_name_lower or "speech" in tool_name_lower or "synthesize" in tool_name_lower:
100
  tts_tool = tool
101
- logger.info(f"Found MCP TTS tool: {tool.name}")
102
  break
103
 
 
 
 
 
 
 
 
 
 
104
  if tts_tool:
105
  result = await session.call_tool(
106
  tts_tool.name,
@@ -110,8 +150,13 @@ async def generate_speech_mcp(text: str) -> str:
110
  if hasattr(result, 'content') and result.content:
111
  for item in result.content:
112
  if hasattr(item, 'text'):
113
- if os.path.exists(item.text):
114
- return item.text
 
 
 
 
 
115
  elif hasattr(item, 'data') and item.data:
116
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
117
  tmp_file.write(item.data)
 
15
 
16
 
17
  async def transcribe_audio_gemini(audio_path: str) -> str:
18
+ """Transcribe audio using Gemini MCP transcribe_audio tool"""
19
  if not MCP_AVAILABLE:
20
  return ""
21
 
22
  try:
23
+ session = await get_mcp_session()
24
+ if session is None:
25
+ logger.warning("MCP session not available for transcription")
26
+ return ""
27
+
28
+ tools = await get_cached_mcp_tools()
29
+ transcribe_tool = None
30
+ for tool in tools:
31
+ if tool.name == "transcribe_audio":
32
+ transcribe_tool = tool
33
+ logger.info(f"Found MCP transcribe_audio tool: {tool.name}")
34
+ break
35
 
36
+ if not transcribe_tool:
37
+ logger.warning("transcribe_audio MCP tool not found, falling back to generate_content")
38
+ # Fallback to using generate_content
39
+ audio_path_abs = os.path.abspath(audio_path)
40
+ files = [{"path": audio_path_abs}]
41
+ system_prompt = "You are a professional transcription service. Provide accurate, well-formatted transcripts."
42
+ user_prompt = "Please transcribe this audio file. Include speaker identification if multiple speakers are present, and format it with proper punctuation and paragraphs, remove mumble, ignore non-verbal noises."
43
+ result = await call_agent(
44
+ user_prompt=user_prompt,
45
+ system_prompt=system_prompt,
46
+ files=files,
47
+ model=config.GEMINI_MODEL_LITE,
48
+ temperature=0.2
49
+ )
50
+ return result.strip()
51
 
52
+ # Use the transcribe_audio tool
53
+ audio_path_abs = os.path.abspath(audio_path)
54
+ result = await session.call_tool(
55
+ transcribe_tool.name,
56
+ arguments={"audio_path": audio_path_abs}
 
57
  )
58
 
59
+ if hasattr(result, 'content') and result.content:
60
+ for item in result.content:
61
+ if hasattr(item, 'text'):
62
+ transcribed_text = item.text.strip()
63
+ if transcribed_text:
64
+ logger.info(f"✅ Transcribed via MCP transcribe_audio tool: {transcribed_text[:50]}...")
65
+ return transcribed_text
66
+
67
+ logger.warning("MCP transcribe_audio returned empty result")
68
+ return ""
69
  except Exception as e:
70
  logger.error(f"Gemini transcription error: {e}")
71
  return ""
 
114
 
115
 
116
  async def generate_speech_mcp(text: str) -> str:
117
+ """Generate speech using MCP text_to_speech tool"""
118
  if not MCP_AVAILABLE:
119
  return None
120
 
121
  try:
122
  session = await get_mcp_session()
123
  if session is None:
124
+ logger.warning("MCP session not available for TTS")
125
  return None
126
 
127
  tools = await get_cached_mcp_tools()
128
  tts_tool = None
129
  for tool in tools:
130
+ if tool.name == "text_to_speech":
 
131
  tts_tool = tool
132
+ logger.info(f"Found MCP text_to_speech tool: {tool.name}")
133
  break
134
 
135
+ if not tts_tool:
136
+ # Fallback: search for any TTS-related tool
137
+ for tool in tools:
138
+ tool_name_lower = tool.name.lower()
139
+ if "tts" in tool_name_lower or "speech" in tool_name_lower or "synthesize" in tool_name_lower:
140
+ tts_tool = tool
141
+ logger.info(f"Found MCP TTS tool (fallback): {tool.name}")
142
+ break
143
+
144
  if tts_tool:
145
  result = await session.call_tool(
146
  tts_tool.name,
 
150
  if hasattr(result, 'content') and result.content:
151
  for item in result.content:
152
  if hasattr(item, 'text'):
153
+ text_result = item.text
154
+ # Check if it's a signal to use local TTS
155
+ if text_result == "USE_LOCAL_TTS":
156
+ logger.info("MCP TTS tool indicates client-side TTS should be used")
157
+ return None # Return None to trigger client-side TTS
158
+ elif os.path.exists(text_result):
159
+ return text_result
160
  elif hasattr(item, 'data') and item.data:
161
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
162
  tmp_file.write(item.data)