cosmosai471 committed on
Commit bb016b3 · verified · 1 Parent(s): 6ac02a9

Update app.py

Files changed (1)
  1. app.py +204 -129
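In short, this update makes Luna's leading [Intent: ...][Confidence: NN] tags machine-readable while keeping them out of the chat display, adds a confidence-gated web-search fallback, and puts hard caps on token streaming. As a rough standalone sketch of the tag parse-and-strip pattern (the regexes and tag format come from the diff below; parse_tags, strip_tags and the sample text are illustrative names, not part of app.py):

import re

SAMPLE = "[Intent: qa_general][Confidence: 85] Sure, here is a short answer."

def parse_tags(raw: str):
    # First occurrence only, mirroring get_intent_status() in the diff
    intent_m = re.search(r'\[Intent:\s*([\w\-\_]+)\]', raw, re.IGNORECASE)
    conf_m = re.search(r'\[Confidence:\s*([0-9]{1,3})\]', raw, re.IGNORECASE)
    intent = intent_m.group(1).lower() if intent_m else "default"
    confidence = int(conf_m.group(1)) if conf_m else None
    return intent, confidence

def strip_tags(raw: str) -> str:
    # Remove every bracketed tag so nothing tag-like reaches the chat display
    text = re.sub(r'\[Intent:\s*[\w\-\_]+\]', '', raw, flags=re.IGNORECASE)
    text = re.sub(r'\[Confidence:\s*\d{1,3}\]', '', text, flags=re.IGNORECASE)
    return text.strip()

print(parse_tags(SAMPLE))   # ('qa_general', 85)
print(strip_tags(SAMPLE))   # Sure, here is a short answer.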
app.py CHANGED
@@ -14,7 +14,7 @@ from diffusers import StableDiffusionPipeline
14
  from docx import Document
15
  from pptx import Presentation
16
  from io import BytesIO
17
- import numpy as np
18
 
19
  # --- CONFIGURATION & INITIALIZATION ---
20
  STT_DEVICE = "cpu"
@@ -30,12 +30,15 @@ MODEL_FILE = "luna.gguf"
30
  LOCAL_MODEL_PATH = MODEL_FILE
31
  SYSTEM_PROMPT = (
32
  "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate "
33
- "tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. "
34
  "Your full response must follow these tags."
35
  )
36
 
37
- # Configuration: confidence threshold for triggering web search fallback
38
- CONFIDENCE_THRESHOLD = 30 # only trigger web-search fallback if confidence is less than this
 
 
 
39
 
40
  def safe_del(self):
41
  try:
@@ -100,34 +103,46 @@ def simulate_recording_delay():
100
  time.sleep(3)
101
  return None
102
 
103
- def clean_response_stream(raw_text: str) -> str:
104
- """Cleans up raw response text by removing tags and repeats.
105
- We intentionally DO NOT split on plain words 'Intent' or 'Action' to avoid chopping tags.
106
  """
107
- # Cut at common separators marking model streaming boundaries
108
- clean_text = re.split(r'\nUser:|\nAssistant:|</s>', raw_text, 1)[0].strip()
109
- # Remove bracketed instruction tokens and inline actions
110
- clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
111
- # Remove tags for display ([Intent: ...], [Confidence: ...]) — keep them for parsing elsewhere
112
- clean_text = re.sub(r'\[Intent:\s*[\w\-\_]+\]|\[Confidence:\s*\d{1,3}\]', '', clean_text, flags=re.IGNORECASE).strip()
113
- # Deduplicate trailing repeated words (simple heuristic)
114
- words = clean_text.split()
115
  if len(words) > 4 and words[-2:] == words[-4:-2]:
116
- clean_text = ' '.join(words[:-2])
117
- return clean_text
118
 
119
  def web_search_tool(query: str) -> str:
120
- time.sleep(1.5)
121
  print(f"Simulating Google Search fallback for: {query}")
122
- return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
123
 
124
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
125
- """Checks confidence from the raw response tag and triggers fallback if very low.
126
-
127
- - If explicit [Confidence: N] exists, use it.
128
- - Otherwise fall back to heuristic based on cleaned response length.
129
- - Only triggers web search if below CONFIDENCE_THRESHOLD.
130
- """
131
  confidence_match = re.search(r'\[Confidence:\s*([0-9]{1,3})\]', raw_response_with_tags, flags=re.IGNORECASE)
132
  cleaned_response = clean_response_stream(raw_response_with_tags)
133
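The confidence gate this function implements reads simply in isolation: use the explicit [Confidence: N] value when present, otherwise fall back to a length heuristic, and only call the search tool when the score is under CONFIDENCE_THRESHOLD. A minimal sketch of that decision, assuming the values in the diff (threshold 30, heuristic scores 10/85, 30-character cutoff, apology branch for scores <= 5 or "error" text); fake_search and confidence_gate are stand-in names, not part of app.py:

CONFIDENCE_THRESHOLD = 30  # same default as in the diff

def fake_search(query: str) -> str:
    # Stand-in for web_search_tool(): returns a labelled snippet instead of real results
    return f"[web snippet for: {query}]"

def confidence_gate(confidence, cleaned: str, prompt: str) -> str:
    if confidence is None:
        # Length heuristic from the diff: short or empty answers count as low confidence
        confidence = 10 if len(cleaned.strip()) < 30 else 85
    if confidence >= CONFIDENCE_THRESHOLD:
        return cleaned
    snippet = fake_search(prompt)
    if confidence <= 5 or "error" in cleaned.lower():
        return f"I couldn't produce a reliable answer (Confidence: {confidence}%). {snippet}"
    return f"{cleaned}\n\n{snippet}"

print(confidence_gate(85, "Paris is the capital of France.", "capital of France"))
print(confidence_gate(None, "", "an obscure question"))  # empty answer falls back to the snippet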
 
@@ -138,33 +153,35 @@ def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> st
138
  except Exception:
139
  confidence_score = 0
140
  else:
141
- # heuristic: very short or empty cleaned response -> low confidence
142
- if not cleaned_response or len(cleaned_response.strip()) < 30:
143
- confidence_score = 10
144
- else:
145
- confidence_score = 85
146
 
147
  if confidence_score < CONFIDENCE_THRESHOLD:
148
- print(f"Low confidence ({confidence_score}%) detected (threshold={CONFIDENCE_THRESHOLD}). Triggering web-search fallback.")
 
149
  search_snippet = web_search_tool(prompt)
150
  if "error" in cleaned_response.lower() or confidence_score <= 5:
151
- final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
152
  else:
153
- final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
154
  else:
155
  final_response = cleaned_response
156
 
 
 
157
  return final_response
158
 
159
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
160
- """Perform VQA via the image_pipe. Returns a prompt-injection string for the LLM and success flag.
161
 
162
- If the VLM fails or returns nothing meaningful, return helpful instructions to the LLM rather than empty.
 
 
163
  """
164
  global image_pipe
165
  success = False
166
  if image_pipe is None:
167
- return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}", success
168
 
169
  image = None
170
  try:
@@ -172,33 +189,76 @@ def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
172
  image = Image.open(image_data_or_path).convert("RGB")
173
  elif isinstance(image_data_or_path, np.ndarray):
174
  image = Image.fromarray(image_data_or_path).convert("RGB")
175
 
176
- if image:
177
- vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
178
- results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
179
- raw_vlm_output = results[0].get('generated_text', "") if results and isinstance(results, list) else ""
180
- vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip() if raw_vlm_output else ""
181
-
182
- # If empty or nonsense, produce a friendly fallback message
183
- if not vqa_response:
184
- vqa_response = (
185
- "VQA analysis returned no clear answer. Possible reasons: image unreadable, wrong crop, or "
186
- "ambiguous content. Please re-upload a clearer image or provide more context about what you want."
187
- )
188
- success = False
189
- else:
190
- success = True
191
-
192
- del image
193
- prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
194
- return prompt_injection, success
195
 
196
  except Exception as e:
197
- print(f"Image Pipeline Error: {e}")
198
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
199
 
200
- return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
201
-
202
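The replacement for process_image() (its added lines appear further down in this diff) normalizes several possible VLM pipeline return shapes: a dict, a list of dicts or strings, a bare string, or nothing at all. A minimal sketch of that normalization step in isolation, assuming only those shapes occur; normalize_vlm_output and the sample payload are illustrative, not part of app.py:

from typing import Any

def normalize_vlm_output(results: Any) -> str:
    # Same shape checks as the new process_image(): dict, list of dicts/strings, str, or other
    if results is None:
        return ""
    if isinstance(results, dict):
        return results.get("generated_text") or results.get("text") or ""
    if isinstance(results, list):
        if not results:
            return ""
        first = results[0]
        if isinstance(first, dict):
            return first.get("generated_text") or first.get("text") or ""
        return first if isinstance(first, str) else ""
    if isinstance(results, str):
        return results
    return str(results)

raw = normalize_vlm_output([{"generated_text": "USER: <image>\nASSISTANT: a cat on a sofa"}])
print(raw.split("ASSISTANT:")[-1].strip())  # a cat on a sofa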
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
203
  if stt_pipe is None or audio_file_path is None:
204
  error_msg = "Error: Whisper model failed to load or no audio recorded."
@@ -248,29 +308,27 @@ INTENT_STATUS_MAP = {
248
  }
249
 
250
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
251
- """Parses intent (and removes tags for display). Returns (intent, status, cleaned_text_for_display)."""
252
- match = re.search(r'\[Intent:\s*([\w\-\_]+)\]', raw_response, re.IGNORECASE)
253
- intent = match.group(1).lower() if match else "default"
 
 
254
  if is_vqa_flow:
255
  intent = "vqa"
256
- # Remove only the display tags, keep raw_response intact elsewhere
257
- cleaned_text = re.sub(r'\[Intent:\s*[\w\-\_]+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE).strip()
258
- cleaned_text = re.sub(r'\[Confidence:\s*\d{1,3}\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE).strip()
259
- cleaned_text = clean_response_stream(cleaned_text) # extra clean
260
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
261
  return intent, status, cleaned_text
262
 
263
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
264
- """Generates a file (Image, DOCX, PPTX) and returns the file path for download.
265
-
266
- If content is too short or missing, ask the user to clarify instead of producing empty files.
267
- """
268
  file_path = None
269
  try:
270
- if not content or len(content.strip()) < 20:
271
  history[-1]['content'] = (
272
- f"⚠️ I was instructed to generate a {file_type}, but I don't have enough details. "
273
- "Could you please provide a short description or title for the file (what should it contain)?"
274
  )
275
  return history, None
276
 
@@ -297,7 +355,6 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
297
  try:
298
  slide.placeholders[1].text = content[:200] + "..."
299
  except Exception:
300
- # fallback if layout mismatch
301
  pass
302
  file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
303
  file_path = os.path.join(DOC_DIR, file_filename)
@@ -313,22 +370,21 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
313
  file_path = None
314
  return history, file_path
315
 
 
316
  # --- CORE GENERATOR FUNCTION ---
317
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
318
  """
319
- Returns: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
320
- Changes made:
321
- - user_turn will now only append the user message. We add the assistant entry here once generation starts,
322
- so there's no empty assistant box created prematurely.
323
  """
324
-
325
- # Validate that last item is a USER (we expect user_turn to add only the user record)
326
  if not history or history[-1]['role'] != 'user':
327
  yield history, False, "Error: Generator called in unexpected state (no user message found).", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
328
  return
329
 
330
  last_user_index = len(history) - 1
331
- original_message = history[last_user_index]['content'] if history[last_user_index]['content'] is not None else ""
332
 
333
  # Detect VQA flow
334
  is_vqa_flow = False
@@ -339,36 +395,37 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
339
  else:
340
  is_vqa_flow = image_input_data is not None
341
 
342
- # Process image if present (returns prompt injection for LLM)
343
  vqa_success = False
344
  llm_input_message = original_message
345
  if is_vqa_flow:
346
  processed_message, vqa_success = process_image(image_input_data, original_message)
347
- # Replace the user's content with tag for logging while preserving original_message separately
348
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
349
  llm_input_message = processed_message
350
 
351
- # Build prompt (system + conversation)
352
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
353
- for item in history[:-1]: # all conversation before last user
354
  role = item['role'].upper()
355
- content = item['content'] if item['content'] is not None else ""
356
  if role == "ASSISTANT":
 
357
  prompt += f"LUNA: {content}\n"
358
  elif role == "USER":
359
  prompt += f"USER: {content}\n"
360
  prompt += f"USER: {llm_input_message}\nLUNA: "
361
 
362
- # Now create assistant entry only when we begin generation (avoids empty assistant box)
363
  assistant_initial_text = "✨ Luna is starting to think..."
364
  history.append({"role": "assistant", "content": assistant_initial_text})
365
 
366
- # Early UI update to show the thinking state (assistant box will appear now)
367
  yield history, stop_signal, assistant_initial_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
368
- time.sleep(0.2)
369
 
370
  full_response = ""
371
  current_intent = "default"
 
372
 
373
  try:
374
  stream = llm.create_completion(
@@ -378,70 +435,99 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
378
  )
379
  except Exception as e:
380
  error_text = f"❌ Error generating response: {e}"
381
- # update assistant with error
382
  history[-1]['content'] = error_text
383
  yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
384
  return
385
 
386
- # Stream tokens and update assistant content incrementally (without exposing tags)
387
  try:
388
  for output in stream:
 
 
 
 
 
 
 
389
  token = output["choices"][0].get("text", "")
 
 
390
  full_response += token
391
- current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
392
- # display_text is cleaned (no [Intent] or [Confidence])
393
- # Ensure we never set assistant content to empty — if cleaned is empty, show a small typing indicator
394
- history[-1]['content'] = display_text if display_text.strip() else " Luna is forming a reply..."
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
 
396
  except Exception as e:
397
- # Stream interruption: salvage what we have
398
- _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
399
- error_msg = f"⚠️ Streaming interrupted: {e}"
400
- history[-1]['content'] = final_response_text if final_response_text.strip() else error_msg
401
- yield history, False, error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=True), image_input_data, gr.update(), gr.update()
402
  return
403
 
404
  # POST-PROCESSING & TOOL EXECUTION
405
  file_download_path = None
406
  _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
 
407
 
408
- # If model wants to run a tool but content is weak, ask for clarification instead of generating empty files
409
  if current_intent == "image_generate":
410
- if not content_for_tool or len(content_for_tool.strip()) < 20:
411
- history[-1]['content'] = "I detected a request to generate an image but I don't have enough prompt details. Please give a short description: e.g. 'sunset over mountains, vibrant colors'."
412
  else:
413
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
414
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
415
  history, file_download_path = generate_file_content(content_for_tool, history, "image")
 
416
  elif current_intent == "doc_generate":
417
- if not content_for_tool or len(content_for_tool.strip()) < 20:
418
- history[-1]['content'] = "I was asked to generate a document but I need more details. What's the document about? (1–2 sentences.)"
419
  else:
420
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
421
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
422
  history, file_download_path = generate_file_content(content_for_tool, history, "doc")
 
423
  elif current_intent == "ppt_generate":
424
- if not content_for_tool or len(content_for_tool.strip()) < 20:
425
- history[-1]['content'] = "I can make a short presentation, but please give me a title and 3–5 bullet points to include."
426
  else:
427
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
428
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
429
  history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
 
430
  elif current_intent == "open_google":
431
- final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
432
- history[-1]['content'] = final_cleaned_response
 
433
  elif current_intent == "open_camera":
434
- final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
435
- history[-1]['content'] = final_cleaned_response
 
436
  else:
437
- # Normal response path: check confidence and maybe augment with web-search snippet
438
  final_response_content = check_confidence_and_augment(full_response, original_message)
439
  history[-1]['content'] = final_response_content
440
 
441
- # If after all processing the assistant content is empty (defensive), fill a friendly fallback
442
  if not history[-1]['content'] or not str(history[-1]['content']).strip():
443
  history[-1]['content'] = "Sorry — I couldn't produce a good response. Can you rephrase or give more details?"
444
 
 
445
  audio_file_path = text_to_audio(history[-1]['content'], is_voice_chat)
446
 
447
  hint = "✅ Response generated."
@@ -456,9 +542,8 @@ def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, g
456
 
457
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
458
  """
459
- Appends only the USER message to chat_history. We no longer append an assistant placeholder here,
460
- so the UI won't show an empty assistant box immediately after user sends a message.
461
- The assistant will be appended inside chat_generator when generation begins.
462
  """
463
  has_text = bool(user_message and user_message.strip())
464
  has_image = False
@@ -472,17 +557,13 @@ def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_imag
472
  if not has_text and not has_image:
473
  return user_message, chat_history
474
 
475
- # Prevent double-sending if assistant is already generating (detect last assistant placeholder)
476
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] and "thinking" in chat_history[-1]['content'].lower():
477
  return user_message, chat_history
478
 
479
- if not has_text and has_image:
480
- user_message_to_add = "Analyzing Staged Media."
481
- else:
482
- user_message_to_add = user_message.strip()
483
-
484
  chat_history.append({"role": "user", "content": user_message_to_add})
485
- # do NOT append assistant here — chat_generator will append assistant entry when it starts
486
  return "", chat_history
487
 
488
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
@@ -491,7 +572,6 @@ def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
491
  return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
492
 
493
  def clear_staged_media() -> gr.update:
494
- """Clears the staged media state component."""
495
  return gr.update(value=None)
496
 
497
  def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
@@ -517,10 +597,7 @@ def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], s
517
 
518
 
519
  # --- GRADIO INTERFACE ---
520
-
521
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
522
-
523
- # --- State Components ---
524
  stop_signal = gr.State(value=False)
525
  is_voice_chat = gr.State(value=False)
526
  staged_image = gr.State(value=None)
@@ -558,7 +635,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
558
 
559
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
560
 
561
- # --- WIRE EVENTS ---
562
  btn_menu.click(
563
  fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
564
  )
@@ -600,7 +677,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
600
 
601
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
602
 
603
- # Text submit (Enter key)
604
  txt.submit(
605
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
606
  ).then(
@@ -609,7 +685,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
609
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
610
  )
611
 
612
- # Send button click
613
  combined_btn.click(
614
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
615
  ).then(
 
14
  from docx import Document
15
  from pptx import Presentation
16
  from io import BytesIO
17
+ import numpy as np
18
 
19
  # --- CONFIGURATION & INITIALIZATION ---
20
  STT_DEVICE = "cpu"
 
30
  LOCAL_MODEL_PATH = MODEL_FILE
31
  SYSTEM_PROMPT = (
32
  "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate "
33
+ "tags: an Intent tag and a Confidence tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. "
34
  "Your full response must follow these tags."
35
  )
36
 
37
+ # --- TUNABLES / GUARDS ---
38
+ CONFIDENCE_THRESHOLD = 30 # only trigger web-search fallback if confidence is less than this
39
+ STREAM_CHAR_LIMIT = 35000 # hard cap on streaming response size (prevents runaway)
40
+ STREAM_ITER_LIMIT = 20000 # hard cap on streaming token iterations
41
+ MIN_MEANINGFUL_LENGTH = 20 # used when determining if a tool prompt is sufficient
42
 
43
  def safe_del(self):
44
  try:
 
103
  time.sleep(3)
104
  return None
105
 
106
+ def remove_all_tags(text: str) -> str:
107
+ """Remove ALL bracketed Intent/Confidence tags and any plain 'Intent:' or 'Confidence:' lines.
108
+ This ensures tags never leak into the UI.
109
  """
110
+ if not text:
111
+ return ""
112
+ # remove bracketed tags like [Intent: xyz] [Confidence: 85]
113
+ text = re.sub(r'\[Intent:\s*[\w\-\_]+\]', '', text, flags=re.IGNORECASE)
114
+ text = re.sub(r'\[Confidence:\s*\d{1,3}\]', '', text, flags=re.IGNORECASE)
115
+ # remove any lines that start with "Intent:" or "Confidence:" (plain text)
116
+ text = re.sub(r'(?im)^\s*Intent:\s*.*$', '', text)
117
+ text = re.sub(r'(?im)^\s*Confidence:\s*.*$', '', text)
118
+ # collapse multiple blank lines
119
+ text = re.sub(r'\n\s*\n+', '\n\n', text).strip()
120
+ return text
121
+
122
+ def clean_response_stream(raw_text: str) -> str:
123
+ """Cleans up raw response text and removes any tag artifacts (safe for UI)."""
124
+ if not raw_text:
125
+ return ""
126
+ # Trim at common model separators to avoid carrying lots of leftover prompt text
127
+ truncated = re.split(r'\nUser:|\nAssistant:|</s>', raw_text, 1)[0].strip()
128
+ # Remove instruction tokens
129
+ truncated = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', truncated, flags=re.DOTALL).strip()
130
+ # Remove any explicit tags and any plain lines referencing Intent/Confidence
131
+ truncated = remove_all_tags(truncated)
132
+ # Deduplicate trivial repeated endings
133
+ words = truncated.split()
134
  if len(words) > 4 and words[-2:] == words[-4:-2]:
135
+ truncated = ' '.join(words[:-2])
136
+ return truncated.strip()
137
 
138
  def web_search_tool(query: str) -> str:
139
+ time.sleep(1.2)
140
  print(f"Simulating Google Search fallback for: {query}")
141
+ return f"\n\n🌐 **Web Search Results for '{query}':** I found supplemental info to help answer this."
142
 
143
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
144
+ """Checks confidence and optionally augments via web search. This function never returns raw tags."""
145
+ # Find only the first confidence occurrence (if any)
 
 
 
 
146
  confidence_match = re.search(r'\[Confidence:\s*([0-9]{1,3})\]', raw_response_with_tags, flags=re.IGNORECASE)
147
  cleaned_response = clean_response_stream(raw_response_with_tags)
148
 
 
153
  except Exception:
154
  confidence_score = 0
155
  else:
156
+ # heuristic: if the cleaned response is short or empty, regard as low confidence
157
+ confidence_score = 10 if not cleaned_response or len(cleaned_response) < 30 else 85
 
 
 
158
 
159
  if confidence_score < CONFIDENCE_THRESHOLD:
160
+ # very low confidence -> append search snippet
161
+ print(f"Low confidence ({confidence_score}%) detected (threshold={CONFIDENCE_THRESHOLD}). Using web fallback.")
162
  search_snippet = web_search_tool(prompt)
163
  if "error" in cleaned_response.lower() or confidence_score <= 5:
164
+ final_response = f"I apologize; I couldn't produce a reliable answer (Confidence: {confidence_score}%). {search_snippet}"
165
  else:
166
+ final_response = f"{cleaned_response}\n\n{search_snippet}\n\nIf you'd like, I can attempt a deeper search or try again."
167
  else:
168
  final_response = cleaned_response
169
 
170
+ # Ensure final_response contains no tags
171
+ final_response = remove_all_tags(final_response)
172
  return final_response
173
 
174
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
175
+ """Perform VQA via the image_pipe. Robust to different pipeline return types.
176
 
177
+ Returns:
178
+ - prompt_injection: text to include in LLM prompt describing the VQA output or error guidance
179
+ - success: whether the VLM produced a clear answer
180
  """
181
  global image_pipe
182
  success = False
183
  if image_pipe is None:
184
+ return f"[Image Processing Error: VLM model not loaded.] **User Query:** {message}", success
185
 
186
  image = None
187
  try:
 
189
  image = Image.open(image_data_or_path).convert("RGB")
190
  elif isinstance(image_data_or_path, np.ndarray):
191
  image = Image.fromarray(image_data_or_path).convert("RGB")
192
+ else:
193
+ # Unknown image container (e.g., bytes)
194
+ try:
195
+ image = Image.open(BytesIO(image_data_or_path)).convert("RGB")
196
+ except Exception:
197
+ image = None
198
+
199
+ if image is None:
200
+ return f"[Image Processing Error: Could not open image.] **User Query:** {message}", success
201
+
202
+ vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
203
+ # Some pipelines accept (image, prompt=...), some accept kwargs. Try both patterns and be defensive.
204
+ results = None
205
+ try:
206
+ results = image_pipe(image, prompt=vqa_prompt)
207
+ except TypeError:
208
+ # fallback signature
209
+ try:
210
+ results = image_pipe(image)
211
+ except Exception as e:
212
+ print(f"Image pipeline call failed: {e}")
213
+ results = None
214
+ except Exception as e:
215
+ print(f"Image pipeline call error: {e}")
216
+ results = None
217
+
218
+ raw_vlm_output = ""
219
+ if results is None:
220
+ raw_vlm_output = ""
221
+ elif isinstance(results, dict):
222
+ # some pipelines return dict with 'generated_text' or 'text'
223
+ raw_vlm_output = results.get('generated_text') or results.get('text') or ""
224
+ elif isinstance(results, list):
225
+ # list of dicts or strings
226
+ first = results[0]
227
+ if isinstance(first, dict):
228
+ raw_vlm_output = first.get('generated_text') or first.get('text') or ""
229
+ elif isinstance(first, str):
230
+ raw_vlm_output = first
231
+ elif isinstance(results, str):
232
+ raw_vlm_output = results
233
+ else:
234
+ # unknown shape -> convert to string safely
235
+ try:
236
+ raw_vlm_output = str(results)
237
+ except Exception:
238
+ raw_vlm_output = ""
239
 
240
+ # extract assistant part
241
+ vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip() if raw_vlm_output else ""
242
+
243
+ # If no meaningful vqa_response, return a helpful fallback message
244
+ if not vqa_response or len(vqa_response) < 10:
245
+ vqa_response = (
246
+ "VQA analysis did not return a clear answer. The image might be unclear or the request ambiguous. "
247
+ "Please try re-uploading a clearer image, crop to the subject, or add a short instruction about what you'd like answered."
248
+ )
249
+ success = False
250
+ else:
251
+ success = True
252
+
253
+ # Always remove any tags to prevent leaking
254
+ vqa_response = remove_all_tags(vqa_response)
255
+ prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
256
+ return prompt_injection, success
 
 
257
 
258
  except Exception as e:
259
+ print(f"Image Pipeline Exception: {e}")
260
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
261
 
 
 
262
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
263
  if stt_pipe is None or audio_file_path is None:
264
  error_msg = "Error: Whisper model failed to load or no audio recorded."
 
308
  }
309
 
310
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
311
+ """Parses intent (first occurrence only) and returns (intent, status, cleaned_display_text).
312
+ Importantly: this DOES NOT expose any tags — we remove them for display.
313
+ """
314
+ intent_match = re.search(r'\[Intent:\s*([\w\-\_]+)\]', raw_response, re.IGNORECASE)
315
+ intent = intent_match.group(1).lower() if intent_match else "default"
316
  if is_vqa_flow:
317
  intent = "vqa"
318
+
319
+ # Clean raw_response for display: remove all tags and noisy prompt leftovers
320
+ cleaned_text = clean_response_stream(raw_response)
 
321
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
322
  return intent, status, cleaned_text
323
 
324
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
325
+ """Generates a file and writes it to disk. If content insufficient, asks for clarification."""
 
 
 
326
  file_path = None
327
  try:
328
+ if not content or len(content.strip()) < MIN_MEANINGFUL_LENGTH:
329
  history[-1]['content'] = (
330
+ f"⚠️ I was asked to create a {file_type}, but I don't have enough details. "
331
+ "Please provide a 1–2 sentence description of what the file should contain."
332
  )
333
  return history, None
334
 
 
355
  try:
356
  slide.placeholders[1].text = content[:200] + "..."
357
  except Exception:
 
358
  pass
359
  file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
360
  file_path = os.path.join(DOC_DIR, file_filename)
 
370
  file_path = None
371
  return history, file_path
372
 
373
+
374
  # --- CORE GENERATOR FUNCTION ---
375
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
376
  """
377
+ - Assistant entry is appended ONLY when generation begins (avoids empty assistant box).
378
+ - Strict caps on streaming to avoid infinite printing.
379
+ - Strict removal of tags before any content is written to history for UI.
 
380
  """
381
+ # Validate last item is user's message
 
382
  if not history or history[-1]['role'] != 'user':
383
  yield history, False, "Error: Generator called in unexpected state (no user message found).", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
384
  return
385
 
386
  last_user_index = len(history) - 1
387
+ original_message = history[last_user_index]['content'] or ""
388
 
389
  # Detect VQA flow
390
  is_vqa_flow = False
 
395
  else:
396
  is_vqa_flow = image_input_data is not None
397
 
 
398
  vqa_success = False
399
  llm_input_message = original_message
400
  if is_vqa_flow:
401
  processed_message, vqa_success = process_image(image_input_data, original_message)
402
+ # mark user's entry (but keep original message around)
403
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
404
  llm_input_message = processed_message
405
 
406
+ # Build prompt
407
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
408
+ for item in history[:-1]:
409
  role = item['role'].upper()
410
+ content = item['content'] or ""
411
  if role == "ASSISTANT":
412
+ # ensure assistant content used in prompt still includes tags if the model expects them (we don't alter it)
413
  prompt += f"LUNA: {content}\n"
414
  elif role == "USER":
415
  prompt += f"USER: {content}\n"
416
  prompt += f"USER: {llm_input_message}\nLUNA: "
417
 
418
+ # Add assistant entry now (so it appears only when generation starts)
419
  assistant_initial_text = "✨ Luna is starting to think..."
420
  history.append({"role": "assistant", "content": assistant_initial_text})
421
 
422
+ # Early UI update: show thinking state
423
  yield history, stop_signal, assistant_initial_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
424
+ time.sleep(0.15)
425
 
426
  full_response = ""
427
  current_intent = "default"
428
+ iter_count = 0
429
 
430
  try:
431
  stream = llm.create_completion(
 
435
  )
436
  except Exception as e:
437
  error_text = f"❌ Error generating response: {e}"
 
438
  history[-1]['content'] = error_text
439
  yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
440
  return
441
 
442
+ # Stream tokens with strict caps and tag-removal for UI
443
  try:
444
  for output in stream:
445
+ iter_count += 1
446
+ if iter_count > STREAM_ITER_LIMIT:
447
+ # safety abort
448
+ full_response += "\n\n[Stream stopped: reached iteration limit]"
449
+ print("Stream aborted: iteration limit reached.")
450
+ break
451
+
452
  token = output["choices"][0].get("text", "")
453
+ if not isinstance(token, str):
454
+ token = str(token)
455
  full_response += token
456
+
457
+ # safety cap on characters to prevent runaway printing
458
+ if len(full_response) > STREAM_CHAR_LIMIT:
459
+ full_response = full_response[:STREAM_CHAR_LIMIT] + "\n\n[Truncated: stream length limit reached]"
460
+ print("Stream truncated: char limit reached.")
461
+ break
462
+
463
+ # parse intent/status and cleaned display without exposing tags
464
+ current_intent, current_hint, cleaned_display = get_intent_status(full_response, is_vqa_flow and vqa_success)
465
+
466
+ # enforce tag-suppression: cleaned_display MUST NOT contain tag patterns
467
+ cleaned_display = remove_all_tags(cleaned_display)
468
+ # guarantee non-empty display while streaming
469
+ if not cleaned_display.strip():
470
+ cleaned_display = "✨ Luna is forming a reply..."
471
+ history[-1]['content'] = cleaned_display
472
+
473
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
474
+
475
  except Exception as e:
476
+ # On exception, salvage partial output
477
+ _, _, final_clean = get_intent_status(full_response, is_vqa_flow and vqa_success)
478
+ final_clean = remove_all_tags(final_clean) or f"⚠️ Streaming interrupted: {e}"
479
+ history[-1]['content'] = final_clean
480
+ yield history, False, f"⚠️ Streaming interrupted: {e}", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=True), image_input_data, gr.update(), gr.update()
481
  return
482
 
483
  # POST-PROCESSING & TOOL EXECUTION
484
  file_download_path = None
485
  _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
486
+ content_for_tool = remove_all_tags(content_for_tool) # ensure no tags in tool prompts
487
 
488
+ # Handle tool intents, but require sufficient content; otherwise ask for clarification
489
  if current_intent == "image_generate":
490
+ if not content_for_tool or len(content_for_tool.strip()) < MIN_MEANINGFUL_LENGTH:
491
+ history[-1]['content'] = "I detected an image generation intent but didn't get enough details. Please give a short prompt (e.g. 'a red bicycle in a park at sunrise')."
492
  else:
493
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
494
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
495
  history, file_download_path = generate_file_content(content_for_tool, history, "image")
496
+
497
  elif current_intent == "doc_generate":
498
+ if not content_for_tool or len(content_for_tool.strip()) < MIN_MEANINGFUL_LENGTH:
499
+ history[-1]['content'] = "I can create a document, but I need a 1–2 sentence description of what to include."
500
  else:
501
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
502
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
503
  history, file_download_path = generate_file_content(content_for_tool, history, "doc")
504
+
505
  elif current_intent == "ppt_generate":
506
+ if not content_for_tool or len(content_for_tool.strip()) < MIN_MEANINGFUL_LENGTH:
507
+ history[-1]['content'] = "I can make a presentation. Please give a title and 3–5 bullet points."
508
  else:
509
  history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
510
  yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
511
  history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
512
+
513
  elif current_intent == "open_google":
514
+ final_cleaned_response = (content_for_tool or "").strip() + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
515
+ history[-1]['content'] = remove_all_tags(final_cleaned_response)
516
+
517
  elif current_intent == "open_camera":
518
+ final_cleaned_response = (content_for_tool or "").strip() + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
519
+ history[-1]['content'] = remove_all_tags(final_cleaned_response)
520
+
521
  else:
522
+ # default path: evaluate confidence and optionally augment with web search
523
  final_response_content = check_confidence_and_augment(full_response, original_message)
524
  history[-1]['content'] = final_response_content
525
 
526
+ # Final defensive fallback
527
  if not history[-1]['content'] or not str(history[-1]['content']).strip():
528
  history[-1]['content'] = "Sorry — I couldn't produce a good response. Can you rephrase or give more details?"
529
 
530
+ # convert to audio if requested
531
  audio_file_path = text_to_audio(history[-1]['content'], is_voice_chat)
532
 
533
  hint = "✅ Response generated."
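The streaming loop added above is bounded by two guards, STREAM_ITER_LIMIT and STREAM_CHAR_LIMIT, so a runaway completion cannot print indefinitely. A simplified, self-contained sketch of that guard, with dummy_stream standing in for llm.create_completion(..., stream=True); the cap values match the diff, the helper names are illustrative:

STREAM_CHAR_LIMIT = 35000   # same caps as the diff
STREAM_ITER_LIMIT = 20000

def dummy_stream():
    # Stand-in for the llama.cpp stream: yields token chunks forever
    while True:
        yield {"choices": [{"text": "token "}]}

def capped_stream(stream) -> str:
    full = ""
    for i, output in enumerate(stream, start=1):
        if i > STREAM_ITER_LIMIT:
            full += "\n\n[Stream stopped: reached iteration limit]"
            break
        full += output["choices"][0].get("text", "")
        if len(full) > STREAM_CHAR_LIMIT:
            full = full[:STREAM_CHAR_LIMIT] + "\n\n[Truncated: stream length limit reached]"
            break
    return full

text = capped_stream(dummy_stream())
print(len(text))  # bounded no matter how long the dummy stream would run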
 
542
 
543
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
544
  """
545
+ Appends only the USER message to chat_history. Assistant entry is appended inside chat_generator
546
+ once generation starts (avoids empty assistant box).
 
547
  """
548
  has_text = bool(user_message and user_message.strip())
549
  has_image = False
 
557
  if not has_text and not has_image:
558
  return user_message, chat_history
559
 
560
+ # Prevent double-send if assistant already generating
561
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] and "thinking" in chat_history[-1]['content'].lower():
562
  return user_message, chat_history
563
 
564
+ user_message_to_add = "Analyzing Staged Media." if (not has_text and has_image) else user_message.strip()
 
 
 
 
565
  chat_history.append({"role": "user", "content": user_message_to_add})
566
+ # DO NOT append assistant here
567
  return "", chat_history
568
 
569
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
 
572
  return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
573
 
574
  def clear_staged_media() -> gr.update:
 
575
  return gr.update(value=None)
576
 
577
  def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
 
597
 
598
 
599
  # --- GRADIO INTERFACE ---
 
600
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
 
 
601
  stop_signal = gr.State(value=False)
602
  is_voice_chat = gr.State(value=False)
603
  staged_image = gr.State(value=None)
 
635
 
636
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
637
 
638
+ # --- WIRING ---
639
  btn_menu.click(
640
  fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
641
  )
 
677
 
678
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
679
 
 
680
  txt.submit(
681
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
682
  ).then(
 
685
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
686
  )
687
 
 
688
  combined_btn.click(
689
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
690
  ).then(