mtyrrell committed
Commit 335202a · 1 Parent(s): c245449

routing changes

Files changed (1)
  1. app/main.py +142 -91
app/main.py CHANGED
@@ -16,6 +16,7 @@ from contextlib import asynccontextmanager
 import threading
 from langchain_core.runnables import RunnableLambda
 import tempfile
+import mimetypes
 
 from utils import getconfig
 
@@ -23,11 +24,53 @@ config = getconfig("params.cfg")
 RETRIEVER = config.get("retriever", "RETRIEVER", fallback="https://giz-chatfed-retriever.hf.space")
 GENERATOR = config.get("generator", "GENERATOR", fallback="https://giz-chatfed-generator.hf.space")
 INGESTOR = config.get("ingestor", "INGESTOR", fallback="https://mtyrrell-chatfed-ingestor.hf.space")
+GEOJSON_INGESTOR = config.get("ingestor", "GEOJSON_INGESTOR", fallback="https://giz-eudr-chatfed-ingestor.hf.space")
 MAX_CONTEXT_CHARS = config.get("general", "MAX_CONTEXT_CHARS")
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# File type detection
+def detect_file_type(filename: str, file_content: bytes = None) -> str:
+    """Detect file type based on extension and content"""
+    if not filename:
+        return "unknown"
+
+    # Get file extension
+    _, ext = os.path.splitext(filename.lower())
+
+    # Define file type mappings
+    file_type_mappings = {
+        '.geojson': 'geojson',
+        '.json': 'json',  # Could be geojson, will check content
+        '.pdf': 'text',
+        '.docx': 'text',
+        '.doc': 'text',
+        '.txt': 'text',
+        '.md': 'text',
+        '.csv': 'text',
+        '.xlsx': 'text',
+        '.xls': 'text'
+    }
+
+    detected_type = file_type_mappings.get(ext, 'unknown')
+
+    # For JSON files, check if it's actually GeoJSON
+    if detected_type == 'json' and file_content:
+        try:
+            import json
+            content_str = file_content.decode('utf-8')
+            data = json.loads(content_str)
+            # Check if it has GeoJSON structure
+            if isinstance(data, dict) and ('type' in data and data.get('type') == 'FeatureCollection'):
+                detected_type = 'geojson'
+            elif isinstance(data, dict) and ('type' in data and data.get('type') in ['Feature', 'Point', 'LineString', 'Polygon', 'MultiPoint', 'MultiLineString', 'MultiPolygon', 'GeometryCollection']):
+                detected_type = 'geojson'
+        except:
+            pass  # Keep as json if parsing fails
+
+    logger.info(f"Detected file type: {detected_type} for file: {filename}")
+    return detected_type
 
 # Models
 class GraphState(TypedDict):
@@ -42,6 +85,8 @@ class GraphState(TypedDict):
     file_content: Optional[bytes]
     filename: Optional[str]
    metadata: Optional[Dict[str, Any]]
+    file_type: Optional[str]
+    workflow_type: Optional[str]  # 'standard' or 'geojson_direct'
 
 class ChatFedInput(TypedDict):
     query: str
@@ -61,9 +106,38 @@ class ChatFedOutput(TypedDict):
 class ChatUIInput(BaseModel):
     text: str
 
+# File type detection node
+def detect_file_type_node(state: GraphState) -> GraphState:
+    """Detect file type and determine workflow"""
+    file_type = "unknown"
+    workflow_type = "standard"
+
+    if state.get("file_content") and state.get("filename"):
+        file_type = detect_file_type(state["filename"], state["file_content"])
+
+        # Determine workflow based on file type
+        if file_type == "geojson":
+            workflow_type = "geojson_direct"
+        else:
+            workflow_type = "standard"
+
+    logger.info(f"File type: {file_type}, Workflow: {workflow_type}")
+
+    metadata = state.get("metadata", {})
+    metadata.update({
+        "file_type": file_type,
+        "workflow_type": workflow_type
+    })
+
+    return {
+        "file_type": file_type,
+        "workflow_type": workflow_type,
+        "metadata": metadata
+    }
+
 # Module functions
 def ingest_node(state: GraphState) -> GraphState:
-    """Process file through ingestor if file is provided"""
+    """Process file through appropriate ingestor based on file type"""
     start_time = datetime.now()
 
     # If no file provided, skip this step
@@ -71,10 +145,19 @@ def ingest_node(state: GraphState) -> GraphState:
         logger.info("No file provided, skipping ingestion")
         return {"ingestor_context": "", "metadata": state.get("metadata", {})}
 
-    logger.info(f"Ingesting file: {state['filename']}")
+    file_type = state.get("file_type", "unknown")
+    logger.info(f"Ingesting {file_type} file: {state['filename']}")
 
     try:
-        client = Client(INGESTOR)
+        # Choose ingestor based on file type
+        if file_type == "geojson":
+            ingestor_url = GEOJSON_INGESTOR
+            logger.info(f"Using GeoJSON ingestor: {ingestor_url}")
+        else:
+            ingestor_url = INGESTOR
+            logger.info(f"Using standard ingestor: {ingestor_url}")
+
+        client = Client(ingestor_url)
 
         # Create a temporary file to upload
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(state["filename"])[1]) as tmp_file:
@@ -82,9 +165,9 @@ def ingest_node(state: GraphState) -> GraphState:
             tmp_file_path = tmp_file.name
 
         try:
-            # Call the ingestor's ingest endpoint - use gradio_client.file() for proper formatting
+            # Call the ingestor's ingest endpoint
            ingestor_context = client.predict(
-                file(tmp_file_path),  # Use gradio_client.file() to properly format
+                file(tmp_file_path),
                 api_name="/ingest"
             )
 
@@ -103,7 +186,8 @@ def ingest_node(state: GraphState) -> GraphState:
         metadata.update({
             "ingestion_duration": duration,
             "ingestor_context_length": len(ingestor_context) if ingestor_context else 0,
-            "ingestion_success": True
+            "ingestion_success": True,
+            "ingestor_used": ingestor_url
         })
 
         return {
@@ -122,52 +206,26 @@ def ingest_node(state: GraphState) -> GraphState:
             "ingestion_error": str(e)
         })
         return {"ingestor_context": "", "metadata": metadata}
+
+def geojson_direct_result_node(state: GraphState) -> GraphState:
+    """For GeoJSON files, return ingestor results directly without retrieval/generation"""
+    logger.info("Processing GeoJSON file - returning direct results")
 
+    ingestor_context = state.get("ingestor_context", "")
+
+    # For GeoJSON files, the ingestor result is the final result
+    result = ingestor_context if ingestor_context else "No results from GeoJSON processing."
+
+    metadata = state.get("metadata", {})
+    metadata.update({
+        "processing_type": "geojson_direct",
+        "result_length": len(result)
+    })
+
+    return {
+        "result": result,
+        "metadata": metadata
+    }
-    try:
-        client = Client(INGESTOR)
-
-        # Create a temporary file to upload
-        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(state["filename"])[1]) as tmp_file:
-            tmp_file.write(state["file_content"])
-            tmp_file_path = tmp_file.name
-
-        try:
-            # Call the ingestor's ingest endpoint - returns context directly
-            ingestor_context = client.predict(
-                file=tmp_file_path,
-                api_name="/ingest"
-            )
-
-            logger.info(f"Ingest result length: {len(ingestor_context) if ingestor_context else 0}")
-
-        finally:
-            # Clean up temporary file
-            os.unlink(tmp_file_path)
-
-        duration = (datetime.now() - start_time).total_seconds()
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "ingestion_duration": duration,
-            "ingestor_context_length": len(ingestor_context) if ingestor_context else 0,
-            "ingestion_success": True
-        })
-
-        return {
-            "ingestor_context": ingestor_context,
-            "metadata": metadata
-        }
-
-    except Exception as e:
-        duration = (datetime.now() - start_time).total_seconds()
-        logger.error(f"Ingestion failed: {str(e)}")
-
-        metadata = state.get("metadata", {})
-        metadata.update({
-            "ingestion_duration": duration,
-            "ingestion_success": False,
-            "ingestion_error": str(e)
-        })
-        return {"ingestor_context": "", "metadata": metadata}
 
 def retrieve_node(state: GraphState) -> GraphState:
     start_time = datetime.now()
@@ -260,15 +318,41 @@ def generate_node(state: GraphState) -> GraphState:
         })
         return {"result": f"Error: {str(e)}", "metadata": metadata}
 
-# Updated graph with ingest node
+# Conditional routing function
+def route_workflow(state: GraphState) -> str:
+    """Route to appropriate workflow based on file type"""
+    workflow_type = state.get("workflow_type", "standard")
+    return workflow_type
+
+# Updated graph with conditional routing
 workflow = StateGraph(GraphState)
+workflow.add_node("detect_file_type", detect_file_type_node)
 workflow.add_node("ingest", ingest_node)
+workflow.add_node("geojson_direct", geojson_direct_result_node)
 workflow.add_node("retrieve", retrieve_node)
 workflow.add_node("generate", generate_node)
-workflow.add_edge(START, "ingest")
-workflow.add_edge("ingest", "retrieve")
+
+# Add edges
+workflow.add_edge(START, "detect_file_type")
+workflow.add_edge("detect_file_type", "ingest")
+
+# Conditional routing after ingestion
+workflow.add_conditional_edges(
+    "ingest",
+    route_workflow,
+    {
+        "geojson_direct": "geojson_direct",
+        "standard": "retrieve"
+    }
+)
+
+# Standard workflow
 workflow.add_edge("retrieve", "generate")
 workflow.add_edge("generate", END)
+
+# GeoJSON direct workflow
+workflow.add_edge("geojson_direct", END)
+
 compiled_graph = workflow.compile()
 
 def process_query_core(
@@ -299,6 +383,8 @@ def process_query_core(
         "year_filter": year_filter or "",
         "file_content": file_content,
         "filename": filename,
+        "file_type": "unknown",
+        "workflow_type": "standard",
         "metadata": {
             "session_id": session_id,
             "user_id": user_id,
@@ -404,12 +490,12 @@ def process_query_langserve(input_data: ChatFedInput) -> ChatFedOutput:
 def create_gradio_interface():
     with gr.Blocks(title="ChatFed Orchestrator") as demo:
         gr.Markdown("# ChatFed Orchestrator")
-        gr.Markdown("Upload documents (PDF/DOCX) alongside your queries for enhanced context. MCP endpoints available at `/gradio_api/mcp/sse`")
+        gr.Markdown("Upload documents (PDF/DOCX/GeoJSON) alongside your queries for enhanced context. MCP endpoints available at `/gradio_api/mcp/sse`")
 
         with gr.Row():
             with gr.Column():
                 query_input = gr.Textbox(label="Query", lines=2, placeholder="Enter your question...")
-                file_input = gr.File(label="Upload Document (PDF/DOCX)", file_types=[".pdf", ".docx"])
+                file_input = gr.File(label="Upload Document (PDF/DOCX/GeoJSON)", file_types=[".pdf", ".docx", ".geojson", ".json"])
 
                 with gr.Accordion("Filters (Optional)", open=False):
                     reports_filter_input = gr.Textbox(label="Reports Filter", placeholder="e.g., annual_reports")
@@ -496,41 +582,6 @@ async def chatfed_with_file(
 
     return ChatFedOutput(result=result["result"], metadata=result["metadata"])
 
-# Additional endpoint for file uploads via API
-@app.post("/chatfed-with-file")
-async def chatfed_with_file(
-    query: str = Form(...),
-    file: Optional[UploadFile] = File(None),
-    reports_filter: Optional[str] = Form(""),
-    sources_filter: Optional[str] = Form(""),
-    subtype_filter: Optional[str] = Form(""),
-    year_filter: Optional[str] = Form(""),
-    session_id: Optional[str] = Form(None),
-    user_id: Optional[str] = Form(None)
-):
-    """Endpoint for queries with optional file attachments"""
-    file_content = None
-    filename = None
-
-    if file:
-        file_content = await file.read()
-        filename = file.filename
-
-    result = process_query_core(
-        query=query,
-        reports_filter=reports_filter,
-        sources_filter=sources_filter,
-        subtype_filter=subtype_filter,
-        year_filter=year_filter,
-        file_content=file_content,
-        filename=filename,
-        session_id=session_id,
-        user_id=user_id,
-        return_metadata=True
-    )
-
-    return ChatFedOutput(result=result["result"], metadata=result["metadata"])
-
 # LangServe routes (these are the main endpoints)
 add_routes(
     app,
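
For orientation, the sketch below illustrates the conditional-routing pattern this commit wires into the graph: a detection step tags the state, and add_conditional_edges sends GeoJSON uploads from "ingest" straight to a direct-result node, while everything else continues through "retrieve" and "generate". It is a minimal, self-contained example assuming langgraph is installed; the graph node names mirror the diff, but the stub functions here are stand-ins rather than the real ingestor, retriever, and generator calls.

from typing import Optional, TypedDict

from langgraph.graph import StateGraph, START, END

# Toy state carrying only the fields the routing decision depends on.
class ToyState(TypedDict):
    filename: Optional[str]
    workflow_type: str
    result: str

def detect_stub(state: ToyState) -> ToyState:
    # Stand-in for detect_file_type_node: extension check only.
    name = (state.get("filename") or "").lower()
    is_geo = name.endswith(".geojson") or name.endswith(".json")
    return {"workflow_type": "geojson_direct" if is_geo else "standard"}

def ingest_stub(state: ToyState) -> ToyState:
    # Stand-in for the gradio_client call to the (GeoJSON) ingestor.
    return {"result": "(ingestor output)"}

def geojson_direct_stub(state: ToyState) -> ToyState:
    # Mirrors geojson_direct_result_node: the ingestor output is the final result.
    return {"result": state["result"] + " returned directly"}

def retrieve_stub(state: ToyState) -> ToyState:
    return {"result": state["result"] + " + retrieved context"}

def generate_stub(state: ToyState) -> ToyState:
    return {"result": state["result"] + " + generated answer"}

def route_workflow(state: ToyState) -> str:
    # Same routing rule as the commit: branch on workflow_type after ingest.
    return state.get("workflow_type", "standard")

graph = StateGraph(ToyState)
graph.add_node("detect_file_type", detect_stub)
graph.add_node("ingest", ingest_stub)
graph.add_node("geojson_direct", geojson_direct_stub)
graph.add_node("retrieve", retrieve_stub)
graph.add_node("generate", generate_stub)
graph.add_edge(START, "detect_file_type")
graph.add_edge("detect_file_type", "ingest")
graph.add_conditional_edges("ingest", route_workflow,
                            {"geojson_direct": "geojson_direct", "standard": "retrieve"})
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)
graph.add_edge("geojson_direct", END)
toy_graph = graph.compile()

print(toy_graph.invoke({"filename": "plots.geojson", "workflow_type": "standard", "result": ""})["result"])
print(toy_graph.invoke({"filename": "report.pdf", "workflow_type": "standard", "result": ""})["result"])

Running this prints the direct path for a .geojson upload and the retrieve-plus-generate path for a PDF, which is the behavioral split the new workflow_type field and route_workflow function introduce.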