Skip to content

Commit beb6cd8

Browse files
authored
Merge pull request #85 from owndev/copilot/fix-azure-ai-citations-issue
Fix Azure AI Foundry citations: filter unreferenced, handle empty fields
2 parents c3b997b + 8ff285a commit beb6cd8

File tree

1 file changed

+115
-19
lines changed

1 file changed

+115
-19
lines changed

pipelines/azure/azure_ai_foundry.py

Lines changed: 115 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
author_url: https://github.com/owndev/
55
project_url: https://github.com/owndev/Open-WebUI-Functions
66
funding_url: https://github.com/sponsors/owndev
7-
version: 2.5.1
7+
version: 2.5.2
88
license: Apache License 2.0
99
description: A pipeline for interacting with Azure AI services, enabling seamless communication with various AI models via configurable headers and robust error handling. This includes support for Azure OpenAI models as well as other Azure AI models by dynamically managing headers and request configurations. Azure AI Search (RAG) integration is only supported with Azure OpenAI endpoints.
1010
features:
@@ -18,7 +18,17 @@
1818
- Azure AI Search / RAG integration with enhanced citation display (Azure OpenAI only)
1919
"""
2020

21-
from typing import List, Union, Generator, Iterator, Optional, Dict, Any, AsyncIterator
21+
from typing import (
22+
List,
23+
Union,
24+
Generator,
25+
Iterator,
26+
Optional,
27+
Dict,
28+
Any,
29+
AsyncIterator,
30+
Set,
31+
)
2232
from urllib.parse import urlparse
2333
from fastapi.responses import StreamingResponse
2434
from pydantic import BaseModel, Field, GetCoreSchemaHandler
@@ -30,6 +40,7 @@
3040
import logging
3141
import base64
3242
import hashlib
43+
import re
3344
from pydantic_core import core_schema
3445

3546

@@ -375,7 +386,7 @@ def enhance_azure_search_response(self, response: Dict[str, Any]) -> Dict[str, A
375386
# Add citation section at the end
376387
if citation_details:
377388
citation_section = self._format_citation_section(
378-
citations, for_streaming=False
389+
citations, content, for_streaming=False
379390
)
380391
enhanced_content += citation_section
381392

@@ -574,6 +585,7 @@ async def stream_processor_with_citations(
574585

575586
try:
576587
full_response_buffer = ""
588+
response_content = "" # Track the actual response content
577589
citations_data = None
578590
citations_added = False
579591
all_chunks = []
@@ -586,6 +598,33 @@ async def stream_processor_with_citations(
586598
# Log chunk for debugging (only first 200 chars to avoid spam)
587599
log.debug(f"Processing chunk: {chunk_str[:200]}...")
588600

601+
# Extract content from delta messages to build the full response content
602+
try:
603+
lines = chunk_str.split("\n")
604+
for line in lines:
605+
if line.startswith("data: ") and line.strip() != "data: [DONE]":
606+
json_str = line[6:].strip()
607+
if json_str and json_str != "[DONE]":
608+
try:
609+
response_data = json.loads(json_str)
610+
if (
611+
isinstance(response_data, dict)
612+
and "choices" in response_data
613+
):
614+
for choice in response_data["choices"]:
615+
if (
616+
"delta" in choice
617+
and "content" in choice["delta"]
618+
):
619+
response_content += choice["delta"][
620+
"content"
621+
]
622+
except json.JSONDecodeError:
623+
# Malformed or incomplete JSON is expected in streamed chunks; safely skip.
624+
pass
625+
except Exception as e:
626+
log.debug(f"Exception while processing chunk: {e}")
627+
589628
# Look for citations in any part of the response
590629
if "citations" in chunk_str.lower() and not citations_data:
591630
log.debug("Found 'citations' in chunk, attempting to parse...")
@@ -674,8 +713,9 @@ async def stream_processor_with_citations(
674713
if citations_data and not citations_added:
675714
log.info("Adding citation summary at end of stream...")
676715

716+
# Pass the accumulated response content to filter citations
677717
citation_section = self._format_citation_section(
678-
citations_data, for_streaming=True
718+
citations_data, response_content, for_streaming=True
679719
)
680720
if citation_section:
681721
# Convert escaped newlines to actual newlines for display
@@ -740,14 +780,36 @@ async def stream_processor_with_citations(
740780
# Suppress close-time errors (e.g., SSL shutdown timeouts)
741781
pass
742782

783+
def _extract_referenced_citations(self, content: str) -> Set[int]:
784+
"""
785+
Extract citation references (e.g., [doc1], [doc2]) from the content.
786+
787+
Args:
788+
content: The response content containing citation references
789+
790+
Returns:
791+
Set of citation indices that are referenced (e.g., {1, 2, 7, 8, 9})
792+
"""
793+
# Find all [docN] references in the content
794+
pattern = r"\[doc(\d+)\]"
795+
matches = re.findall(pattern, content)
796+
797+
# Convert to integers and return as a set
798+
return {int(match) for match in matches}
799+
743800
def _format_citation_section(
744-
self, citations: List[Dict[str, Any]], for_streaming: bool = False
801+
self,
802+
citations: List[Dict[str, Any]],
803+
content: str = "",
804+
for_streaming: bool = False,
745805
) -> str:
746806
"""
747807
Creates a formatted citation section using collapsible details elements.
808+
Only includes citations that are actually referenced in the content.
748809
749810
Args:
750811
citations: List of citation objects
812+
content: The response content (used to filter only referenced citations)
751813
for_streaming: If True, format for streaming (with escaping), else for regular response
752814
753815
Returns:
@@ -756,44 +818,74 @@ def _format_citation_section(
756818
if not citations:
757819
return ""
758820

759-
# Collect all citation details
821+
# Extract which citations are actually referenced in the content
822+
referenced_indices = self._extract_referenced_citations(content)
823+
824+
# If we couldn't find any references, include all citations (backward compatibility)
825+
if not referenced_indices:
826+
referenced_indices = set(range(1, len(citations) + 1))
827+
828+
# Collect only referenced citation details
760829
citation_entries = []
761830

762831
for i, citation in enumerate(citations, 1):
832+
# Skip citations that are not referenced in the content
833+
if i not in referenced_indices:
834+
continue
835+
763836
if not isinstance(citation, dict):
764837
continue
765838

766839
doc_ref = f"[doc{i}]"
767-
title = citation.get("title", "Unknown Document")
768-
content = citation.get("content", "")
769-
filepath = citation.get("filepath")
770-
url = citation.get("url")
771-
chunk_id = citation.get("chunk_id", "0")
840+
841+
# Get title with fallback to filepath or url
842+
title = citation.get("title", "")
843+
# Check if title is empty (not just None) and use alternatives
844+
if not title or not title.strip():
845+
# Try filepath first
846+
filepath = citation.get("filepath", "")
847+
if filepath and filepath.strip():
848+
title = filepath
849+
else:
850+
# Try url next
851+
url = citation.get("url", "")
852+
if url and url.strip():
853+
title = url
854+
else:
855+
# Final fallback
856+
title = "Unknown Document"
857+
858+
content_text = citation.get("content", "")
859+
filepath = citation.get("filepath", "")
860+
url = citation.get("url", "")
861+
chunk_id = citation.get("chunk_id", "")
772862

773863
# Build individual citation details
774864
citation_info = []
775865

776-
if filepath:
866+
# Show filepath if available and not empty
867+
if filepath and filepath.strip():
777868
citation_info.append(f"📁 **File:** `{filepath}`")
778-
elif url:
869+
# Show URL if available, not empty, and no filepath was shown
870+
elif url and url.strip():
779871
citation_info.append(f"🔗 **URL:** {url}")
780872

781-
citation_info.append(f"📄 **Chunk ID:** {chunk_id}")
873+
# Show chunk_id if available and not empty
874+
if chunk_id is not None and str(chunk_id).strip():
875+
citation_info.append(f"📄 **Chunk ID:** {chunk_id}")
782876

783877
# Add full content if available
784-
if content:
878+
if content_text and str(content_text).strip():
785879
try:
786880
# Clean content for display
787-
clean_content = str(content).strip()
788-
# Replace problematic characters for HTML display
789-
clean_content = clean_content.replace("\n", " ").replace("\r", " ")
881+
clean_content = str(content_text).strip()
790882
if for_streaming:
791883
# Additional escaping for streaming
792884
clean_content = clean_content.replace("\\", "\\\\").replace(
793885
'"', '\\"'
794886
)
795887

796-
citation_info.append(f"**Content:**")
888+
citation_info.append("**Content:**")
797889
citation_info.append(f"> {clean_content}")
798890
except Exception:
799891
citation_info.append("**Content:** [Content unavailable]")
@@ -809,6 +901,10 @@ def _format_citation_section(
809901

810902
citation_entries.append(citation_entry)
811903

904+
# Only create the section if we have citations to show
905+
if not citation_entries:
906+
return ""
907+
812908
# Combine all citations into main collapsible section
813909
if for_streaming:
814910
all_citations = "\\n\\n".join(citation_entries)

0 commit comments

Comments
 (0)