diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md
index ab8c7be951..550ab2f516 100644
--- a/demos/continuous_batching/README.md
+++ b/demos/continuous_batching/README.md
@@ -16,7 +16,7 @@ ovms_demos_continuous_batching_accuracy
 ```
 This demo shows how to deploy LLM models in the OpenVINO Model Server using continuous batching and paged attention algorithms.
-Text generation use case is exposed via OpenAI API `chat/completions` and `completions` endpoints.
+Text generation use case is exposed via OpenAI API `chat/completions`, `completions` and `responses` endpoints.
 That makes it easy to use and efficient especially on on Intel® Xeon® processors and ARC GPUs.
 
 > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, and Intel® Core Ultra Series on Ubuntu24 and Windows11.
@@ -72,7 +72,7 @@ curl http://localhost:8000/v3/models
 
 ## Request Generation
 
-Model exposes both `chat/completions` and `completions` endpoints with and without stream capabilities.
+Model exposes `chat/completions`, `completions` and `responses` endpoints, with and without stream capabilities.
 Chat endpoint is expected to be used for scenarios where conversation context should be pasted by the client and the model prompt is created by the server based on the jinja model template.
 Completion endpoint should be used to pass the prompt directly by the client and for models without the jinja template.
 Here is demonstrated model `Qwen/Qwen3-30B-A3B-Instruct-2507` in int4 precision. It has chat capability so `chat/completions` endpoint will be employed:
@@ -147,9 +147,76 @@ curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/
 :::
 
+### Unary calls via Responses API using cURL
+
+::::{tab-set}
+
+:::{tab-item} Linux
+```bash
+curl http://localhost:8000/v3/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "max_output_tokens": 30,
+    "input": "What is OpenVINO?"
+  }' | jq .
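+# Add "stream": true to the payload to receive the response as server-sent events instead.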
+```
+:::
+
+:::{tab-item} Windows
+Windows PowerShell
+```powershell
+(Invoke-WebRequest -Uri "http://localhost:8000/v3/responses" `
+  -Method POST `
+  -Headers @{ "Content-Type" = "application/json" } `
+  -Body '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "max_output_tokens": 30, "input": "What is OpenVINO?"}').Content
+```
+
+Windows Command Prompt
+```bat
+curl -s http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{\"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"max_output_tokens\": 30, \"input\": \"What is OpenVINO?\"}"
+```
+:::
+
+::::
+
+:::{dropdown} Expected Response
+```json
+{
+  "id": "resp-1724405400",
+  "object": "response",
+  "created_at": 1724405400,
+  "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+  "status": "completed",
+  "output": [
+    {
+      "id": "msg-0",
+      "type": "message",
+      "role": "assistant",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "text": "OpenVINO is an open-source software framework developed by Intel for optimizing and deploying computer vision, machine learning, and deep learning models on various devices,",
+          "annotations": []
+        }
+      ]
+    }
+  ],
+  "usage": {
+    "input_tokens": 27,
+    "input_tokens_details": { "cached_tokens": 0 },
+    "output_tokens": 30,
+    "output_tokens_details": { "reasoning_tokens": 0 },
+    "total_tokens": 57
+  }
+}
+```
+:::
+
 ### OpenAI Python package
 
-The endpoints `chat/completions` and `completions` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode:
+The endpoints `chat/completions`, `completions` and `responses` are compatible with the OpenAI client, so they can be easily used to generate text, also in streaming mode:
 
 Install the client library:
 ```console
@@ -261,6 +328,31 @@ So, **6 = 3**.
 ```
 :::
 
+:::{tab-item} Responses
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v3",
+    api_key="unused"
+)
+
+stream = client.responses.create(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    input="Say this is a test",
+    stream=True,
+)
+for event in stream:
+    if event.type == "response.output_text.delta":
+        print(event.delta, end="", flush=True)
+```
+
+Output:
+```
+It looks like you're testing me!
+``` +::: + :::: ## Check how to use AI agents with MCP servers and language models @@ -299,5 +391,6 @@ Check the [guide of using lm-evaluation-harness](./accuracy/README.md) - [Official OpenVINO LLM models in HuggingFace](https://huggingface.co/collections/OpenVINO/llm) - [Chat Completions API](../../docs/model_server_rest_api_chat.md) - [Completions API](../../docs/model_server_rest_api_completions.md) +- [Responses API](../../docs/model_server_rest_api_responses.md) - [Writing client code](../../docs/clients_genai.md) - [LLM calculator reference](../../docs/llm/reference.md) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 48af0f5eac..17bbeddfce 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,8 +1,16 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index bb625d2..7204adb 100644 +index bb625d2..64c01de 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -2153,6 +2153,30 @@ third_party_inference_model_map = { +@@ -24,6 +24,7 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( + OpenAICompletionsHandler, + ) + from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler ++from bfcl_eval.model_handler.api_inference.ovms_response_stream import OVMSResponsesStreamHandler + from bfcl_eval.model_handler.api_inference.qwen import ( + QwenAgentNoThinkHandler, + QwenAgentThinkHandler, +@@ -2153,6 +2154,54 @@ third_party_inference_model_map = { is_fc_model=True, underscore_to_dot=True, ), @@ -29,6 +37,30 @@ index bb625d2..7204adb 100644 + output_price=None, + is_fc_model=True, + underscore_to_dot=True, ++ ), ++ "ovms-model-responses": ModelConfig( ++ model_name="ovms-model-responses", ++ display_name="ovms-model-responses", ++ url="http://localhost:8000/v3", ++ org="ovms", ++ license="apache-2.0", ++ model_handler=OpenAIResponsesHandler, ++ input_price=None, ++ output_price=None, ++ is_fc_model=True, ++ underscore_to_dot=True, ++ ), ++ "ovms-model-stream-responses": ModelConfig( ++ model_name="ovms-model-stream-responses", ++ display_name="ovms-model-stream-responses", ++ url="http://localhost:8000/v3", ++ org="ovms", ++ license="apache-2.0", ++ model_handler=OVMSResponsesStreamHandler, ++ input_price=None, ++ output_price=None, ++ is_fc_model=True, ++ underscore_to_dot=True, + ), } @@ -60,6 +92,229 @@ index 357584f..e45e12c 100644 "store": False, } +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py +index 0953fdd..7f6919f 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py +@@ -38,10 +38,10 @@ class OpenAIResponsesHandler(BaseHandler): + + kwargs = {} + +- if api_key := os.getenv("OPENAI_API_KEY"): ++ if api_key := os.getenv("OPENAI_API_KEY","unused"): + kwargs["api_key"] = api_key + +- if base_url := os.getenv("OPENAI_BASE_URL"): ++ if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"): + kwargs["base_url"] = base_url + + if headers_env := 
os.getenv("OPENAI_DEFAULT_HEADERS"): +@@ -99,25 +99,12 @@ class OpenAIResponsesHandler(BaseHandler): + kwargs = { + "input": message, + "model": self.model_name, +- "store": False, +- "include": ["reasoning.encrypted_content"], +- "reasoning": {"summary": "auto"}, + "temperature": self.temperature, ++ "max_output_tokens": 2048, ++ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, + } + +- # OpenAI reasoning models don't support temperature parameter +- if ( +- "o3" in self.model_name +- or "o4-mini" in self.model_name +- or "gpt-5" in self.model_name +- ): +- del kwargs["temperature"] +- +- # Non-reasoning models don't support reasoning parameter +- else: +- del kwargs["reasoning"] +- del kwargs["include"] +- + if len(tools) > 0: + kwargs["tools"] = tools + +@@ -218,25 +205,10 @@ class OpenAIResponsesHandler(BaseHandler): + kwargs = { + "input": inference_data["message"], + "model": self.model_name, +- "store": False, +- "include": ["reasoning.encrypted_content"], +- "reasoning": {"summary": "auto"}, + "temperature": self.temperature, ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, + } + +- # OpenAI reasoning models don't support temperature parameter +- if ( +- "o3" in self.model_name +- or "o4-mini" in self.model_name +- or "gpt-5" in self.model_name +- ): +- del kwargs["temperature"] +- +- # Non-reasoning models don't support reasoning parameter +- else: +- del kwargs["reasoning"] +- del kwargs["include"] +- + return self.generate_with_backoff(**kwargs) + + def _pre_query_processing_prompting(self, test_entry: dict) -> dict: +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py +new file mode 100644 +index 0000000..bc5ef1e +--- /dev/null ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py +@@ -0,0 +1,144 @@ ++import json ++import os ++ ++from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler ++from overrides import override ++ ++ ++class OVMSResponsesStreamHandler(OpenAIResponsesHandler): ++ """Streaming variant of OpenAIResponsesHandler. ++ ++ Inherits all behavior from the (patched) OpenAIResponsesHandler and only overrides ++ _query_FC, _query_prompting (to add stream=True), and the two ++ _parse_query_response methods (to aggregate SSE events). 
++ """ ++ ++ @staticmethod ++ ++ @staticmethod ++ def _parse_stream(stream) -> dict: ++ """Parse responses API SSE stream and return aggregated results.""" ++ text_content = "" ++ reasoning_content = "" ++ tool_calls = {} # keyed by item_id ++ usage = {"input_tokens": 0, "output_tokens": 0} ++ output_items = [] # final output items from response.completed ++ ++ for event in stream: ++ event_type = event.type ++ ++ if event_type == "response.output_text.delta": ++ text_content += event.delta or "" ++ ++ elif event_type == "response.reasoning.delta": ++ reasoning_content += event.delta or "" ++ ++ elif event_type == "response.function_call_arguments.delta": ++ item_id = event.item_id ++ if item_id not in tool_calls: ++ tool_calls[item_id] = {"call_id": "", "name": "", "arguments": ""} ++ tool_calls[item_id]["arguments"] += event.delta or "" ++ ++ elif event_type == "response.output_item.added": ++ item = event.item ++ if hasattr(item, "type") and item.type == "function_call": ++ item_id = item.id ++ tool_calls[item_id] = { ++ "call_id": getattr(item, "call_id", "") or "", ++ "name": getattr(item, "name", "") or "", ++ "arguments": "", ++ } ++ ++ elif event_type in ("response.completed", "response.incomplete"): ++ resp = event.response ++ if hasattr(resp, "usage") and resp.usage: ++ usage["input_tokens"] = resp.usage.input_tokens ++ usage["output_tokens"] = resp.usage.output_tokens ++ if hasattr(resp, "output"): ++ output_items = resp.output ++ ++ return { ++ "text": text_content, ++ "reasoning": reasoning_content, ++ "tool_calls": tool_calls, ++ "usage": usage, ++ "output_items": output_items, ++ } ++ ++ #### FC methods #### ++ ++ @override ++ def _query_FC(self, inference_data: dict): ++ message: list[dict] = inference_data["message"] ++ tools = inference_data["tools"] ++ ++ inference_data["inference_input_log"] = { ++ "message": repr(message), ++ "tools": tools, ++ } ++ ++ kwargs = { ++ "input": message, ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "max_output_tokens": 2048, ++ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ "stream": True, ++ } ++ ++ if len(tools) > 0: ++ kwargs["tools"] = tools ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ @override ++ def _parse_query_response_FC(self, api_response) -> dict: ++ parsed = self._parse_stream(api_response) ++ ++ model_responses = [] ++ tool_call_ids = [] ++ ++ for item_id, tc in parsed["tool_calls"].items(): ++ model_responses.append({tc["name"]: tc["arguments"]}) ++ tool_call_ids.append(tc["call_id"]) ++ ++ if not model_responses: ++ model_responses = parsed["text"] ++ ++ return { ++ "model_responses": model_responses, ++ "model_responses_message_for_chat_history": parsed["output_items"], ++ "tool_call_ids": tool_call_ids, ++ "reasoning_content": parsed["reasoning"], ++ "input_token": parsed["usage"]["input_tokens"], ++ "output_token": parsed["usage"]["output_tokens"], ++ } ++ ++ #### Prompting methods #### ++ ++ @override ++ def _query_prompting(self, inference_data: dict): ++ inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} ++ ++ kwargs = { ++ "input": inference_data["message"], ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ "stream": True, ++ } ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ @override ++ def _parse_query_response_prompting(self, 
++        parsed = self._parse_stream(api_response)
++
++        return {
++            "model_responses": parsed["text"],
++            "model_responses_message_for_chat_history": parsed["output_items"],
++            "reasoning_content": parsed["reasoning"],
++            "input_token": parsed["usage"]["input_tokens"],
++            "output_token": parsed["usage"]["output_tokens"],
++        }
 diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
 index 10f1a08..50890c7 100644
 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py
diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md
index 0de93e27b8..0436bb4dd7 100644
--- a/demos/continuous_batching/vlm/README.md
+++ b/demos/continuous_batching/vlm/README.md
@@ -9,7 +9,7 @@ ovms_demos_vlm_npu
 ```
 This demo shows how to deploy Vision Language Models in the OpenVINO Model Server.
-Text generation use case is exposed via OpenAI API `chat/completions` endpoint.
+Text generation use case is exposed via OpenAI API `chat/completions` and `responses` endpoints.
 
 > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, Intel® Arc™ GPU Series and Intel® Core Ultra Series on Ubuntu24, RedHat9 and Windows11.
@@ -119,6 +119,65 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js
 ```
 :::
 
+:::{dropdown} **Unary call with cURL using Responses API**
+**Note**: Using URLs in requests requires the `--allowed_media_domains` parameter described [here](../../../docs/parameters.md)
+
+```bash
+curl http://localhost:8000/v3/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "OpenGVLab/InternVL2-2B",
+    "input": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "input_text",
+            "text": "Describe what is on the picture."
+          },
+          {
+            "type": "input_image",
+            "image_url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/common/static/images/zebra.jpeg"
+          }
+        ]
+      }
+    ],
+    "max_output_tokens": 100
+  }'
+```
+```json
+{
+  "id": "resp-1741731554",
+  "object": "response",
+  "created_at": 1741731554,
+  "model": "OpenGVLab/InternVL2-2B",
+  "status": "completed",
+  "output": [
+    {
+      "id": "msg-0",
+      "type": "message",
+      "role": "assistant",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "text": "The picture features a zebra standing in a grassy plain. Zebras are known for their distinctive black and white striped patterns, which help them blend in for camouflage purposes.",
+          "annotations": []
+        }
+      ]
+    }
+  ],
+  "usage": {
+    "input_tokens": 19,
+    "input_tokens_details": { "cached_tokens": 0 },
+    "output_tokens": 83,
+    "output_tokens_details": { "reasoning_tokens": 0 },
+    "total_tokens": 102
+  }
+}
+```
+:::
+
 :::{dropdown} **Unary call with python requests library**
 
 ```console
@@ -177,9 +236,9 @@ print(response.text)
 }
 ```
 :::
-:::{dropdown} **Streaming request with OpenAI client**
+:::{dropdown} **Streaming request with OpenAI client using chat/completions**
 
-The endpoints `chat/completions` is compatible with OpenAI client so it can be easily used to generate code also in streaming mode:
+The endpoints `chat/completions` and `responses` are compatible with the OpenAI client, so they can be easily used to generate text, also in streaming mode:
 
 Install the client library:
 ```console
@@ -223,6 +282,48 @@ The picture features a zebra standing in a grassy area. The zebra is characteriz
 
 :::
 
+:::{dropdown} **Streaming request with OpenAI client via Responses API**
+
+```console
+pip3 install openai
+```
+```python
+from openai import OpenAI
+import base64
+base_url='http://localhost:8000/v3'
+model_name = "OpenGVLab/InternVL2-2B"
+
+client = OpenAI(api_key='unused', base_url=base_url)
+
+def convert_image(Image):
+    with open(Image, 'rb') as file:
+        base64_image = base64.b64encode(file.read()).decode("utf-8")
+    return base64_image
+
+stream = client.responses.create(
+    model=model_name,
+    input=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "input_text", "text": "Describe what is on the picture."},
+                {"type": "input_image", "image_url": f"data:image/jpeg;base64,{convert_image('zebra.jpeg')}"}
+            ]
+        }
+    ],
+    stream=True,
+)
+for event in stream:
+    if event.type == "response.output_text.delta":
+        print(event.delta, end="", flush=True)
+```
+
+Output:
+```
+The picture features a zebra standing in a grassy area. The zebra is characterized by its distinctive black and white striped pattern, which covers its entire body, including its legs, neck, and head. Zebras have small, rounded ears and a long, flowing tail. The background appears to be a natural grassy habitat, typical of a savanna or plain.
+```
+
+:::
 
 ## Testing the model accuracy over serving API
 
@@ -237,5 +338,6 @@ Check [VLM usage with NPU acceleration](../../vlm_npu/README.md)
 - [Export models to OpenVINO format](../common/export_models/README.md)
 - [Supported VLM models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms)
 - [Chat Completions API](../../../docs/model_server_rest_api_chat.md)
+- [Responses API](../../../docs/model_server_rest_api_responses.md)
 - [Writing client code](../../../docs/clients_genai.md)
 - [LLM calculator reference](../../../docs/llm/reference.md)
diff --git a/docs/llm/reference.md b/docs/llm/reference.md
index 654c9b6d90..222777e4a7 100644
--- a/docs/llm/reference.md
+++ b/docs/llm/reference.md
@@ -44,7 +44,7 @@ struct HttpPayload {
     std::shared_ptr client;
 };
 ```
-The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md) or [completions](../model_server_rest_api_completions.md) API.
+The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md), [completions](../model_server_rest_api_completions.md) or [responses](../model_server_rest_api_responses.md) API.
 
 The input also includes a side packet with a reference to `LLM_NODE_RESOURCES` which is a shared object representing an LLM engine. It loads the model, runs the generation cycles and reports the generated results to the LLM calculator via a generation handler.
diff --git a/src/drogon_http_server.cpp b/src/drogon_http_server.cpp
index 1c14d5d57b..210776ac8b 100644
--- a/src/drogon_http_server.cpp
+++ b/src/drogon_http_server.cpp
@@ -88,7 +88,8 @@ Status DrogonHttpServer::startAcceptingRequests() {
     drogon::app().disableSigtermHandling();
     drogon::app().setDefaultHandler([this](const drogon::HttpRequestPtr& req, std::function<void(const drogon::HttpResponsePtr&)>&& drogonResponseInitializeCallback) {
-        bool isTextGeneration = req->path().find("/completions") != std::string::npos;
+        bool isTextGeneration = req->path().find("/completions") != std::string::npos ||
+                                req->path().find("/responses") != std::string::npos;
         // Here we need to schedule the request to the separate thread pool
         // in order to use disconnection callback of drogon.
diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp
index 33a81cb429..c295edab87 100644
--- a/src/http_rest_api_handler.cpp
+++ b/src/http_rest_api_handler.cpp
@@ -531,7 +531,7 @@ static Status createV3HttpPayload(
         return Status(StatusCode::JSON_INVALID, "model field is not a string");
     }
 
-    bool isTextGenerationEndpoint = uri.find("completions") != std::string_view::npos;
+    bool isTextGenerationEndpoint = (uri.find("completions") != std::string_view::npos) || (uri.find("responses") != std::string_view::npos);
     if (isTextGenerationEndpoint) {
         auto streamIt = parsedJson->FindMember("stream");
         if (streamIt != parsedJson->MemberEnd()) {
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 6898b51604..c8f2b18e09 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -45,6 +45,8 @@ using namespace rapidjson;
 namespace ovms {
 
 constexpr size_t DEFAULT_MAX_STOP_WORDS = 16;  // same as deep-seek
+constexpr std::string_view BASE64_PREFIX = "base64,";
+constexpr int64_t MAX_IMAGE_SIZE_BYTES = 20000000;  // 20MB
 
 namespace {
@@ -97,6 +99,372 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v
 }  // namespace
 
+void OpenAIChatCompletionsHandler::serializeResponsesToolChoice(Writer<StringBuffer>& writer) const {
+    writer.String("tool_choice");
+    if (request.toolChoice.empty()) {
+        writer.String("auto");
+    } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") {
+        writer.String(request.toolChoice.c_str());
+    } else {
+        writer.StartObject();
+        writer.String("type");
+        writer.String("function");
+        writer.String("name");
+        writer.String(request.toolChoice.c_str());
+        writer.EndObject();
+    }
+}
+
+void OpenAIChatCompletionsHandler::serializeResponsesTools(Writer<StringBuffer>& writer) const {
+    writer.String("tools");
+    writer.StartArray();
+    for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
+        writer.StartObject();
+        writer.String("type");
+        writer.String("function");
+        writer.String("name");
+        writer.String(toolName.c_str());
+        writer.String("parameters");
+        writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType);
+        writer.EndObject();
+    }
+    writer.EndArray();
+}
+
+void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer<StringBuffer>& writer, const std::string& responseId, int64_t createdAt,
+    const std::string& status, const std::string& fullOutputText, bool includeUsage,
+    const std::optional<std::string>& incompleteReason, const std::optional<std::string>& errorMessage, ResponsesErrorCode errorCode) const {
+    writer.StartObject();
+    writer.String("id");
+    writer.String(responseId.c_str());
+    writer.String("object");
+    writer.String("response");
+    writer.String("created_at");
+    writer.Int64(createdAt);
+    if (status == "completed") {
+        const auto completedAt = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+        writer.String("completed_at");
+        writer.Int64(completedAt);
+    }
+    if (incompleteReason.has_value()) {
+        writer.String("incomplete_details");
+        writer.StartObject();
+        writer.String("reason");
+        writer.String(incompleteReason.value().c_str());
+        writer.EndObject();
+    }
+    writer.String("error");
+    if (errorMessage.has_value()) {
+        writer.StartObject();
+        writer.String("code");
+        writer.String(responsesErrorCodeToString(errorCode));
+        writer.String("message");
+        writer.String(errorMessage.value().c_str());
+        writer.EndObject();
+    } else {
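+        // No error occurred; the Responses API represents this as an explicit JSON null.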
+        writer.Null();
+    }
+    writer.String("model");
+    writer.String(request.model.c_str());
+    writer.String("status");
+    writer.String(status.c_str());
+
+    writer.String("parallel_tool_calls");
+    writer.Bool(false);
+    // TODO: previous_response_id not supported
+    writer.String("store");
+    writer.Bool(true);
+    // TODO: temperature/top_p are only included when explicitly provided in the request
+    if (request.temperature.has_value()) {
+        writer.String("temperature");
+        writer.Double(static_cast<double>(request.temperature.value()));
+    }
+    writer.String("text");
+    writer.StartObject();
+    writer.String("format");
+    writer.StartObject();
+    writer.String("type");
+    writer.String("text");
+    writer.EndObject();
+    writer.EndObject();
+    serializeResponsesToolChoice(writer);
+    serializeResponsesTools(writer);
+    if (request.topP.has_value()) {
+        writer.String("top_p");
+        writer.Double(static_cast<double>(request.topP.value()));
+    }
+    writer.String("truncation");
+    writer.String("disabled");
+    // TODO: user not supported
+    writer.String("metadata");
+    writer.StartObject();
+    writer.EndObject();
+
+    if (request.maxTokens.has_value()) {
+        writer.String("max_output_tokens");
+        writer.Uint64(static_cast<uint64_t>(request.maxTokens.value()));
+    }
+
+    writer.String("output");
+    writer.StartArray();
+    // Include reasoning output item if reasoning was produced during streaming
+    if (!responsesState.reasoningText.empty()) {
+        writer.StartObject();
+        writer.String("id");
+        writer.String("rs-0");
+        writer.String("type");
+        writer.String("reasoning");
+        writer.String("summary");
+        writer.StartArray();
+        writer.StartObject();
+        writer.String("type");
+        writer.String("summary_text");
+        writer.String("text");
+        writer.String(responsesState.reasoningText.c_str());
+        writer.EndObject();
+        writer.EndArray();
+        writer.EndObject();
+    }
+    // Include function_call output items if tool calls were produced during streaming
+    for (const auto& toolCall : responsesState.toolCalls) {
+        writer.StartObject();
+        writer.String("id");
+        writer.String(toolCall.id.c_str());
+        writer.String("type");
+        writer.String("function_call");
+        writer.String("status");
+        writer.String(status.c_str());
+        writer.String("call_id");
+        writer.String(toolCall.id.c_str());
+        writer.String("name");
+        writer.String(toolCall.name.c_str());
+        writer.String("arguments");
+        writer.String(toolCall.arguments.c_str());
+        writer.EndObject();
+    }
+    if (!fullOutputText.empty() || responsesState.toolCalls.empty()) {
+        writer.StartObject();
+        writer.String("id");
+        writer.String("msg-0");
+        writer.String("type");
+        writer.String("message");
+        writer.String("role");
+        writer.String("assistant");
+        writer.String("status");
+        writer.String(status.c_str());
+        writer.String("content");
+        writer.StartArray();
+        serializeOutputTextPart(writer, fullOutputText);
+        writer.EndArray();
+        writer.EndObject();
+    }
+    writer.EndArray();
+
+    if (includeUsage) {
+        writer.String("usage");
+        writer.StartObject();
+        writer.String("input_tokens");
+        writer.Uint64(static_cast<uint64_t>(usage.promptTokens));
+        // TODO: input_tokens_details.cached_tokens not supported
+        writer.String("output_tokens");
+        writer.Uint64(static_cast<uint64_t>(usage.completionTokens));
+        // TODO: output_tokens_details.reasoning_tokens not supported
+        writer.String("total_tokens");
+        writer.Uint64(static_cast<uint64_t>(usage.calculateTotalTokens()));
+        writer.EndObject();
+    }
+
+    writer.EndObject();
+}
+
+void OpenAIChatCompletionsHandler::serializeResponsesOutputItem(Writer<StringBuffer>& writer, const std::string& outputItemId,
+    const std::string& text, const std::string& status) {
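+    // Emits a single Responses API "message" output item; the content array is populated only after the item leaves the "in_progress" state.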
+    writer.StartObject();
+    writer.String("id");
+    writer.String(outputItemId.c_str());
+    writer.String("type");
+    writer.String("message");
+    writer.String("role");
+    writer.String("assistant");
+    writer.String("status");
+    writer.String(status.c_str());
+    writer.String("content");
+    writer.StartArray();
+    if (status != "in_progress") {
+        serializeOutputTextPart(writer, text);
+    }
+    writer.EndArray();
+    writer.EndObject();
+}
+
+void OpenAIChatCompletionsHandler::serializeOutputTextPart(Writer<StringBuffer>& writer, const std::string& text) {
+    writer.StartObject();
+    writer.String("type");
+    writer.String("output_text");
+    writer.String("text");
+    writer.String(text.c_str());
+    writer.String("annotations");
+    writer.StartArray();
+    writer.EndArray();
+    writer.EndObject();
+}
+
+std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector<ParsedOutput>& parsedOutputs,
+    ov::genai::GenerationFinishReason finishReason) const {
+    const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH);
+    const std::string responseStatus = isIncomplete ? "incomplete" : "completed";
+    const auto createdAt = std::chrono::duration_cast<std::chrono::seconds>(created.time_since_epoch()).count();
+    const auto completedAt = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    const std::string responseId = "resp-" + std::to_string(createdAt);
+
+    StringBuffer buffer;
+    Writer<StringBuffer> writer(buffer);
+
+    writer.StartObject();
+    writer.String("id");
+    writer.String(responseId.c_str());
+    writer.String("object");
+    writer.String("response");
+    writer.String("created_at");
+    writer.Int64(createdAt);
+    if (!isIncomplete) {
+        writer.String("completed_at");
+        writer.Int64(completedAt);
+    }
+    if (isIncomplete) {
+        writer.String("incomplete_details");
+        writer.StartObject();
+        writer.String("reason");
+        writer.String("max_tokens");
+        writer.EndObject();
+    }
+    // TODO: error not supported in unary response
+    writer.String("model");
+    writer.String(request.model.c_str());
+    writer.String("status");
+    writer.String(responseStatus.c_str());
+
+    writer.String("parallel_tool_calls");
+    writer.Bool(false);
+    // TODO: previous_response_id not supported
+    writer.String("store");
+    writer.Bool(true);
+    // TODO: temperature/top_p are only included when explicitly provided in the request
+    if (request.temperature.has_value()) {
+        writer.String("temperature");
+        writer.Double(static_cast<double>(request.temperature.value()));
+    }
+    writer.String("text");
+    writer.StartObject();
+    writer.String("format");
+    writer.StartObject();
+    writer.String("type");
+    writer.String("text");
+    writer.EndObject();
+    writer.EndObject();
+    serializeResponsesToolChoice(writer);
+    serializeResponsesTools(writer);
+    if (request.topP.has_value()) {
+        writer.String("top_p");
+        writer.Double(static_cast<double>(request.topP.value()));
+    }
+    writer.String("truncation");
+    writer.String("disabled");
+    // TODO: user not supported
+    writer.String("metadata");
+    writer.StartObject();
+    writer.EndObject();
+
+    if (request.maxTokens.has_value()) {
+        writer.String("max_output_tokens");
+        writer.Uint64(static_cast<uint64_t>(request.maxTokens.value()));
+    }
+
+    writer.String("output");
+    writer.StartArray();
+    int outputIndex = 0;
+    for (const auto& parsedOutput : parsedOutputs) {
+        // Emit reasoning output item if reasoning is available
+        if (!parsedOutput.reasoning.empty()) {
+            const std::string reasoningId = "rs-" + std::to_string(outputIndex);
+            writer.StartObject();
+            writer.String("id");
+            writer.String(reasoningId.c_str());
+            writer.String("type");
writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(parsedOutput.reasoning.c_str()); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + } + + if (!parsedOutput.toolCalls.empty()) { + // Emit function_call output items for each tool call + for (const auto& toolCall : parsedOutput.toolCalls) { + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String(responseStatus.c_str()); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + } + } + + // Emit message output item if there is non-empty content or no tool calls + if (!parsedOutput.content.empty() || parsedOutput.toolCalls.empty()) { + const std::string outputId = "msg-" + std::to_string(outputIndex); + + writer.StartObject(); + writer.String("id"); + writer.String(outputId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(responseStatus.c_str()); + writer.String("content"); + writer.StartArray(); + serializeOutputTextPart(writer, parsedOutput.content); + writer.EndArray(); + writer.EndObject(); + } + + outputIndex++; + } + writer.EndArray(); + + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + // TODO: input_tokens_details.cached_tokens not supported + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + // TODO: output_tokens_details.reasoning_tokens not supported + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + + writer.EndObject(); + + return buffer.GetString(); +} + absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { // prompt: string auto it = doc.FindMember("prompt"); @@ -247,6 +615,169 @@ absl::Status OpenAIChatCompletionsHandler::ensureArgumentsInToolCalls(Value& mes return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesInput(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + auto inputIt = doc.FindMember("input"); + if (inputIt == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + if (inputIt->value.IsString()) { + request.prompt = inputIt->value.GetString(); + if (request.prompt.value().empty()) { + return absl::InvalidArgumentError("input cannot be empty"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = "user"; + request.chatHistory.last()["content"] = request.prompt.value(); + } else if (inputIt->value.IsArray()) { + if (inputIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Messages array cannot be empty"); + } + + for (size_t i = 0; i < inputIt->value.GetArray().Size(); ++i) { + auto& item = inputIt->value.GetArray()[i]; + if (!item.IsObject()) { + return absl::InvalidArgumentError("input array items must be objects"); + } + + auto itemObj = item.GetObject(); + auto roleIt = itemObj.FindMember("role"); + if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) { + return 
absl::InvalidArgumentError("input item role is missing or invalid"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = roleIt->value.GetString(); + + auto contentIt = itemObj.FindMember("content"); + if (contentIt == itemObj.MemberEnd()) { + return absl::InvalidArgumentError("input item content is missing"); + } + + if (contentIt->value.IsString()) { + request.chatHistory.last()["content"] = contentIt->value.GetString(); + continue; + } + + if (!contentIt->value.IsArray()) { + return absl::InvalidArgumentError("input item content must be a string or array"); + } + if (contentIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Invalid message structure - content array is empty"); + } + + std::string contentText; + for (auto& contentItem : contentIt->value.GetArray()) { + if (!contentItem.IsObject()) { + return absl::InvalidArgumentError("input content items must be objects"); + } + auto contentObj = contentItem.GetObject(); + auto typeIt = contentObj.FindMember("type"); + if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("input content item type is missing or invalid"); + } + + const std::string type = typeIt->value.GetString(); + if (type == "input_text") { + auto textIt = contentObj.FindMember("text"); + if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { + return absl::InvalidArgumentError("input_text requires a valid text field"); + } + contentText = textIt->value.GetString(); + } else if (type == "input_image") { + std::string imageUrl; + auto imageUrlIt = contentObj.FindMember("image_url"); + if (imageUrlIt == contentObj.MemberEnd()) { + return absl::InvalidArgumentError("input_image requires image_url field"); + } + if (imageUrlIt->value.IsString()) { + imageUrl = imageUrlIt->value.GetString(); + } else if (imageUrlIt->value.IsObject()) { + auto imageUrlObj = imageUrlIt->value.GetObject(); + auto urlIt = imageUrlObj.FindMember("url"); + if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) { + return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid"); + } + imageUrl = urlIt->value.GetString(); + } else { + return absl::InvalidArgumentError("input_image.image_url must be a string or object"); + } + + std::size_t pos = imageUrl.find(BASE64_PREFIX); + std::string decoded; + ov::Tensor tensor; + if (pos != std::string::npos) { + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from base64 string"); + size_t offset = pos + BASE64_PREFIX.length(); + if (!absl::Base64Unescape(std::string_view(imageUrl.data() + offset, imageUrl.size() - offset), &decoded)) { + return absl::InvalidArgumentError("Invalid base64 string in request"); + } + try { + tensor = loadImageStbiFromMemory(decoded); + } catch (std::runtime_error& e) { + std::stringstream ss; + ss << "Image parsing failed: " << e.what(); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + return absl::InvalidArgumentError(ss.str()); + } + } else if (std::regex_match(imageUrl.c_str(), std::regex("^(http|https|ftp|sftp|)://(.*)"))) { + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image using curl"); + if (!allowedMediaDomains.has_value() || !isDomainAllowed(allowedMediaDomains.value(), imageUrl.c_str())) { + return absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"); + } + auto status = downloadImage(imageUrl.c_str(), decoded, MAX_IMAGE_SIZE_BYTES); + if (status != absl::OkStatus()) { + return status; + } + try 
+                            tensor = loadImageStbiFromMemory(decoded);
+                        } catch (std::runtime_error& e) {
+                            std::stringstream ss;
+                            ss << "Image parsing failed: " << e.what();
+                            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+                            return absl::InvalidArgumentError("Image parsing failed");
+                        }
+                    } else {
+                        if (!allowedLocalMediaPath.has_value()) {
+                            return absl::InvalidArgumentError("Loading images from local filesystem is disabled.");
+                        }
+                        if (FileSystem::isPathEscaped(imageUrl)) {
+                            std::stringstream ss;
+                            ss << "Path " << imageUrl.c_str() << " escape with .. is forbidden.";
+                            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+                            return absl::InvalidArgumentError(ss.str());
+                        }
+                        SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from local filesystem");
+                        const auto firstMissmatch = std::mismatch(imageUrl.begin(), imageUrl.end(), allowedLocalMediaPath.value().begin(), allowedLocalMediaPath.value().end());
+                        if (firstMissmatch.second != allowedLocalMediaPath.value().end()) {
+                            return absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path");
+                        }
+                        try {
+                            tensor = loadImageStbiFromFile(imageUrl.c_str());
+                        } catch (std::runtime_error& e) {
+                            std::stringstream ss;
+                            ss << "Image file " << imageUrl.c_str() << " parsing failed: " << e.what();
+                            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+                            return absl::InvalidArgumentError(ss.str());
+                        }
+                    }
+                    request.imageHistory.push_back({i, tensor});
+                } else {
+                    return absl::InvalidArgumentError("Unsupported content type. Supported types are input_text and input_image.");
+                }
+            }
+
+            request.chatHistory.last()["content"] = contentText;
+        }
+    } else {
+        return absl::InvalidArgumentError("input is not a string or array");
+    }
+
+    SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed responses input directly to chat history without mutating request JSON");
+    return absl::OkStatus();
+}
+
 absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains) {
     auto it = doc.FindMember("messages");
     if (it == doc.MemberEnd())
@@ -304,13 +835,12 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional
         if (tool_choice_it->value.IsObject()) {
-            auto tool_choice_functionIt = tool_choice_it->value.GetObject().FindMember("function");
-            if (tool_choice_functionIt != tool_choice_it->value.GetObject().MemberEnd() && tool_choice_functionIt->value.IsObject()) {
+            auto toolChoiceObj = tool_choice_it->value.GetObject();
+            auto tool_choice_functionIt = toolChoiceObj.FindMember("function");
+            if (tool_choice_functionIt != toolChoiceObj.MemberEnd() && tool_choice_functionIt->value.IsObject()) {
                 auto nameIt = tool_choice_functionIt->value.GetObject().FindMember("name");
                 if (nameIt != tool_choice_functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) {
                     tool_choice = nameIt->value.GetString();
@@ -424,7 +954,16 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
                     return absl::InvalidArgumentError("tool_choice.function.name is not a valid string");
                 }
             } else {
-                return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object");
+                auto typeIt = toolChoiceObj.FindMember("type");
+                auto nameIt = toolChoiceObj.FindMember("name");
+                if (typeIt != toolChoiceObj.MemberEnd() && typeIt->value.IsString() && std::string(typeIt->value.GetString()) == "function") {
+                    if (nameIt == toolChoiceObj.MemberEnd() || !nameIt->value.IsString()) {
+                        return absl::InvalidArgumentError("tool_choice.name is not a valid string");
+                    }
+                    tool_choice = nameIt->value.GetString();
+                } else {
+                    return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object");
+                }
             }
         } else {
             return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string");
@@ -444,38 +983,67 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
         auto& obj = it->value.GetArray()[i];
         if (!obj.IsObject())
             return absl::InvalidArgumentError("Tool is not a JSON object");
+        rapidjson::Value* parametersValue = nullptr;
+        std::string functionName;
+
         auto functionIt = obj.FindMember("function");
-        if (functionIt != obj.MemberEnd() && functionIt->value.IsObject()) {
-            auto nameIt = functionIt->value.GetObject().FindMember("name");
-            if (nameIt != functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) {
-                std::string functionName = nameIt->value.GetString();
-                // If tool_choice is set to "auto", we keep all tools
-                // If tool_choice is set to a specific function name, we keep only that tool
-                if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) {
-                    it->value.Erase(&obj);
-                    jsonChanged = true;
-                } else {
-                    i++;
-                    // If we keep the tool, add tool name and schema to the request
-                    auto parametersIt = functionIt->value.GetObject().FindMember("parameters");
-                    if (parametersIt != functionIt->value.GetObject().MemberEnd() && parametersIt->value.IsObject()) {
-                        // now we want to insert to a mapping of
-                        // tool name -> tool schema representations struct
-                        // Dump parameters object to string since this is the schema format expected by GenAI
-                        // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers
-                        rapidjson::StringBuffer buffer;
-                        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
-                        parametersIt->value.Accept(writer);
-                        std::string parametersStr = buffer.GetString();
-                        ToolSchemaWrapper schemaReprs{&parametersIt->value, std::move(parametersStr)};
-                        request.toolNameSchemaMap[nameIt->value.GetString()] = std::move(schemaReprs);
-                    }
-                }
-            } else {
+        if (functionIt != obj.MemberEnd()) {
+            if (!functionIt->value.IsObject()) {
+                return absl::InvalidArgumentError("Function is not a valid JSON object");
+            }
+            auto& functionObj = functionIt->value;
+            auto nameIt = functionObj.GetObject().FindMember("name");
+            if (nameIt == functionObj.GetObject().MemberEnd() || !nameIt->value.IsString()) {
                 return absl::InvalidArgumentError("Function object does not contain a valid name field");
             }
-        } else {
-            return absl::InvalidArgumentError("Function is not a valid JSON object");
+            functionName = nameIt->value.GetString();
+            auto parametersIt = functionObj.GetObject().FindMember("parameters");
+            if (parametersIt != functionObj.GetObject().MemberEnd()) {
+                parametersValue = &parametersIt->value;
+            }
+        } else {
+            auto typeIt = obj.FindMember("type");
+            if (typeIt == obj.MemberEnd() || !typeIt->value.IsString()) {
+                return absl::InvalidArgumentError("Tool type is missing or invalid");
+            }
+            if (std::string(typeIt->value.GetString()) != "function") {
+                return absl::InvalidArgumentError("Only function tools are supported");
+            }
+
+            auto nameIt = obj.FindMember("name");
+            if (nameIt == obj.MemberEnd() || !nameIt->value.IsString()) {
+                return absl::InvalidArgumentError("Function object does not contain a valid name field");
+            }
+            functionName = nameIt->value.GetString();
+
+            auto parametersIt = obj.FindMember("parameters");
+            if (parametersIt != obj.MemberEnd()) {
+                parametersValue = &parametersIt->value;
+            }
+        }
+
+        // If tool_choice is set to "auto", we keep all tools
+        // If tool_choice is set to a specific function name, we keep only that tool
tool_choice != "required" && tool_choice != functionName) { + it->value.Erase(&obj); + jsonChanged = true; + continue; + } + + i++; + // If we keep the tool, add tool name and schema to the request + if (parametersValue != nullptr) { + if (!parametersValue->IsObject()) { + return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); + } + // Dump parameters object to string since this is the schema format expected by GenAI + // Keep the rapidjson::Value pointer as well to avoid re-parsing in outputParsers + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + parametersValue->Accept(writer); + std::string parametersStr = buffer.GetString(); + ToolSchemaWrapper schemaReprs{parametersValue, std::move(parametersStr)}; + request.toolNameSchemaMap[functionName] = std::move(schemaReprs); } } } else { @@ -656,6 +1224,145 @@ absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optiona return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + // input: string; required + auto it = doc.FindMember("input"); + if (it == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + auto messagesStatus = parseResponsesInput(allowedLocalMediaPath, allowedMediaDomains); + if (!messagesStatus.ok()) { + return messagesStatus; + } + + // reasoning: object; optional + // OpenAI Responses API reasoning parameter. Any effort value enables thinking mode. + it = doc.FindMember("reasoning"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsObject()) { + return absl::InvalidArgumentError("reasoning is not an object"); + } + const auto& reasoningObj = it->value; + auto effortIt = reasoningObj.FindMember("effort"); + if (effortIt != reasoningObj.MemberEnd() && !effortIt->value.IsNull()) { + if (!effortIt->value.IsString()) { + return absl::InvalidArgumentError("reasoning.effort is not a string"); + } + const std::string effort = effortIt->value.GetString(); + if (effort != "low" && effort != "medium" && effort != "high") { + return absl::InvalidArgumentError("reasoning.effort must be one of: low, medium, high"); + } + // Inject enable_thinking: true into chat_template_kwargs if not already explicitly set + auto kwargsIt = doc.FindMember("chat_template_kwargs"); + if (kwargsIt == doc.MemberEnd()) { + rapidjson::Value kwargs(rapidjson::kObjectType); + kwargs.AddMember("enable_thinking", true, doc.GetAllocator()); + doc.AddMember("chat_template_kwargs", kwargs, doc.GetAllocator()); + } else if (kwargsIt->value.IsObject()) { + auto enableThinkingIt = kwargsIt->value.FindMember("enable_thinking"); + if (enableThinkingIt == kwargsIt->value.MemberEnd()) { + kwargsIt->value.AddMember("enable_thinking", true, doc.GetAllocator()); + } + // If enable_thinking is already set explicitly, the user's value takes precedence + } + } + // summary field is accepted but ignored + } + +#if (PYTHON_DISABLE == 0) + // Build processedJson with "messages" array from chatHistory so that + // the Python chat template path (which reads request_json["messages"]) + // can consume Responses API input without a separate code path. 
+    {
+        Document processedDoc;
+        processedDoc.SetObject();
+        auto& alloc = processedDoc.GetAllocator();
+
+        Value messagesArray(kArrayType);
+        for (size_t i = 0; i < request.chatHistory.size(); ++i) {
+            Value msgObj(kObjectType);
+            auto role = request.chatHistory[i]["role"].as_string();
+            if (role.has_value()) {
+                msgObj.AddMember("role", Value(role.value().c_str(), alloc), alloc);
+            }
+            auto content = request.chatHistory[i]["content"].as_string();
+            if (content.has_value()) {
+                msgObj.AddMember("content", Value(content.value().c_str(), alloc), alloc);
+            }
+            messagesArray.PushBack(msgObj, alloc);
+        }
+        processedDoc.AddMember("messages", messagesArray, alloc);
+
+        // Copy tools from original doc if present
+        auto toolsIt = doc.FindMember("tools");
+        if (toolsIt != doc.MemberEnd() && !toolsIt->value.IsNull()) {
+            Value toolsCopy(toolsIt->value, alloc);
+            processedDoc.AddMember("tools", toolsCopy, alloc);
+        }
+
+        // Copy chat_template_kwargs from original doc if present
+        auto kwargsIt = doc.FindMember("chat_template_kwargs");
+        if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) {
+            Value kwargsCopy(kwargsIt->value, alloc);
+            processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc);
+        }
+
+        StringBuffer buffer;
+        Writer<StringBuffer> writer(buffer);
+        processedDoc.Accept(writer);
+        request.processedJson = buffer.GetString();
+    }
+#endif
+    // logprobs: bool; optional - defaults to false
+    it = doc.FindMember("logprobs");
+    if (it != doc.MemberEnd() && !it->value.IsNull()) {
+        if (!it->value.IsBool())
+            return absl::InvalidArgumentError("logprobs accepts values true or false");
+        request.logprobschat = it->value.GetBool();
+    }
+    if (request.logprobschat && request.stream) {
+        return absl::InvalidArgumentError("logprobs are not supported in streaming mode.");
+    }
+
+    auto toolsStatus = parseTools();
+    if (!toolsStatus.ok()) {
+        return toolsStatus;
+    }
+
+    // max_output_tokens: uint; optional
+    // OpenAI Responses API uses this field for output token limit.
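+    // Note: parseCommonPart deliberately skips "max_tokens" when the endpoint is RESPONSES, so this is the only token limit read here.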
+ it = doc.FindMember("max_output_tokens"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_output_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_output_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && it->value.GetUint() > maxTokensLimit.value()) + return absl::InvalidArgumentError(absl::StrCat("max_output_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + request.maxTokens = it->value.GetUint(); + } + + // specific part of max_output_tokens validation + if (request.maxTokens == 0) { + return absl::InvalidArgumentError("max_output_tokens value should be greater than 0"); + } + + // parse response_format + it = doc.FindMember("response_format"); + if (it != doc.MemberEnd()) { + if (it->value.IsNull()) + return absl::OkStatus(); + if (!it->value.IsObject()) + return absl::InvalidArgumentError("response_format is not an object"); + const rapidjson::Value& responseFormat = it->value; + request.responseFormat = convertOpenAIResponseFormatToStructuralTagStringFormat(responseFormat); + } + + return absl::OkStatus(); +} + absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength) { OVMS_PROFILE_FUNCTION(); // stream: bool; optional @@ -712,16 +1419,23 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsUint()) { - if (it->value.IsUint64()) - return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); - return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + // Not applicable for RESPONSES endpoint which uses max_output_tokens instead + if (endpoint != Endpoint::RESPONSES) { + it = doc.FindMember("max_tokens"); + if (it != doc.MemberEnd()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) + return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + request.maxTokens = it->value.GetUint(); + } else { + if (maxTokensLimit.has_value()) { + request.maxTokens = maxTokensLimit.value(); + } } - if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) - return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); - request.maxTokens = it->value.GetUint(); } else { if (maxTokensLimit.has_value()) { request.maxTokens = maxTokensLimit.value(); @@ -848,6 +1562,7 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) @@ -863,12 +1578,15 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) return absl::InvalidArgumentError("n is not an unsigned integer"); if (it->value.GetUint() == 0) return absl::InvalidArgumentError("n value should be greater than 0"); + if (endpoint == Endpoint::RESPONSES && request.stream && it->value.GetUint() > 1) + return absl::InvalidArgumentError("n greater than 1 is not supported for Responses API streaming"); size_t bestOf = request.bestOf.has_value() ? 
         size_t bestOf = request.bestOf.has_value() ? request.bestOf.value() : 1;  // 1 is default best_of value
         if (bestOf < it->value.GetUint()) {
             return absl::InvalidArgumentError("n value cannot be greater than best_of");
@@ -897,14 +1615,14 @@ std::optional OpenAIChatCompletionsHandler::getNumReturnSequences() const {
 StreamOptions OpenAIChatCompletionsHandler::getStreamOptions() const {
     return request.streamOptions;
 }
 
 bool OpenAIChatCompletionsHandler::isStream() const { return request.stream; }
+Endpoint OpenAIChatCompletionsHandler::getEndpoint() const { return endpoint; }
 std::string OpenAIChatCompletionsHandler::getModel() const { return request.model; }
 std::string OpenAIChatCompletionsHandler::getToolChoice() const { return request.toolChoice; }
 const std::unique_ptr<OutputParser>& OpenAIChatCompletionsHandler::getOutputParser() const { return outputParser; }
@@ -937,6 +1656,8 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional
         return status;
     if (endpoint == Endpoint::COMPLETIONS)
         status = parseCompletionsPart();
+    else if (endpoint == Endpoint::RESPONSES)
+        status = parseResponsesPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains);
     else
         status = parseChatCompletionsPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains);
 
@@ -977,7 +1698,7 @@ static bool hasToolCallsInStreamingDelta(const rapidjson::Document& delta) {
 ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector<int64_t>& generatedIds) {
     OVMS_PROFILE_FUNCTION();
     ParsedOutput parsedOutput;
-    if (endpoint != Endpoint::CHAT_COMPLETIONS || outputParser == nullptr) {
+    if ((endpoint != Endpoint::CHAT_COMPLETIONS && endpoint != Endpoint::RESPONSES) || outputParser == nullptr) {
         parsedOutput.content = this->tokenizer.decode(generatedIds);
     } else {
         parsedOutput = outputParser->parse(generatedIds, this->areToolsAvailable());
@@ -987,6 +1708,20 @@ ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector
 
 std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs) {
     OVMS_PROFILE_FUNCTION();
+    if (endpoint == Endpoint::RESPONSES) {
+        std::vector<ParsedOutput> parsedOutputs;
+        usage.completionTokens = 0;
+        ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
+        for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) {
+            updateUsage(usage, generationOutput.generated_ids, request.echo);
+            parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids));
+            if (generationOutput.finish_reason == ov::genai::GenerationFinishReason::LENGTH) {
+                responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+            }
+        }
+        return serializeResponsesUnaryResponse(parsedOutputs, responsesFinishReason);
+    }
+
     OpenAiJsonResponse jsonResponse;
     jsonResponse.StartObject();
 
@@ -1015,7 +1750,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
         // index: integer; Choice index, only n=1 supported anyway
         jsonResponse.Index(index++);
 
-        // logprobs: object/null; Log probability information for the choice. TODO
+        // TODO: logprobs: object/null; Log probability information for the choice.
         if (this->request.logprobschat || this->request.logprobs) {
             jsonResponse.StartObject("logprobs");
             if (endpoint == Endpoint::CHAT_COMPLETIONS) {
@@ -1096,11 +1831,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
 
     jsonResponse.UsageObject(usage);
 
-    // TODO
-    // id: string; A unique identifier for the chat completion.
+ // TODO: id: string; A unique identifier for the chat completion. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. // finish response object @@ -1112,6 +1845,14 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + for (const auto& tokens : results.tokens) { + parsedOutputs.push_back(parseOutputIfNeeded(tokens)); + } + return serializeResponsesUnaryResponse(parsedOutputs); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1156,11 +1897,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco jsonResponse.UsageObject(usage); - // TODO - // id: string; A unique identifier for the chat completion. + // TODO: id: string; A unique identifier for the chat completion. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. // finish response object @@ -1172,6 +1911,34 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + // Usage is already correctly set from perf_metrics above — no need for updateUsage. + std::vector parsedOutputs; + for (const std::string& text : results.texts) { + if (outputParser != nullptr) { + // Same workaround as in chat completions, line part + auto result = tokenizer.encode(text); + auto& input_ids = result.input_ids; + if (input_ids.get_shape().size() != 2) + throw std::runtime_error("input_ids should have 2 dimensions"); + if (input_ids.get_shape()[0] != 1) + throw std::runtime_error("input_ids should have 1 batch size"); + if (input_ids.get_element_type() != ov::element::i64) + throw std::runtime_error("input_ids should have i64 element type"); + + int64_t* inputIdsData = reinterpret_cast(input_ids.data()); + std::vector generatedTokens(inputIdsData, inputIdsData + input_ids.get_shape()[1]); + parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens)); + } else { + // Fast path: no output parser, use decoded text directly. + ParsedOutput output; + output.content = text; + parsedOutputs.push_back(std::move(output)); + } + } + return serializeResponsesUnaryResponse(parsedOutputs); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1205,7 +1972,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD jsonResponse.FinishReason(finishReason.value_or("unknown")); // index: integer; Choice index, only n=1 supported anyway jsonResponse.Index(index++); - // logprobs: object/null; Log probability information for the choice. 
TODO + // TODO: logprobs: object/null; Log probability information for the choice. if (endpoint == Endpoint::CHAT_COMPLETIONS) { jsonResponse.MessageObject(parsedOutput); @@ -1234,11 +2001,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD jsonResponse.UsageObject(usage); - // TODO - // id: string; A unique identifier for the chat completion. + // TODO: id: string; A unique identifier for the chat completion. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. // finish response object @@ -1246,8 +2011,512 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD return jsonResponse.ToString(); } +void OpenAIChatCompletionsHandler::writeEventHeader(Writer& writer, const char* eventType) { + writer.StartObject(); + writer.String("type"); + writer.String(eventType); + writer.String("sequence_number"); + writer.Uint64(responsesState.sequenceNumber++); +} + +void OpenAIChatCompletionsHandler::writeContentLocation(Writer& writer, const std::string& itemId, uint64_t outputIndex) { + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(itemId.c_str()); +} + +void OpenAIChatCompletionsHandler::writeReasoningLocation(Writer& writer, const std::string& itemId) { + writer.String("output_index"); + writer.Uint64(0); + writer.String("summary_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(itemId.c_str()); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseCreatedEvent(const std::string& responseId, int64_t createdAt) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.created"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseInProgressEvent(const std::string& responseId, int64_t createdAt) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.in_progress"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputItemAddedEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, "", "in_progress"); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeContentPartAddedEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.content_part.added"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("part"); + serializeOutputTextPart(writer, ""); + writer.EndObject(); + return buffer.GetString(); +} + +std::string 
OpenAIChatCompletionsHandler::serializeOutputTextDeltaEvent(const std::string& outputItemId, const std::string& delta, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_text.delta"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("delta"); + writer.String(delta.c_str()); + // TODO: logprobs not supported + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputTextDoneEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_text.done"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("text"); + writer.String(responsesState.outputText.c_str()); + // TODO: logprobs not supported + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeContentPartDoneEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.content_part.done"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("part"); + serializeOutputTextPart(writer, responsesState.outputText); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputItemDoneEvent(const std::string& outputItemId, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) { + const std::string itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, responsesState.outputText, itemStatus); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseCompletedEvent(const std::string& responseId, int64_t createdAt, ov::genai::GenerationFinishReason finishReason) { + const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); + const std::string responseStatus = isIncomplete ? "incomplete" : "completed"; + const char* eventType = isIncomplete ? "response.incomplete" : "response.completed"; + std::optional incompleteReason = isIncomplete ? 
std::optional("max_tokens") : std::nullopt; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, eventType); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesState.outputText, true, incompleteReason); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, ResponsesErrorCode errorCode) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.failed"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesState.outputText, false, + std::nullopt, errorMessage, errorCode); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemAddedEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(reasoningItemId.c_str()); + writer.String("type"); + writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryPartAddedEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_part.added"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("part"); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(""); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryTextDeltaEvent(const std::string& reasoningItemId, const std::string& delta) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_text.delta"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("delta"); + writer.String(delta.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryTextDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_text.done"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("text"); + writer.String(responsesState.reasoningText.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryPartDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_part.done"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("part"); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(responsesState.reasoningText.c_str()); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer 
writer(buffer); + writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(reasoningItemId.c_str()); + writer.String("type"); + writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(responsesState.reasoningText.c_str()); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemAddedEvent(const ToolCall& toolCall, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String("in_progress"); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(""); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallArgumentsDeltaEvent(const std::string& callId, const std::string& delta, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.function_call_arguments.delta"); + writer.String("item_id"); + writer.String(callId.c_str()); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("call_id"); + writer.String(callId.c_str()); + writer.String("delta"); + writer.String(delta.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallArgumentsDoneEvent(const ToolCall& toolCall, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.function_call_arguments.done"); + writer.String("item_id"); + writer.String(toolCall.id.c_str()); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemDoneEvent(const ToolCall& toolCall, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) { + const std::string itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? 
"incomplete" : "completed"; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String(itemStatus.c_str()); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents() { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + const std::string outputItemId = "msg-0"; + + std::vector events; + + events.emplace_back(serializeResponseCreatedEvent(responseId, createdAt)); + events.emplace_back(serializeResponseInProgressEvent(responseId, createdAt)); + + // When outputParser is present, defer output item events until first chunk + // because reasoning items need to come before message items + if (outputParser == nullptr) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId)); + responsesState.messageInitialized = true; + } + + responsesState.initialized = true; + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); +} + std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + const std::string outputItemId = "msg-0"; + const std::string reasoningItemId = "rs-0"; + + std::vector events; + if (!responsesState.initialized) { + // Fallback: if init events were not sent earlier, emit them now + std::string initEvents = serializeResponsesStreamingInitEvents(); + if (!initEvents.empty()) { + events.emplace_back(std::move(initEvents)); + } + } + + if (outputParser != nullptr) { + // Use output parser to separate reasoning from content + std::optional delta = outputParser->parseChunk(chunkResponse, areToolsAvailable(), finishReason); + + if (delta.has_value() && delta->HasMember("delta") && (*delta)["delta"].IsObject()) { + const auto& deltaObj = (*delta)["delta"]; + if (deltaObj.HasMember("reasoning_content") && deltaObj["reasoning_content"].IsString()) { + // Reasoning chunk + if (!responsesState.reasoningInitialized) { + events.emplace_back(serializeReasoningOutputItemAddedEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartAddedEvent(reasoningItemId)); + responsesState.reasoningInitialized = true; + } + const std::string reasoningText = deltaObj["reasoning_content"].GetString(); + responsesState.reasoningText += reasoningText; + events.emplace_back(serializeReasoningSummaryTextDeltaEvent(reasoningItemId, reasoningText)); + } else if (deltaObj.HasMember("content") && deltaObj["content"].IsString()) { + // Content chunk - close reasoning if it was active, init 
message if needed + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesState.reasoningCompleted = true; + } + const uint64_t msgIdx = responsesState.reasoningInitialized ? 1 : 0; + if (!responsesState.messageInitialized) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); + responsesState.messageInitialized = true; + } + const std::string contentText = deltaObj["content"].GetString(); + responsesState.outputText += contentText; + events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, contentText, msgIdx)); + } else if (deltaObj.HasMember("tool_calls") && deltaObj["tool_calls"].IsArray()) { + // Tool call chunk - close reasoning if active + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesState.reasoningCompleted = true; + } + const auto& toolCallsArr = deltaObj["tool_calls"]; + for (rapidjson::SizeType i = 0; i < toolCallsArr.Size(); ++i) { + const auto& tc = toolCallsArr[i]; + int tcIndex = tc.HasMember("index") ? tc["index"].GetInt() : 0; + // Determine the output index for this tool call + const uint64_t baseIdx = responsesState.reasoningInitialized ? 1 : 0; + const uint64_t tcOutputIdx = baseIdx + static_cast(tcIndex); + // Determine if this is a new tool call (has function name) + bool isNewToolCall = false; + std::string funcName; + std::string tcId; + std::string argDelta; + if (tc.HasMember("function") && tc["function"].IsObject()) { + const auto& funcObj = tc["function"]; + if (funcObj.HasMember("name") && funcObj["name"].IsString()) { + funcName = funcObj["name"].GetString(); + isNewToolCall = true; + } + if (funcObj.HasMember("arguments") && funcObj["arguments"].IsString()) { + argDelta = funcObj["arguments"].GetString(); + } + } + if (tc.HasMember("id") && tc["id"].IsString()) { + tcId = tc["id"].GetString(); + } + if (isNewToolCall) { + // Ensure we have enough entries in our tracking vector + while (static_cast(responsesState.toolCalls.size()) <= tcIndex) { + responsesState.toolCalls.push_back(ToolCall{}); + } + responsesState.toolCalls[tcIndex].id = tcId; + responsesState.toolCalls[tcIndex].name = funcName; + responsesState.toolCalls[tcIndex].arguments = ""; + events.emplace_back(serializeFunctionCallOutputItemAddedEvent(responsesState.toolCalls[tcIndex], tcOutputIdx)); + } + if (!argDelta.empty() && static_cast(responsesState.toolCalls.size()) > tcIndex) { + responsesState.toolCalls[tcIndex].arguments += argDelta; + events.emplace_back(serializeFunctionCallArgumentsDeltaEvent(responsesState.toolCalls[tcIndex].id, argDelta, tcOutputIdx)); + } + } + } + } + // If delta is nullopt, the parser is accumulating tag tokens - skip + } else { + // No parser - pass through raw text + if (!chunkResponse.empty()) { + responsesState.outputText += chunkResponse; + events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, chunkResponse)); + } + } + + if (finishReason != ov::genai::GenerationFinishReason::NONE) { + // 
Close any open reasoning that wasn't closed by content transition + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesState.reasoningCompleted = true; + } + // Emit done events for any streaming tool calls + if (!responsesState.toolCalls.empty()) { + const uint64_t baseIdx = responsesState.reasoningInitialized ? 1 : 0; + for (size_t i = 0; i < responsesState.toolCalls.size(); ++i) { + const uint64_t tcOutputIdx = baseIdx + static_cast(i); + events.emplace_back(serializeFunctionCallArgumentsDoneEvent(responsesState.toolCalls[i], tcOutputIdx)); + events.emplace_back(serializeFunctionCallOutputItemDoneEvent(responsesState.toolCalls[i], finishReason, tcOutputIdx)); + } + } + // Only emit message item if content was produced or no tool calls were generated + if (!responsesState.outputText.empty() || responsesState.toolCalls.empty()) { + const uint64_t msgIdx = (responsesState.reasoningInitialized ? 1 : 0) + responsesState.toolCalls.size(); + if (!responsesState.messageInitialized) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); + responsesState.messageInitialized = true; + } + events.emplace_back(serializeOutputTextDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeOutputItemDoneEvent(outputItemId, finishReason, msgIdx)); + } + events.emplace_back(serializeResponseCompletedEvent(responseId, createdAt, finishReason)); + } + + if (events.empty()) { + return ""; + } + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); + } + Document doc; doc.SetObject(); Document::AllocatorType& allocator = doc.GetAllocator(); @@ -1268,7 +2537,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // null - natural scenario when the generation has not completed yet // index: integer; Choice index, only n=1 supported anyway choice.AddMember("index", 0, allocator); - // logprobs: object/null; Log probability information for the choice. TODO + // TODO: logprobs: object/null; Log probability information for the choice. choice.AddMember("logprobs", Value(), allocator); if (endpoint == Endpoint::CHAT_COMPLETIONS) { if (outputParser != nullptr) { @@ -1319,11 +2588,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str doc.AddMember("usage", Value(), allocator); } - // TODO - // id: string; A unique identifier for the chat completion. Each chunk has the same ID. + // TODO: id: string; A unique identifier for the chat completion. Each chunk has the same ID. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. 
StringBuffer buffer; @@ -1332,8 +2599,33 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str return buffer.GetString(); } +std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const std::string& errorMessage, ResponsesErrorCode errorCode) { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + + std::vector events; + if (!responsesState.initialized) { + std::string initEvents = serializeResponsesStreamingInitEvents(); + if (!initEvents.empty()) { + events.emplace_back(std::move(initEvents)); + } + } + + events.emplace_back(serializeResponseFailedEventBody(responseId, createdAt, errorMessage, errorCode)); + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); +} + std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + return ""; + } StringBuffer buffer; Writer writer(buffer); @@ -1345,7 +2637,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { // created: integer; Unix timestamp (in seconds) when the MP graph was created. writer.String("created"); - writer.Int(std::chrono::duration_cast(created.time_since_epoch()).count()); + writer.Int64(std::chrono::duration_cast(created.time_since_epoch()).count()); // model: string; copied from the request writer.String("model"); @@ -1363,11 +2655,11 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { writer.String("usage"); writer.StartObject(); // { writer.String("prompt_tokens"); - writer.Int(usage.promptTokens); + writer.Uint64(static_cast(usage.promptTokens)); writer.String("completion_tokens"); - writer.Int(usage.completionTokens); + writer.Uint64(static_cast(usage.completionTokens)); writer.String("total_tokens"); - writer.Int(usage.calculateTotalTokens()); + writer.Uint64(static_cast(usage.calculateTotalTokens())); writer.EndObject(); // } writer.EndObject(); // } diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 516133f03a..7fc331aea5 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -47,9 +47,26 @@ namespace ovms { enum class Endpoint { CHAT_COMPLETIONS, COMPLETIONS, + RESPONSES, TOKENIZE, }; +enum class ResponsesErrorCode { + SERVER_ERROR, + INVALID_PROMPT, +}; + +inline const char* responsesErrorCodeToString(ResponsesErrorCode code) { + switch (code) { + case ResponsesErrorCode::SERVER_ERROR: + return "server_error"; + case ResponsesErrorCode::INVALID_PROMPT: + return "invalid_prompt"; + default: + return "server_error"; + } +} + struct CompletionUsageStatistics { size_t promptTokens = 0; size_t completionTokens = 0; @@ -61,6 +78,19 @@ struct CompletionUsageStatistics { // Class that wraps OpenAI request, holds and processes raw JSON, provides methods for serialization and keeps track of usage. // It is used in the calculator. + +// Encapsulates all mutable state accumulated during Responses API streaming. 
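+// One instance lives in each OpenAIChatCompletionsHandler (one handler per request), so no cross-request synchronization is needed.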
+struct ResponsesStreamingState {
+    size_t sequenceNumber = 1;
+    bool initialized = false;
+    bool reasoningInitialized = false;
+    bool reasoningCompleted = false;
+    bool messageInitialized = false;
+    std::string outputText;
+    std::string reasoningText;
+    ToolCalls_t toolCalls;
+};
+
 class OpenAIChatCompletionsHandler {
     Document& doc;
     Endpoint endpoint;
@@ -69,17 +99,63 @@ class OpenAIChatCompletionsHandler {
     std::chrono::time_point<std::chrono::system_clock> created;
     ov::genai::Tokenizer tokenizer;
     size_t processedTokens = 0;  // tracks overall number of tokens processed by the pipeline
+    ResponsesStreamingState responsesState;
 
     // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning.
     std::unique_ptr<OutputParser> outputParser = nullptr;
 
     absl::Status parseCompletionsPart();
     absl::Status parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
+    absl::Status parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
+    absl::Status parseResponsesInput(std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
     absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);
     ParsedOutput parseOutputIfNeeded(const std::vector<int64_t>& generatedIds);
     absl::Status ensureArgumentsInToolCalls(Value& messageObj, bool& jsonChanged);
 
+    // Responses API serialization helpers
+    void serializeResponsesToolChoice(Writer<StringBuffer>& writer) const;
+    void serializeResponsesTools(Writer<StringBuffer>& writer) const;
+    void serializeResponsesResponseObject(Writer<StringBuffer>& writer, const std::string& responseId, int64_t createdAt,
+        const std::string& status, const std::string& fullOutputText, bool includeUsage,
+        const std::optional<std::string>& incompleteReason = std::nullopt, const std::optional<std::string>& errorMessage = std::nullopt, ResponsesErrorCode errorCode = ResponsesErrorCode::SERVER_ERROR) const;
+    static void serializeResponsesOutputItem(Writer<StringBuffer>& writer, const std::string& outputItemId,
+        const std::string& text, const std::string& status);
+    static void serializeOutputTextPart(Writer<StringBuffer>& writer, const std::string& text);
+    std::string serializeResponsesUnaryResponse(const std::vector<ParsedOutput>& parsedOutputs,
+        ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) const;
+
+    // Responses API streaming event building blocks
+    void writeEventHeader(Writer<StringBuffer>& writer, const char* eventType);
+    static void writeContentLocation(Writer<StringBuffer>& writer, const std::string& itemId, uint64_t outputIndex = 0);
+    static void writeReasoningLocation(Writer<StringBuffer>& writer, const std::string& itemId);
+
+    // Individual Responses API streaming event serializers
+    std::string serializeResponseCreatedEvent(const std::string& responseId, int64_t createdAt);
+    std::string serializeResponseInProgressEvent(const std::string& responseId, int64_t createdAt);
+    std::string serializeOutputItemAddedEvent(const std::string& outputItemId, uint64_t outputIndex = 0);
+    std::string serializeContentPartAddedEvent(const std::string& outputItemId, uint64_t outputIndex = 0);
+    std::string serializeOutputTextDeltaEvent(const std::string& outputItemId, const std::string& delta, uint64_t outputIndex = 0);
+    std::string serializeOutputTextDoneEvent(const std::string& outputItemId, uint64_t outputIndex = 0);
+    std::string serializeContentPartDoneEvent(const std::string& outputItemId, uint64_t outputIndex = 0);
+    std::string serializeOutputItemDoneEvent(const std::string& outputItemId, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex = 0);
+    std::string serializeResponseCompletedEvent(const std::string& responseId, int64_t createdAt, ov::genai::GenerationFinishReason finishReason);
+
+    // Reasoning streaming event serializers
+    std::string serializeReasoningOutputItemAddedEvent(const std::string& reasoningItemId);
+    std::string serializeReasoningSummaryPartAddedEvent(const std::string& reasoningItemId);
+    std::string serializeReasoningSummaryTextDeltaEvent(const std::string& reasoningItemId, const std::string& delta);
+    std::string serializeReasoningSummaryTextDoneEvent(const std::string& reasoningItemId);
+    std::string serializeReasoningSummaryPartDoneEvent(const std::string& reasoningItemId);
+    std::string serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId);
+    std::string serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, ResponsesErrorCode errorCode);
+
+    // Function call streaming event serializers
+    std::string serializeFunctionCallOutputItemAddedEvent(const ToolCall& toolCall, uint64_t outputIndex);
+    std::string serializeFunctionCallArgumentsDeltaEvent(const std::string& callId, const std::string& delta, uint64_t outputIndex);
+    std::string serializeFunctionCallArgumentsDoneEvent(const ToolCall& toolCall, uint64_t outputIndex);
+    std::string serializeFunctionCallOutputItemDoneEvent(const ToolCall& toolCall, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex);
+
 public:
     OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point<std::chrono::system_clock> creationTime,
         ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") :
@@ -106,6 +182,7 @@ class OpenAIChatCompletionsHandler {
     std::optional<std::string> getResponseFormat() const;
 
     bool isStream() const;
+    Endpoint getEndpoint() const;
     std::string getModel() const;
     std::string getToolChoice() const;
     const std::unique_ptr<OutputParser>& getOutputParser() const;
@@ -128,5 +205,7 @@ class OpenAIChatCompletionsHandler {
     std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason);
     std::string serializeStreamingUsageChunk();
     std::string serializeStreamingHandshakeChunk();
+    std::string serializeResponsesStreamingInitEvents();
+    std::string serializeResponsesFailedEvent(const std::string& errorMessage, ResponsesErrorCode errorCode = ResponsesErrorCode::SERVER_ERROR);
 };
 }  // namespace ovms
diff --git a/src/llm/http_llm_calculator.cc b/src/llm/http_llm_calculator.cc
index ae6461c61a..ff914864f9 100644
--- a/src/llm/http_llm_calculator.cc
+++ b/src/llm/http_llm_calculator.cc
@@ -125,6 +125,22 @@ class HttpLLMCalculator : public CalculatorBase {
             if (status != absl::OkStatus())
                 return status;
             SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "LLMCalculator [Node: {}] Pipeline execution scheduled successfully", cc->NodeName());
+
+            // For RESPONSES streaming, emit init events (response.created, response.in_progress, etc.)
+            // immediately after scheduling, before blocking on readPartialExecutionResults.
+            // This reduces perceived latency: the client sees that the response was created right away.
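+            // Illustrative SSE preamble emitted here (payloads abbreviated; exact fields come from serializeResponsesStreamingInitEvents):
+            //   data: {"type":"response.created","sequence_number":1,"response":{"status":"in_progress",...}}
+            //
+            //   data: {"type":"response.in_progress","sequence_number":2,"response":{...}}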
+            if (executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) {
+                std::string initEvents = executionContext->apiHandler->serializeResponsesStreamingInitEvents();
+                if (!initEvents.empty()) {
+                    executionContext->response = wrapTextInServerSideEventMessage(initEvents);
+                    cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp);
+                    executionContext->response = "";
+                }
+                cc->Outputs().Tag(LOOPBACK_TAG_NAME).Add(new bool{true}, iterationBeginTimestamp);
+                auto now = std::chrono::system_clock::now();
+                iterationBeginTimestamp = ::mediapipe::Timestamp(std::chrono::duration_cast<std::chrono::microseconds>(now.time_since_epoch()).count());
+                return absl::OkStatus();
+            }
         }
 
         if (!executionContext->apiHandler->isStream()) {
             // Unary scenario
@@ -160,8 +176,22 @@ class HttpLLMCalculator : public CalculatorBase {
                 cc->Outputs().Tag(LOOPBACK_TAG_NAME).Add(new bool{true}, iterationBeginTimestamp);
             }
         } catch (ov::AssertFailure& e) {
+            if (executionContext->apiHandler && executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) {
+                std::string failedEvent = executionContext->apiHandler->serializeResponsesFailedEvent(e.what());
+                executionContext->response = wrapTextInServerSideEventMessage(failedEvent);
+                executionContext->response += wrapTextInServerSideEventMessage("[DONE]");
+                cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp);
+                return absl::OkStatus();
+            }
             return absl::InvalidArgumentError(e.what());
         } catch (...) {
+            if (executionContext->apiHandler && executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) {
+                std::string failedEvent = executionContext->apiHandler->serializeResponsesFailedEvent("Response generation failed");
+                executionContext->response = wrapTextInServerSideEventMessage(failedEvent);
+                executionContext->response += wrapTextInServerSideEventMessage("[DONE]");
+                cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp);
+                return absl::OkStatus();
+            }
             return absl::InvalidArgumentError("Response generation failed");
         }
         auto now = std::chrono::system_clock::now();
diff --git a/src/llm/py_jinja_template_processor.hpp b/src/llm/py_jinja_template_processor.hpp
index 219dd5250c..95b9e8598b 100644
--- a/src/llm/py_jinja_template_processor.hpp
+++ b/src/llm/py_jinja_template_processor.hpp
@@ -18,7 +18,6 @@
 
 #include
 #include
-#include
 #pragma warning(push)
 #pragma warning(disable : 6326 28182 6011 28020)
 // Python execution for template processing
diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp
index 6d9810ae5f..725967ceac 100644
--- a/src/llm/servable.cpp
+++ b/src/llm/servable.cpp
@@ -68,10 +68,12 @@ absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
     } else if (payload.uri == "/v3/completions" || payload.uri == "/v3/v1/completions") {
         executionContext->endpoint = Endpoint::COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions");
+        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions, /v3/responses, /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -204,6 +206,45 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
+    case Endpoint::RESPONSES: {
+        if (executionContext->apiHandler->getChatHistory().size() > 0) {
+#if (PYTHON_DISABLE == 0)
+            bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
+            if (!success) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
+            }
+#else
+            ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
+            constexpr bool add_generation_prompt = true;
+            auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer();
+            if (!toolsStatus.ok()) {
+                return toolsStatus.status();
+            }
+            const auto& tools = toolsStatus.value();
+            auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer();
+            if (!chatTemplateKwargsStatus.ok()) {
+                return chatTemplateKwargsStatus.status();
+            }
+            const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
+            try {
+                inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools, chatTemplateKwargs);
+            } catch (const std::exception& e) {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+                return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have a chat template or has an invalid one.");
+            }
+#endif
+            if (inputText.size() == 0) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
+            }
+        } else {
+            auto prompt = executionContext->apiHandler->getPrompt();
+            if (!prompt.has_value()) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing");
+            }
+            inputText = prompt.value();
+        }
+        break;
+    }
     case Endpoint::COMPLETIONS: {
         inputText = executionContext->apiHandler->getPrompt().value();
         break;
     }
@@ -264,7 +305,7 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
-    if (lastTextChunk.size() > 0) {
+    // For RESPONSES endpoint, always call serializeStreamingChunk so that
+    // initialization events (response.created, response.in_progress, etc.)
+    // are emitted immediately, even before the tokenizer produces text.
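+    // serializeStreamingChunk may legitimately return an empty string when there is nothing to emit yet;
+    // the empty-chunk check below filters those cases out.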
+    if (lastTextChunk.size() > 0 || executionContext->apiHandler->getEndpoint() == Endpoint::RESPONSES) {
         std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
         if (!serializedChunk.empty()) {
             executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
@@ -286,8 +330,12 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
             executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
         }
 
-        if (executionContext->apiHandler->getStreamOptions().includeUsage)
-            executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
+        if (executionContext->apiHandler->getStreamOptions().includeUsage) {
+            std::string usageChunk = executionContext->apiHandler->serializeStreamingUsageChunk();
+            if (!usageChunk.empty()) {
+                executionContext->response += wrapTextInServerSideEventMessage(usageChunk);
+            }
+        }
 
         executionContext->response += wrapTextInServerSideEventMessage("[DONE]");
diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp
index be33838d9f..94aef05387 100644
--- a/src/llm/visual_language_model/continuous_batching/servable.cpp
+++ b/src/llm/visual_language_model/continuous_batching/servable.cpp
@@ -45,10 +45,12 @@ absl::Status VisualLanguageModelServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses or /v3/tokenize endpoints");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -67,7 +69,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     if (executionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
 
         for (size_t i = 0; i < chatHistory.size(); i++) {
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 2834072410..307723415a 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -53,10 +53,12 @@ absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses or /v3/tokenize endpoints");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -237,7 +239,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     if (executionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
 
         for (size_t i = 0; i < chatHistory.size(); i++) {
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 94648d0e68..618b6252f4 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //*****************************************************************************
 #include
+#include
 #include
 #include
 #include
@@ -269,55 +270,29 @@ TEST_F(HttpOpenAIHandlerTest, Stream) {
     ASSERT_EQ(response, "");
 }
 
-TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) {
-    std::string requestBody = "not a json";
-
-    EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
-    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
-    EXPECT_CALL(*writer, IsDisconnected()).Times(0);
-
-    auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
-    ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - Cannot parse JSON body");
-}
-
-TEST_F(HttpOpenAIHandlerTest, JsonBodyValidButNotAnObject) {
-    std::string requestBody = "[1, 2, 3]";
-
-    EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
-    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
-    EXPECT_CALL(*writer, IsDisconnected()).Times(0);
-
-    auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
-    ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - JSON body must be an object");
-}
-
-TEST_F(HttpOpenAIHandlerTest, ModelFieldMissing) {
+TEST_F(HttpOpenAIHandlerTest, ResponsesStream) {
     std::string requestBody = R"(
-        {
-            "stream": true,
-            "messages": []
-        }
+        {
+            "model": "gpt",
+            "stream": true,
+            "input": "What is OpenVINO?"
+        }
     )";
 
-    EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
-    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
-    EXPECT_CALL(*writer, IsDisconnected()).Times(0);
+    EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).WillOnce(testing::Invoke([](std::function<void()> fn) { fn(); }));
+    EXPECT_CALL(*writer, PartialReplyEnd()).Times(1);
+    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(9);
+    EXPECT_CALL(*writer, IsDisconnected()).Times(9);
 
-    auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
-    ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - model field is missing in JSON body");
+    ASSERT_EQ(
+        handler->dispatchToProcessor("/v3/responses", requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::PARTIAL_END);
+
+    ASSERT_EQ(response, "");
 }
 
-TEST_F(HttpOpenAIHandlerTest, ModelFieldNotAString) {
-    std::string requestBody = R"(
-        {
-            "model": 2,
-            "stream": true,
-            "messages": []
-        }
-    )";
+TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) {
+    std::string requestBody = "not a json";
 
     EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
     EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
@@ -325,26 +300,19 @@ TEST_F(HttpOpenAIHandlerTest, ModelFieldNotAString) {
 
     auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
     ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - model field is not a string");
+    ASSERT_EQ(status.string(), "The file is not valid json - Cannot parse JSON body");
 }
 
-TEST_F(HttpOpenAIHandlerTest, StreamFieldNotABoolean) {
-    std::string requestBody = R"(
-        {
-            "model": "gpt",
-            "stream": 2,
-            "messages": []
-        }
-    )";
+TEST_F(HttpOpenAIHandlerTest, JsonBodyValidButNotAnObject) {
+    std::string requestBody = "[1, 2, 3]";
 
-    EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).Times(0);
     EXPECT_CALL(*writer, PartialReplyEnd()).Times(0);
     EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0);
     EXPECT_CALL(*writer, IsDisconnected()).Times(0);
 
     auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser);
     ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID);
-    ASSERT_EQ(status.string(), "The file is not valid json - stream field is not a boolean");
+    ASSERT_EQ(status.string(), "The file is not valid json - JSON body must be an object");
 }
 
 TEST_F(HttpOpenAIHandlerTest, GraphWithANameDoesNotExist) {
@@ -402,6 +370,304 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test {
     }
 };
 
+class HttpOpenAIHandlerCommonParsingValidationTest : public HttpOpenAIHandlerParsingTest,
+                                                     public ::testing::WithParamInterface<ovms::Endpoint> {
+protected:
+    ovms::Endpoint endpoint() const {
+        return GetParam();
+    }
+
+    std::string createRequestWithRawStreamValue(const std::string& streamRawValue) const {
+        if (endpoint() == ovms::Endpoint::COMPLETIONS) {
+            return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"prompt\":\"valid prompt\"}";
+        }
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"input\":\"valid prompt\"}";
+        }
+        return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
+    }
+
+    std::string createRequestWithoutModel() const {
+        if (endpoint() == ovms::Endpoint::COMPLETIONS) {
+            return "{\"prompt\":\"valid prompt\"}";
+        }
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return "{\"input\":\"valid prompt\"}";
+        }
+        return "{\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
+    }
+
+    std::string createRequestWithNonStringModel() const {
+        if (endpoint() == ovms::Endpoint::COMPLETIONS) {
+            return "{\"model\":2,\"prompt\":\"valid prompt\"}";
+        }
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return "{\"model\":2,\"input\":\"valid prompt\"}";
+        }
+        return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
+    }
+};
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, StreamFieldNotABooleanFails) {
+    std::string json = createRequestWithRawStreamValue("2");
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Stream is not bool"));
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldMissingFails) {
+    std::string json = createRequestWithoutModel();
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model missing in request"));
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldNotStringFails) {
+    std::string json = createRequestWithNonStringModel();
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    CommonParsingValidation,
+    HttpOpenAIHandlerCommonParsingValidationTest,
+    ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::COMPLETIONS, ovms::Endpoint::RESPONSES),
+    [](const testing::TestParamInfo<ovms::Endpoint>& info) {
+        switch (info.param) {
+        case ovms::Endpoint::CHAT_COMPLETIONS:
+            return "ChatCompletions";
+        case ovms::Endpoint::COMPLETIONS:
+            return "Completions";
+        case ovms::Endpoint::RESPONSES:
+            return "Responses";
+        default:
+            return "Unknown";
+        }
+    });
+
+class HttpOpenAIHandlerChatAndResponsesParsingTest : public HttpOpenAIHandlerParsingTest,
+                                                     public ::testing::WithParamInterface<ovms::Endpoint> {
+protected:
+    ovms::Endpoint endpoint() const {
+        return GetParam();
+    }
+
+    std::string createTextRequest(const std::string& text, const std::string& extraJsonFields = "") const {
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return std::string("{\"model\":\"llama\",\"input\":\"") + text + "\"" + extraJsonFields + "}";
+        }
+        return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"") + text + "\"}]" + extraJsonFields + "}";
+    }
+
+    std::string createMultimodalRequestWithImageUrl(const std::string& dataUrl) const {
+        if (endpoint() == ovms::Endpoint::RESPONSES) {
+            return std::string("{\"model\":\"llama\",\"input\":[{\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"what is in this image?\"},{\"type\":\"input_image\",\"image_url\":\"") + dataUrl + "\"}]}] }";
+        }
+        return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"what is in this image?\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"") + dataUrl + "\"}}]}]}";
+    }
+
+    std::string createToolRequest(const std::string& toolChoiceJson) const {
+        std::string base = createTextRequest("What is the weather like in Boston today?", ",\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}}]");
+        if (toolChoiceJson.empty()) {
+            return base;
+        }
+        base.pop_back();  // remove trailing '}'
+        base += ",\"tool_choice\":" + toolChoiceJson + "}";
+        return base;
+    }
+
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> parseCurrentRequest(const std::string& json) {
+        doc.Parse(json.c_str());
+        EXPECT_FALSE(doc.HasParseError()) << json;
+        std::optional<uint32_t> maxTokensLimit;
+        uint32_t bestOfLimit = 0;
+        std::optional<uint32_t> maxModelLength;
+        std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+            std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+        EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json;
+        return apiHandler;
+    }
+};
+
+TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUserChatMessage) {
+    std::string json = createTextRequest("What is OpenVINO?");
+    auto apiHandler = parseCurrentRequest(json);
+
+    auto& chatHistory = apiHandler->getChatHistory();
+    ASSERT_EQ(chatHistory.size(), 1);
+    ASSERT_TRUE(chatHistory[0].contains("role"));
+    ASSERT_TRUE(chatHistory[0].contains("content"));
+    EXPECT_EQ(chatHistory[0]["role"], "user");
+    EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?");
+    if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) {
+        // Chat completions with simple text does not mutate the JSON, so processedJson is empty
+        EXPECT_TRUE(apiHandler->getProcessedJson().empty());
+    }
+}
+
+TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) {
+    std::string json = createTextRequest("What is OpenVINO?");
+    auto apiHandler = parseCurrentRequest(json);
+
+    // For Responses, processedJson is always built from chatHistory.
+    // For chat/completions with simple text, processedJson is empty (the original body is used instead).
+    // In both cases, the chatHistory should be equivalent.
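+    // Illustrative expected shape for Responses: {"messages":[{"role":"user","content":"What is OpenVINO?"}], ...}
+    // (extra top-level fields, if any, are not asserted here).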
+ auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1); + EXPECT_EQ(chatHistory[0]["role"], "user"); + EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + // Responses path builds processedJson with messages array + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()) << "Responses should build processedJson"; + // Verify it contains a messages array with the correct content + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc["messages"].IsArray()); + ASSERT_EQ(processedDoc["messages"].Size(), 1u); + EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "user"); + EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "What is OpenVINO?"); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMultiMessage) { + // Test with array input containing multiple messages + std::string json; + if (endpoint() == ovms::Endpoint::RESPONSES) { + json = R"({"model":"llama","input":[ + {"role":"system","content":"You are helpful."}, + {"role":"user","content":"Hello"} + ]})"; + } else { + json = R"({"model":"llama","messages":[ + {"role":"system","content":"You are helpful."}, + {"role":"user","content":"Hello"} + ]})"; + } + auto apiHandler = parseCurrentRequest(json); + + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 2); + EXPECT_EQ(chatHistory[0]["role"], "system"); + EXPECT_EQ(chatHistory[0]["content"], "You are helpful."); + EXPECT_EQ(chatHistory[1]["role"], "user"); + EXPECT_EQ(chatHistory[1]["content"], "Hello"); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()); + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_EQ(processedDoc["messages"].Size(), 2u); + EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "system"); + EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "You are helpful."); + EXPECT_STREQ(processedDoc["messages"][1]["role"].GetString(), "user"); + EXPECT_STREQ(processedDoc["messages"][1]["content"].GetString(), "Hello"); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsWhenPresent) { + std::string json = createToolRequest("\"auto\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()); + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc.HasMember("tools")); + ASSERT_TRUE(processedDoc["tools"].IsArray()); + ASSERT_GT(processedDoc["tools"].Size(), 0u); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { + std::string tokenField = endpoint() == ovms::Endpoint::RESPONSES ? 
"max_output_tokens" : "max_completion_tokens"; + std::string json = createTextRequest("valid prompt", ",\"" + tokenField + "\":7"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingFunctionToolsWithAutoChoiceSucceeds) { + std::string json = createToolRequest("\"auto\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "auto"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceFunctionObjectSucceeds) { + std::string json = createToolRequest("{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\"}}"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceNoneRemovesTools) { + std::string json = createToolRequest("\"none\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_FALSE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "none"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImageSucceeds) { + const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; + std::string json = createMultimodalRequestWithImageUrl(base64Image); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); +} + +INSTANTIATE_TEST_SUITE_P( + ChatAndResponses, + HttpOpenAIHandlerChatAndResponsesParsingTest, + ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::RESPONSES), + [](const testing::TestParamInfo& info) { + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } + }); + static std::vector createHermes3ToolCallTokens(ov::genai::Tokenizer& tokenizer) { std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})"; auto generatedTensor = tokenizer.encode(toolCall, ov::genai::add_special_tokens(true)).input_ids; @@ -557,98 +823,920 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsOutputText) { std::string json = R"({ "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" 
- }, - { - "type": "image_url", - "image_url": { - "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg==" - } - } - ] - } - ] + "input": "What is OpenVINO?", + "max_output_tokens": 5 })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - std::vector expectedBytes = {110, 181, 160}; - for (size_t i = 0; i < image.get_size(); i++) { - EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); - } - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"output\":"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"text\":"), std::string::npos) << serialized; } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsReasoningOutputItem) { std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" 
- }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; + "model": "llama", + "input": "Think about this", + "max_output_tokens": 100 + })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"raw.githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllowedDomains) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + std::string modelOutput = "Let me reason about thisThe answer is 42"; + ov::Tensor outputIds = tokenizer->encode(modelOutput, ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + // Reasoning output item should be present + ASSERT_NE(serialized.find("\"type\":\"reasoning\""), std::string::npos) << "Reasoning output item missing: " << serialized; + ASSERT_NE(serialized.find("\"type\":\"summary_text\""), std::string::npos) << "Summary text missing: " << serialized; + // Reasoning item should NOT have status field (per OpenAI spec) + auto reasoningPos = serialized.find("\"type\":\"reasoning\""); + auto messagePos = serialized.find("\"type\":\"message\""); + ASSERT_LT(reasoningPos, messagePos) << "Reasoning item should come before message item"; + // Reasoning item ID should start with rs- + ASSERT_NE(serialized.find("\"id\":\"rs-"), std::string::npos) << serialized; + // Message output item should still be present with content + ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReasoningWhenAbsent) { std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" 
- }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + "model": "llama", + "input": "What is OpenVINO?", + "max_output_tokens": 5 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("OVMS is great", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + // No reasoning output item when model output has no tags + ASSERT_EQ(serialized.find("\"type\":\"reasoning\""), std::string::npos) << "Reasoning item should not be present: " << serialized; + // Message item should still be present + ASSERT_NE(serialized.find("\"type\":\"message\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterInjectsEnableThinking) { + std::string json = R"({ + "model": "llama", + "input": "Think about this", + "reasoning": {"effort": "high", "summary": "auto"}, + "max_output_tokens": 100 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Verify that chat_template_kwargs was injected with enable_thinking: true + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterAllEffortValuesWork) { + for (const auto& effort : {"low", "medium", "high"}) { + std::string json = R"({"model": "llama", "input": "test", "reasoning": {"effort": ")" + std::string(effort) + R"("}, "max_output_tokens": 10})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()) << json; + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << "Failed for effort: " << effort; + + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()) << "enable_thinking not 
injected for effort: " << effort; + } +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterInvalidEffortRejected) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": {"effort": "invalid"}, + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_NE(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterDoesNotOverrideExplicitKwargs) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": {"effort": "high"}, + "chat_template_kwargs": {"enable_thinking": false}, + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // chat_template_kwargs should exist, but the explicit enable_thinking: false should be preserved + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterNotAnObjectRejected) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": "high", + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_NE(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContainsRequiredEvents) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Phase 1: Init events emitted via dedicated method (called right after scheduleExecution in calculator) + std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents(); + ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << initChunk; + // No delta event when text is empty + ASSERT_EQ(initChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << initChunk; + + // Verify correct 
event ordering: created < in_progress < output_item.added < content_part.added + auto createdPos = initChunk.find("\"type\":\"response.created\""); + auto inProgressPos = initChunk.find("\"type\":\"response.in_progress\""); + auto outputItemAddedPos = initChunk.find("\"type\":\"response.output_item.added\""); + auto contentPartAddedPos = initChunk.find("\"type\":\"response.content_part.added\""); + ASSERT_LT(createdPos, inProgressPos) << "response.created must come before response.in_progress"; + ASSERT_LT(inProgressPos, outputItemAddedPos) << "response.in_progress must come before response.output_item.added"; + ASSERT_LT(outputItemAddedPos, contentPartAddedPos) << "response.output_item.added must come before response.content_part.added"; + + // Phase 2: Second call should only contain delta, no repeated init events + std::string secondChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + ASSERT_TRUE(secondChunk.empty()) << "Empty text after init should produce no output: " << secondChunk; + + // Phase 3: Text delta + std::string deltaChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(deltaChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << deltaChunk; + ASSERT_NE(deltaChunk.find("\"delta\":\"Hello\""), std::string::npos) << deltaChunk; + ASSERT_EQ(deltaChunk.find("\"type\":\"response.created\""), std::string::npos) << "No repeated init events: " << deltaChunk; + + // Phase 4: Final chunk with finish reason + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"text\":\"Hello world\""), std::string::npos) << finalChunk; + + // Verify correct event ordering in final chunk: delta < output_text.done < content_part.done < output_item.done < completed + auto deltaPos = finalChunk.find("\"type\":\"response.output_text.delta\""); + auto textDonePos = finalChunk.find("\"type\":\"response.output_text.done\""); + auto partDonePos = finalChunk.find("\"type\":\"response.content_part.done\""); + auto itemDonePos = finalChunk.find("\"type\":\"response.output_item.done\""); + auto completedPos = finalChunk.find("\"type\":\"response.completed\""); + ASSERT_LT(deltaPos, textDonePos) << "delta must come before output_text.done"; + ASSERT_LT(textDonePos, partDonePos) << "output_text.done must come before content_part.done"; + ASSERT_LT(partDonePos, itemDonePos) << "content_part.done must come before output_item.done"; + ASSERT_LT(itemDonePos, completedPos) << "output_item.done must come before response.completed"; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesWithReasoningEmitsReasoningEvents) { + std::string json = R"({ + "model": "llama", + "input": "Think about this", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), 
*tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Phase 1: Init events - should only have created + in_progress (output items deferred) + std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents(); + ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk; + // Output item events should be deferred when parser is present + ASSERT_EQ(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "output_item.added should be deferred: " << initChunk; + ASSERT_EQ(initChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << "content_part.added should be deferred: " << initChunk; + + // Phase 2: Reasoning chunk with tag - should emit reasoning init + delta + std::string reasoningChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + // tag itself should be consumed by parser, no events + // (parser returns nullopt for tag tokens) + + // Phase 3: Reasoning content + std::string reasoningContent = apiHandler->serializeStreamingChunk("Let me think", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(reasoningContent.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should have reasoning output_item.added: " << reasoningContent; + ASSERT_NE(reasoningContent.find("\"type\":\"reasoning\""), std::string::npos) << "Output item should be reasoning type: " << reasoningContent; + ASSERT_NE(reasoningContent.find("\"type\":\"response.reasoning_summary_part.added\""), std::string::npos) << reasoningContent; + ASSERT_NE(reasoningContent.find("\"type\":\"response.reasoning_summary_text.delta\""), std::string::npos) << reasoningContent; + ASSERT_NE(reasoningContent.find("\"delta\":\"Let me think\""), std::string::npos) << reasoningContent; + + // Phase 4: More reasoning + std::string moreReasoning = apiHandler->serializeStreamingChunk(" harder", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(moreReasoning.find("\"type\":\"response.reasoning_summary_text.delta\""), std::string::npos) << moreReasoning; + ASSERT_NE(moreReasoning.find("\"delta\":\" harder\""), std::string::npos) << moreReasoning; + // Should NOT have another output_item.added + ASSERT_EQ(moreReasoning.find("\"type\":\"response.output_item.added\""), std::string::npos) << "No repeated init: " << moreReasoning; + + // Phase 5: End of reasoning with + std::string endThink = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + // tag consumed by parser + + // Phase 6: Content chunk - should close reasoning and open message + std::string contentChunk = apiHandler->serializeStreamingChunk("The answer", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(contentChunk.find("\"type\":\"response.reasoning_summary_text.done\""), std::string::npos) << "Should close reasoning: " << contentChunk; + ASSERT_NE(contentChunk.find("\"type\":\"response.reasoning_summary_part.done\""), std::string::npos) << contentChunk; + // Message item should be at output_index 1 + ASSERT_NE(contentChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should add message item: " << contentChunk; + ASSERT_NE(contentChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << 
contentChunk; + ASSERT_NE(contentChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << contentChunk; + + // Phase 7: Final chunk + std::string finalChunk = apiHandler->serializeStreamingChunk(" is 42", ov::genai::GenerationFinishReason::STOP); + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + // Completed event should contain reasoning in output + ASSERT_NE(finalChunk.find("\"type\":\"reasoning\""), std::string::npos) << "Completed response should include reasoning: " << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesWithoutReasoningWorksNormally) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Init events should be deferred (parser present) + std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents(); + ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; + ASSERT_EQ(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should be deferred: " << initChunk; + + // Content without reasoning - should emit message init events on first content + std::string contentChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(contentChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should init message: " << contentChunk; + ASSERT_NE(contentChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << contentChunk; + ASSERT_NE(contentChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << contentChunk; + // Should NOT have any reasoning events + ASSERT_EQ(contentChunk.find("\"type\":\"reasoning\""), std::string::npos) << "No reasoning: " << contentChunk; + ASSERT_EQ(contentChunk.find("\"type\":\"response.reasoning_summary"), std::string::npos) << "No reasoning: " << contentChunk; + + // Final chunk + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsEmpty) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, 
maxModelLength), absl::OkStatus()); + + ASSERT_EQ(apiHandler->serializeStreamingUsageChunk(), ""); +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesEmitsIncompleteOnLengthFinish) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Init events + apiHandler->serializeResponsesStreamingInitEvents(); + // Delta + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // Final chunk with LENGTH finish reason + std::string finalChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::LENGTH); + + // Should emit response.incomplete instead of response.completed + ASSERT_NE(finalChunk.find("\"type\":\"response.incomplete\""), std::string::npos) << finalChunk; + ASSERT_EQ(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << "Should not contain response.completed: " << finalChunk; + + // Should contain incomplete_details with max_tokens reason + ASSERT_NE(finalChunk.find("\"incomplete_details\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"reason\":\"max_tokens\""), std::string::npos) << finalChunk; + + // Response status should be "incomplete" + ASSERT_NE(finalChunk.find("\"status\":\"incomplete\""), std::string::npos) << finalChunk; + + // Should NOT contain completed_at + // Find the response.incomplete event section and check it doesn't have completed_at + auto incompletePos = finalChunk.find("\"type\":\"response.incomplete\""); + auto responseSection = finalChunk.substr(incompletePos); + ASSERT_EQ(responseSection.find("\"completed_at\""), std::string::npos) << "Incomplete response should not have completed_at: " << responseSection; + + // output_item.done should have status "incomplete" + auto itemDonePos = finalChunk.find("\"type\":\"response.output_item.done\""); + ASSERT_NE(itemDonePos, std::string::npos) << finalChunk; + auto itemSection = finalChunk.substr(itemDonePos); + ASSERT_NE(itemSection.find("\"status\":\"incomplete\""), std::string::npos) << "output_item.done should have incomplete status: " << itemSection; + + // Still should have the other finalization events + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesEmitsCompletedOnStopFinish) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Init events + apiHandler->serializeResponsesStreamingInitEvents(); + // 
Delta + finish with STOP + std::string finalChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::STOP); + + // Should emit response.completed, NOT response.incomplete + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_EQ(finalChunk.find("\"type\":\"response.incomplete\""), std::string::npos) << "Should not contain response.incomplete: " << finalChunk; + ASSERT_EQ(finalChunk.find("\"incomplete_details\""), std::string::npos) << "Should not contain incomplete_details: " << finalChunk; + + // Response status should be "completed" + ASSERT_NE(finalChunk.find("\"status\":\"completed\""), std::string::npos) << finalChunk; + + // Should contain spec-aligned fields + ASSERT_NE(finalChunk.find("\"error\":null"), std::string::npos) << "Should contain error:null: " << finalChunk; + ASSERT_NE(finalChunk.find("\"store\":true"), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"truncation\":\"disabled\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"metadata\":{}"), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventContainsCorrectStructure) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Something went wrong"); + + // Should contain response.failed event type + ASSERT_NE(failedEvent.find("\"type\":\"response.failed\""), std::string::npos) << failedEvent; + // Should NOT contain response.completed or response.incomplete + ASSERT_EQ(failedEvent.find("\"type\":\"response.completed\""), std::string::npos) << failedEvent; + ASSERT_EQ(failedEvent.find("\"type\":\"response.incomplete\""), std::string::npos) << failedEvent; + + // Should contain error object with code and message + ASSERT_NE(failedEvent.find("\"error\":{"), std::string::npos) << "Should contain error object: " << failedEvent; + ASSERT_NE(failedEvent.find("\"code\":\"server_error\""), std::string::npos) << failedEvent; + ASSERT_NE(failedEvent.find("\"message\":\"Something went wrong\""), std::string::npos) << failedEvent; + + // Response status should be "failed" + ASSERT_NE(failedEvent.find("\"status\":\"failed\""), std::string::npos) << failedEvent; + + // Should include init events since they were not emitted before + ASSERT_NE(failedEvent.find("\"type\":\"response.created\""), std::string::npos) << failedEvent; + + // Should contain sequence_number + ASSERT_NE(failedEvent.find("\"sequence_number\""), std::string::npos) << failedEvent; + + // Should NOT contain completed_at + auto failedPos = failedEvent.find("\"type\":\"response.failed\""); + auto responseSection = failedEvent.substr(failedPos); + ASSERT_EQ(responseSection.find("\"completed_at\""), std::string::npos) << "Failed response should not have completed_at: " << responseSection; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventWithCustomErrorCode) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + 
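+    // Shape sketch of the event asserted below (illustrative only; field order is not
+    // guaranteed and the test checks substrings, not the full document):
+    //   {"type":"response.failed", "sequence_number":N, "response":{...,
+    //    "status":"failed", "error":{"code":"invalid_prompt","message":"Invalid prompt content"}}}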
ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Invalid prompt content", ovms::ResponsesErrorCode::INVALID_PROMPT); + + ASSERT_NE(failedEvent.find("\"code\":\"invalid_prompt\""), std::string::npos) << failedEvent; + ASSERT_NE(failedEvent.find("\"message\":\"Invalid prompt content\""), std::string::npos) << failedEvent; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventAfterPartialStreaming) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Emit init events and some deltas first + apiHandler->serializeResponsesStreamingInitEvents(); + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // Then fail + std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Generation aborted"); + + // Should contain response.failed but NOT init events (already sent) + ASSERT_NE(failedEvent.find("\"type\":\"response.failed\""), std::string::npos) << failedEvent; + ASSERT_EQ(failedEvent.find("\"type\":\"response.created\""), std::string::npos) << "Should not re-emit init events: " << failedEvent; + + // Error should be present + ASSERT_NE(failedEvent.find("\"error\":{"), std::string::npos) << failedEvent; + ASSERT_NE(failedEvent.find("\"code\":\"server_error\""), std::string::npos) << failedEvent; + ASSERT_NE(failedEvent.find("\"message\":\"Generation aborted\""), std::string::npos) << failedEvent; + + // Should NOT contain usage (failed responses don't include usage) + ASSERT_EQ(failedEvent.find("\"usage\""), std::string::npos) << "Failed response should not include usage: " << failedEvent; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesIncompleteOnLength) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "max_output_tokens": 5 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::GenerationOutput genOutput; + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + genOutput.generated_ids = std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1]); + genOutput.finish_reason = ov::genai::GenerationFinishReason::LENGTH; + + std::vector generationOutputs = {genOutput}; + std::string 
serialized = apiHandler->serializeUnaryResponse(generationOutputs); + + // Should have status "incomplete" + ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized; + // Should have incomplete_details with reason + ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized; + // Should NOT have completed_at + ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized; + // Should NOT have status "completed" + ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized; + + // Should contain spec-aligned fields + ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompletedOnStop) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "max_output_tokens": 5 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::GenerationOutput genOutput; + ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + genOutput.generated_ids = std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1]); + genOutput.finish_reason = ov::genai::GenerationFinishReason::STOP; + + std::vector generationOutputs = {genOutput}; + std::string serialized = apiHandler->serializeUnaryResponse(generationOutputs); + + // Should have status "completed" + ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized; + // Should have completed_at + ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized; + // Should NOT have incomplete_details + ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized; + + // Should contain spec-aligned fields + ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg==" + } + } + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 3); + std::vector expectedBytes = {110, 181, 160}; + for (size_t i = 0; i < image.get_size(); i++) { + EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); + } + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { + SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"raw.githubusercontent.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 225792); + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllowedDomains) { + SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"raw.githubusercontent.com", "githubusercontent.com", "google.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 225792); + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { + SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 + std::string json = R"({ +"model": "llama", +"messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } +] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"raw.githubusercontent.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 225792); + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomainAll) { + SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 + std::string json = R"({ +"model": "llama", +"messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } +] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"all"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 225792); + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg==" + } + } + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 3); + std::vector expectedBytes = {54, 245, 241}; + for (size_t i = 0; i < image.get_size(); i++) { + EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); + } + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg==" + } + } + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Loading images from local filesystem is disabled.")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlHttpNotAllowedDomain) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" } } ] @@ -658,20 +1746,131 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllow doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"raw.githubusercontent.com", "githubusercontent.com", "google.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); + std::vector allowedDomains = {"wikipedia.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially1) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"githubusercontent.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially2) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"host.raw.githubusercontent.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsRegexNotSupported) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + std::vector allowedDomains = {"*githubusercontent.com"}; + ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystem) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": ")" + + getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + R"(" + } + } + ] + } + ] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test")), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); auto [index, image] = imageHistory[0]; EXPECT_EQ(index, 0); EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); + EXPECT_EQ(image.get_size(), 3); json = apiHandler->getProcessedJson(); EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPath) { std::string json = R"({ "model": "llama", "messages": [ @@ -685,7 +1884,43 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { { "type": "image_url", "image_url": { - "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" + "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + + R"(" + } + } + ] + } +] +})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test/binaryutils")), absl::OkStatus()); + const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(imageHistory.size(), 1); + auto [index, image] = imageHistory[0]; + EXPECT_EQ(index, 0); + EXPECT_EQ(image.get_element_type(), ov::element::u8); + EXPECT_EQ(image.get_size(), 3); + json = apiHandler->getProcessedJson(); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemNotWithinAllowedPath) { + std::string json = R"({ +"model": "llama", +"messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "/ovms/src/test/binaryutils/rgb.jpg"
+                }
+            }
+        ]
@@ -695,20 +1930,37 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) {
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
     std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"raw.githubusercontent.com"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus());
-    const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 1);
-    auto [index, image] = imageHistory[0];
-    EXPECT_EQ(index, 0);
-    EXPECT_EQ(image.get_element_type(), ov::element::u8);
-    EXPECT_EQ(image.get_size(), 225792);
-    json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}"));
+    ASSERT_EQ(apiHandler->parseMessages("src/test"), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path"));
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemInvalidPath) {
+    std::string json = R"({
+    "model": "llama",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What is in this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "/ovms/not_exisiting.jpeg"
+                    }
+                }
+            ]
+        }
+    ]
+})";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseMessages("/ovms/"), absl::InvalidArgumentError("Image file /ovms/not_exisiting.jpeg parsing failed: can't fopen"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomainAll) {
-    SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE();  // CVS-180127
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemInvalidEscaped) {
     std::string json = R"({
 "model": "llama",
 "messages": [
@@ -722,7 +1974,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomai
         {
             "type": "image_url",
             "image_url": {
-                "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"
+                "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") +
+                R"("
             }
         }
     ]
@@ -732,32 +1985,65 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomai
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
     std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"all"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus());
-    const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 1);
-    auto [index, image] = imageHistory[0];
-    EXPECT_EQ(index, 0);
-    EXPECT_EQ(image.get_element_type(), ov::element::u8);
-    EXPECT_EQ(image.get_size(), 225792);
-    json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}"));
+    std::string expectedMessage = "Path " + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") + " escape with .. is forbidden.";
+    EXPECT_EQ(apiHandler->parseMessages("/ovms/"), absl::InvalidArgumentError(expectedMessage.c_str()));
 }
 
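+// Multi-turn parsing: images from different user turns should land in ImageHistory
+// tagged with the index of the message they came from, while getProcessedJson() is
+// expected to flatten each content array back into a plain string message.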
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) {
     std::string json = R"({
     "model": "llama",
     "messages": [
         {
             "role": "user",
             "content": [
+                {
+                    "type": "text",
+                    "text": "What is in this image?"
+                },
                 {
                     "type": "image_url",
                     "image_url": {
-                        "url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg=="
+                        "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="
                     }
                 }
             ]
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "No idea my friend."
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What about this one?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="
+                    }
+                }
+            ]
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Same thing. I'm not very good with images."
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": "You were not trained with images, were you?"
         }
     ]
})";
@@ -766,20 +2052,27 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) {
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
     std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
     ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus());
     const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 1);
-    auto [index, image] = imageHistory[0];
-    EXPECT_EQ(index, 0);
-    EXPECT_EQ(image.get_element_type(), ov::element::u8);
-    EXPECT_EQ(image.get_size(), 3);
-    std::vector<uint8_t> expectedBytes = {54, 245, 241};
-    for (size_t i = 0; i < image.get_size(); i++) {
-        EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]);
+    ASSERT_EQ(imageHistory.size(), 2);
+    std::vector<uint8_t> expectedBytes = {110, 181, 160};
+    std::vector<size_t> expectedImageIndexes = {0, 2};
+    size_t i = 0;
+    for (auto [index, image] : imageHistory) {
+        EXPECT_EQ(index, expectedImageIndexes[i++]);
+        EXPECT_EQ(image.get_element_type(), ov::element::u8);
+        EXPECT_EQ(image.get_size(), 3);
+        for (size_t i = 0; i < image.get_size(); i++) {
+            EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]);
+        }
     }
     json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\"}]}"));
+    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"},"
+                                "{\"role\":\"assistant\",\"content\":\"No idea my friend.\"},"
+                                "{\"role\":\"user\",\"content\":\"What about this one?\"},"
+                                "{\"role\":\"assistant\",\"content\":\"Same thing. I'm not very good with images.\"},"
+                                "{\"role\":\"user\",\"content\":\"You were not trained with images, were you?\"}]}"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesWithInvalidContentTypeFails) {
     std::string json = R"({
     "model": "llama",
     "messages": [
@@ -790,10 +2083,30 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails
         {
             "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this image?"
                },
+                {
+                    "type": "INVALID"
+                }
+            ]
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Unsupported content type"));
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyImageUrlFails) {
+    std::string json = R"({
+    "model": "llama",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
                {
                    "type": "image_url",
                    "image_url": {
-                        "url": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="
+                        "url": ""
                    }
                }
            ]
@@ -806,516 +2119,547 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails
     EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Loading images from local filesystem is disabled."));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlHttpNotAllowedDomain) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageUrlNotBase64Fails) {
     std::string json = R"({
-    "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"
-                    }
-                }
-            ]
-        }
-    ]
-})";
+    "model": "llama",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "base64,NOTBASE64"
+                    }
+                }
+            ]
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid base64 string in request"));
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyContentArrayFails) {
+    std::string json = R"({
+    "model": "llama",
+    "messages": [
+        {
+            "role": "user",
+            "content": []
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid message structure - content array is empty"));
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, maxTokensValueDefaultToMaxTokensLimit) {
+    std::string json = R"({
+    "model": "llama",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "valid prompt"
+                }
+            ]
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+    uint32_t maxTokensLimit = 10;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->getMaxTokens().has_value());
+    EXPECT_EQ(apiHandler->getMaxTokens().value(), maxTokensLimit);
+}
+
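+// Fields that clients may legally send as JSON null; parseRequest is expected to
+// treat null the same as an omitted field and still return OkStatus() for both
+// the chat/completions and completions endpoints.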
"assistant_confidence_threshold", + "logprobs", "max_completion_tokens", "tools", "tool_choice"}; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + for (auto param : chatParamsThatAcceptNull) { + std::string json = R"({ + "model": "llama", + ")" + param + R"(": null, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "valid prompt" + } + ] } ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); } - ] -})"; +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions) { + std::vector chatParamsThatAcceptNull = {"stream", "stream_options", "ignore_eos", "frequency_penalty", "presence_penalty", "repetition_penalty", + "length_penalty", "temperature", "top_p", "top_k", "seed", "stop", "include_stop_str_in_output", "best_of", "n", "num_assistant_tokens", "assistant_confidence_threshold", + "logprobs", "echo"}; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + for (auto param : chatParamsThatAcceptNull) { + std::string json = R"({ + "model": "llama", + ")" + param + R"(": null, + "prompt": "valid prompt" + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + } +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsLimit) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_output_tokens": 42 + })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"wikipedia.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 42); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially1) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxCompletionTokensIsIgnored) { std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" 
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsLimit) {
+    std::string json = R"({
+    "model": "llama",
+    "input": "valid prompt",
+    "max_output_tokens": 42
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"wikipedia.com"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->getMaxTokens().has_value());
+    EXPECT_EQ(apiHandler->getMaxTokens().value(), 42);
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially1) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxCompletionTokensIsIgnored) {
     std::string json = R"({
-    "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"
-                    }
-                }
-            ]
-        }
-    ]
-})";
+    "model": "llama",
+    "input": "valid prompt",
+    "max_completion_tokens": 50
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"githubusercontent.com"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    // max_completion_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50
+    EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50);
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially2) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxTokensIsIgnored) {
     std::string json = R"({
-    "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"
-                    }
-                }
-            ]
-        }
-    ]
-})";
+    "model": "llama",
+    "input": "valid prompt",
+    "max_tokens": 50
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"host.raw.githubusercontent.com"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    // max_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50
+    EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50);
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsRegexNotSupported) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNStreamingIsRejected) {
     std::string json = R"({
-    "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"
-                    }
-                }
-            ]
-        }
-    ]
-})";
+    "model": "llama",
+    "input": "valid prompt",
+    "stream": true,
+    "n": 2
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::vector<std::string> allowedDomains = {"*githubusercontent.com"};
-    ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("n greater than 1 is not supported for Responses API streaming"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystem) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNUnaryIsAccepted) {
     std::string json = R"({
-    "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": ")" +
-                               getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + R"("
-                    }
-                }
-            ]
-        }
-    ]
-})";
+    "model": "llama",
+    "input": "valid prompt",
+    "best_of": 3,
+    "n": 2
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test")), absl::OkStatus());
-    const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 1);
-    auto [index, image] = imageHistory[0];
-    EXPECT_EQ(index, 0);
-    EXPECT_EQ(image.get_element_type(), ov::element::u8);
-    EXPECT_EQ(image.get_size(), 3);
-    json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 100;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPath) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesFlatFunctionToolsSucceeds) {
     std::string json = R"({
-"model": "llama",
-"messages": [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What is in this image?"
-            },
+    "model": "llama",
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": "auto",
+    "tools": [
         {
-            "type": "image_url",
-            "image_url": {
-                "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") +
-                R"("
+            "type": "function",
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"]
+                    }
+                },
+                "required": ["location", "unit"]
+            }
+        }
+    ]
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test/binaryutils")), absl::OkStatus());
-    const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 1);
-    auto [index, image] = imageHistory[0];
-    EXPECT_EQ(index, 0);
-    EXPECT_EQ(image.get_element_type(), ov::element::u8);
-    EXPECT_EQ(image.get_size(), 3);
-    json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->areToolsAvailable());
+    EXPECT_EQ(apiHandler->getToolChoice(), "auto");
 }
 
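+// tool_choice may also be sent as an object ({"type": "function", "name": ...});
+// the handler is expected to resolve it to the named function from the tools array.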
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemNotWithinAllowedPath) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectSucceeds) {
     std::string json = R"({
-"model": "llama",
-"messages": [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What is in this image?"
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": "/ovms/src/test/binaryutils/rgb.jpg"
-                }
-            }
-        ]
-    }
-]
-})";
+    "model": "llama",
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": {
+        "type": "function",
+        "name": "get_current_weather"
+    },
+    "tools": [
+        {
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
+                },
+                "required": ["location"]
+            }
+        },
+        {
+            "type": "function",
+            "name": "unused_tool",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "arg": {"type": "string"}
+                }
+            }
+        }
+    ]
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    ASSERT_EQ(apiHandler->parseMessages("src/test"), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_TRUE(apiHandler->areToolsAvailable());
+    EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather");
 }
 
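+// The serialization tests below encode "Sunny" into real token ids and wrap them in
+// ov::genai::EncodedResults so that serializeUnaryResponse() has output to decode;
+// tools and tool_choice are expected to be echoed back in the serialized response.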
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemInvalidEscaped) {
+TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunctionTools) {
     std::string json = R"({
-"model": "llama",
-"messages": [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What is in this image?"
-            },
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") +
-                    R"("
-                }
-            }
-        ]
-    }
-]
-})";
+    "model": "llama",
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": "auto",
+    "tools": [
+        {
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
+                },
+                "required": ["location"]
+            }
+        }
+    ]
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    std::string expectedMessage = "Path " + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") + " escape with .. is forbidden.";
-    EXPECT_EQ(apiHandler->parseMessages("/ovms/"), absl::InvalidArgumentError(expectedMessage.c_str()));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"tools\":[{"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized;
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) {
+TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunctionToolChoiceObject) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="
-                    }
-                }
-            ]
-        },
-        {
-            "role": "assistant",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "No idea my friend."
-                }
-            ]
-        },
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": {
+        "type": "function",
+        "name": "get_current_weather"
+    },
+    "tools": [
         {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What about this one?"
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
                 },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="
-                    }
-                }
-            ]
-        },
-        {
-            "role": "assistant",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "Same thing. I'm not very good with images."
-                }
-            ]
-        },
-        {
-            "role": "user",
-            "content": "You were not trained with images, were you?"
+                "required": ["location"]
+            }
+        }
+    ]
+    })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus());
-    const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory();
-    ASSERT_EQ(imageHistory.size(), 2);
-    std::vector<uint8_t> expectedBytes = {110, 181, 160};
-    std::vector<size_t> expectedImageIndexes = {0, 2};
-    size_t i = 0;
-    for (auto [index, image] : imageHistory) {
-        EXPECT_EQ(index, expectedImageIndexes[i++]);
-        EXPECT_EQ(image.get_element_type(), ov::element::u8);
-        EXPECT_EQ(image.get_size(), 3);
-        for (size_t i = 0; i < image.get_size(); i++) {
-            EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]);
-        }
-    }
-    json = apiHandler->getProcessedJson();
-    EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"},"
-                                "{\"role\":\"assistant\",\"content\":\"No idea my friend.\"},"
-                                "{\"role\":\"user\",\"content\":\"What about this one?\"},"
-                                "{\"role\":\"assistant\",\"content\":\"Same thing. I'm not very good with images.\"},"
-                                "{\"role\":\"user\",\"content\":\"You were not trained with images, were you?\"}]}"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized;
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesWithInvalidContentTypeFails) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectMissingNameFails) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": {
+        "type": "function"
+    },
+    "tools": [
         {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What is in this image?"
-                },
-                {
-                    "type": "INVALID"
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
                 }
-            ]
+            }
         }
     ]
     })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Unsupported content type"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyImageUrlFails) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectNameNotStringFails) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": {
+        "type": "function",
+        "name": 7
+    },
+    "tools": [
         {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": ""
-                    }
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
                 }
-            ]
+            }
         }
     ]
     })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Loading images from local filesystem is disabled."));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageUrlNotBase64Fails) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlObjectSucceeds) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
+    "input": [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "base64,NOTBASE64"
-                    }
-                }
+                {"type": "input_text", "text": "what is in this image?"},
+                {"type": "input_image", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="}}
             ]
         }
     ]
     })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_EQ(apiHandler->getImageHistory().size(), 1);
 }
 
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageWithoutImageUrlFails) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
+    "input": [
         {
             "role": "user",
-            "content": []
+            "content": [
+                {"type": "input_text", "text": "what is in this image?"},
+                {"type": "input_image"}
+            ]
         }
     ]
     })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid message structure - content array is empty"));
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image requires image_url field"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, maxTokensValueDefaultToMaxTokensLimit) {
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlInvalidTypeFails) {
     std::string json = R"({
     "model": "llama",
-    "messages": [
+    "input": [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "text",
-                    "text": "valid prompt"
-                }
+                {"type": "input_text", "text": "what is in this image?"},
+                {"type": "input_image", "image_url": 123}
             ]
         }
     ]
     })";
     doc.Parse(json.c_str());
     ASSERT_FALSE(doc.HasParseError());
-    uint32_t maxTokensLimit = 10;
+    std::optional<uint32_t> maxTokensLimit;
     uint32_t bestOfLimit = 0;
     std::optional<uint32_t> maxModelLength;
-    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
-    EXPECT_TRUE(apiHandler->getMaxTokens().has_value());
-    EXPECT_EQ(apiHandler->getMaxTokens().value(), maxTokensLimit);
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler =
+        std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image.image_url must be a string or object"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersChat) {
-    std::vector<std::string> chatParamsThatAcceptNull = {"stream", "stream_options", "ignore_eos", "frequency_penalty", "presence_penalty", "repetition_penalty",
-        "length_penalty", "temperature", "top_p", "top_k", "seed", "stop", "include_stop_str_in_output", "best_of", "n", "num_assistant_tokens", "assistant_confidence_threshold",
-        "logprobs", "max_completion_tokens", "tools", "tool_choice"};
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesUnsupportedToolTypeFails) {
+    std::string json = R"({
+    "model": "llama",
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": "auto",
+    "tools": [
+        {
+            "type": "web_search_preview"
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
     std::optional<uint32_t> maxTokensLimit;
     uint32_t bestOfLimit = 0;
     std::optional<uint32_t> maxModelLength;
-    for (auto param : chatParamsThatAcceptNull) {
-        std::string json = R"({...})";
-        doc.Parse(json.c_str());
-        ASSERT_FALSE(doc.HasParseError());
-        std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-        EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
-    }
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Only function tools are supported"));
 }
 
-TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions) {
-    std::vector<std::string> chatParamsThatAcceptNull = {"stream", "stream_options", "ignore_eos", "frequency_penalty", "presence_penalty", "repetition_penalty",
-        "length_penalty", "temperature", "top_p", "top_k", "seed", "stop", "include_stop_str_in_output", "best_of", "n", "num_assistant_tokens", "assistant_confidence_threshold",
-        "logprobs", "echo"};
+TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceNoneRemovesTools) {
+    std::string json = R"({
+    "model": "llama",
+    "input": "What is the weather like in Boston today?",
+    "tool_choice": "none",
+    "tools": [
+        {
+            "type": "function",
+            "name": "get_current_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string"}
+                }
+            }
+        }
+    ]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
     std::optional<uint32_t> maxTokensLimit;
     uint32_t bestOfLimit = 0;
     std::optional<uint32_t> maxModelLength;
-    for (auto param : chatParamsThatAcceptNull) {
-        std::string json = R"({
-            "model": "llama",
-            ")" + param + R"(": null,
-            "prompt": "valid prompt"
-        })";
-        doc.Parse(json.c_str());
-        ASSERT_FALSE(doc.HasParseError());
-        std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
-        EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
-    }
+    std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+    EXPECT_FALSE(apiHandler->areToolsAvailable());
+    EXPECT_EQ(apiHandler->getToolChoice(), "none");
 }
 
 // Provide get_weather2 but take none
diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp
index 4dc22d6fa3..5f2b380556 100644
--- a/src/test/llm/visual_language_model/complete_flow_test.cpp
+++ b/src/test/llm/visual_language_model/complete_flow_test.cpp
@@ -49,6 +49,7 @@ class VLMServableExecutionTest : public ::testing::Test {
     std::unordered_map<std::string, std::string> headers{{"content-type", "application/json"}};
     ovms::HttpRequestComponents comp;
     const std::string endpointChatCompletions = "/v3/chat/completions";
+    const std::string endpointResponses = "/v3/responses";
     std::shared_ptr writer;
    std::shared_ptr multiPartParser;
     std::string response;
@@ -129,6 +130,50 @@ static std::string createRequestBody(const std::string& modelName, const std::ve
     return oss.str();
 }
 
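+// Helper assembling a Responses API request body: an optional input_text part,
+// numberOfImages base64-encoded input_image parts, plus any extra top-level
+// fields (e.g. max_output_tokens, tools) appended from the fields vector.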
","; + } + } + for (int i = 0; i < numberOfImages; i++) { + oss << R"( + { + "type": "input_image", + "image_url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg==" + })"; + if (i < numberOfImages - 1) { + oss << ","; + } + } + oss << R"( + ] + } + ] + )"; + for (const auto& field : fields) { + oss << R"(, ")" << field.first << R"(": )" << field.second << R"()" + << "\n"; + } + oss << "\n}"; + return oss.str(); +} + class VLMServableExecutionTestParameterized : public VLMServableExecutionTest, public ::testing::WithParamInterface {}; // Unary flow @@ -304,6 +349,152 @@ TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); } +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("model")); + EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); + ASSERT_TRUE(parsedResponse.HasMember("output")); + ASSERT_TRUE(parsedResponse["output"].IsArray()); + ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0].IsObject()); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["type"].GetString(), "message"); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("content")); + ASSERT_TRUE(parsedResponse["output"][0]["content"].IsArray()); + ASSERT_GT(parsedResponse["output"][0]["content"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0]["content"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["content"][0]["type"].GetString(), "output_text"); + + ASSERT_TRUE(parsedResponse.HasMember("usage")); + ASSERT_TRUE(parsedResponse["usage"].IsObject()); + ASSERT_TRUE(parsedResponse["usage"].HasMember("input_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("output_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("total_tokens")); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesOnlyImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields, false, 1); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + 
+    EXPECT_STREQ(parsedResponse["object"].GetString(), "response");
+    ASSERT_TRUE(parsedResponse.HasMember("output"));
+    ASSERT_TRUE(parsedResponse["output"].IsArray());
+    ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0);
+}
+
+TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithTools) {
+    auto modelName = GetParam();
+    std::vector<std::pair<std::string, std::string>> fields = {
+        {"max_output_tokens", "5"},
+        {"temperature", "0.0"},
+        {"tool_choice", R"("auto")"},
+        {"tools", R"([
+            {
+                "type": "function",
+                "name": "get_weather",
+                "description": "Get weather by city",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        ])"}};
+    std::string requestBody = createResponsesRequestBody(modelName, fields);
+
+    ovms::HttpRequestComponents responsesComp;
+    ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK);
+
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+
+    parsedResponse.Parse(response.c_str());
+    ASSERT_TRUE(parsedResponse.IsObject());
+    ASSERT_TRUE(parsedResponse.HasMember("object"));
+    EXPECT_STREQ(parsedResponse["object"].GetString(), "response");
+    ASSERT_TRUE(parsedResponse.HasMember("tools"));
+    ASSERT_TRUE(parsedResponse["tools"].IsArray());
+    ASSERT_GT(parsedResponse["tools"].GetArray().Size(), 0);
+    ASSERT_TRUE(parsedResponse.HasMember("tool_choice"));
+    ASSERT_TRUE(parsedResponse["tool_choice"].IsString());
+    EXPECT_STREQ(parsedResponse["tool_choice"].GetString(), "auto");
+}
+
+TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithFunctionToolChoiceObject) {
+    auto modelName = GetParam();
+    std::vector<std::pair<std::string, std::string>> fields = {
+        {"max_output_tokens", "5"},
+        {"temperature", "0.0"},
+        {"tool_choice", R"({"type":"function","name":"get_weather"})"},
+        {"tools", R"([
+            {
+                "type": "function",
+                "name": "get_weather",
+                "description": "Get weather by city",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string"
+                        }
+                    },
+                    "required": ["city"]
+                }
+            }
+        ])"}};
+    std::string requestBody = createResponsesRequestBody(modelName, fields);
+
+    ovms::HttpRequestComponents responsesComp;
+    ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK);
+
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK);
+
+    parsedResponse.Parse(response.c_str());
+    ASSERT_TRUE(parsedResponse.IsObject());
+    ASSERT_TRUE(parsedResponse.HasMember("tool_choice"));
+    ASSERT_TRUE(parsedResponse["tool_choice"].IsObject());
+    ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("type"));
+    EXPECT_STREQ(parsedResponse["tool_choice"]["type"].GetString(), "function");
+    ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("name"));
+    EXPECT_STREQ(parsedResponse["tool_choice"]["name"].GetString(), "get_weather");
+}
+
 // Stream flow
 
 TEST_P(VLMServableExecutionTestParameterized, streamBasic) {