2 changes: 1 addition & 1 deletion .github/workflows/api-docs.yaml
@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.10' ]
python-version: [ '3.11' ]
max-parallel: 1

steps:
5 changes: 4 additions & 1 deletion .github/workflows/docker.yaml
@@ -19,7 +19,7 @@ jobs:
- uses: actions/checkout@v4

- name: Lint
run: hadolint --ignore DL3008 --ignore DL3013 --ignore DL3003 --ignore DL4006 docker/Dockerfile* docker/**/Dockerfile*
run: hadolint --ignore DL3008 --ignore DL4006 --ignore DL3006 --ignore SC2046 docker/Dockerfile

build-and-push:
needs: lint
@@ -74,6 +74,9 @@ jobs:
platforms: linux/amd64,linux/arm64
context: .
file: docker/Dockerfile
build-args: |
IMAGE_TYPE=gpu
PIP_EXTRAS=llm
push: true
tags: ${{ steps.cms_meta.outputs.tags }}
labels: ${{ steps.cms_meta.outputs.labels }}
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -24,7 +24,7 @@ jobs:
- name: Install uv and set Python to ${{ matrix.python-version }}
uses: astral-sh/setup-uv@v6
with:
version: "0.8.10"
version: "0.9.30"
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
113 changes: 113 additions & 0 deletions .github/workflows/release-gpu.yaml
@@ -0,0 +1,113 @@
name: release-gpu

on:
release:
types: [published]

env:
REGISTRY: docker.io
CMS_GPU_IMAGE_NAME: cogstacksystems/cogstack-modelserve-gpu

jobs:
ensure-branch:
runs-on: ubuntu-latest
outputs:
is-valid: ${{ steps.ensure-branch.outputs.is-valid }}
steps:
- name: Ensure the release is from the production branch only
id: ensure-branch
run: |
TARGET_BRANCH="${{ github.event.release.target_commitish }}"
if [ "$TARGET_BRANCH" != "production" ]; then
echo "Only releases from the 'production' branch are allowed but found: $TARGET_BRANCH"
echo "is-valid=false" >> "$GITHUB_OUTPUT"
exit 1
else
echo "Target release branch is: $TARGET_BRANCH"
echo "is-valid=true" >> "$GITHUB_OUTPUT"
fi

qc:
runs-on: ubuntu-latest
needs: ensure-branch
if: needs.ensure-branch.outputs.is-valid == 'true'
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "0.9.30"
python-version: "3.11"
- name: Install dependencies
run: |
uv sync --locked --extra dev --extra docs --extra llm
uv run python -m ensurepip
- name: Run unit tests
run: |
uv run pytest -v tests/app --cov --cov-report=html:coverage_reports #--random-order
- name: Run integration tests
run: |
uv run pytest -s -v tests/integration

release-gpu:
runs-on: ubuntu-latest
needs: [ensure-branch, qc]
if: needs.ensure-branch.outputs.is-valid == 'true'
permissions:
contents: read
packages: write
id-token: write
attestations: write
steps:
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Extract the tag
run: |
echo "RELEASE_VERSION=${GITHUB_REF/refs\/tags\/v/}" >> $GITHUB_ENV

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Extract CMS meta
id: cms_meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}

- name: Build and push CMS image
uses: docker/build-push-action@v6
id: build_and_push_cms
with:
platforms: linux/amd64,linux/arm64
context: .
file: docker/Dockerfile
build-args: |
IMAGE_TYPE=gpu
PIP_EXTRAS=llm
push: true
github-token: ${{ github.token }}
tags: |
${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}
labels: ${{ steps.cms_meta.outputs.labels }}

- name: Attest CMS image artifacts
uses: actions/attest-build-provenance@v2
with:
subject-name: ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}
subject-digest: ${{ steps.build_and_push_cms.outputs.digest }}
push-to-registry: true

- name: Inspect the released image
run: |
docker pull ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}
docker image inspect ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}
6 changes: 3 additions & 3 deletions .github/workflows/release.yaml
@@ -37,11 +37,11 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "0.8.10"
python-version: "3.10"
version: "0.9.30"
python-version: "3.11"
- name: Install dependencies
run: |
uv sync --extra dev --extra docs --extra llm
uv sync --locked --extra dev --extra docs --extra llm
uv run python -m ensurepip
- name: Run unit tests
run: |
1 change: 0 additions & 1 deletion .gitignore
@@ -91,7 +91,6 @@ venv/
ENV/
env.bak/
venv.bak/
.env

# Spyder project settings
.spyderproject
99 changes: 92 additions & 7 deletions app/api/routers/generative.py
@@ -10,7 +10,12 @@
from fastapi import APIRouter, Depends, Request, Body, Query
from fastapi.encoders import jsonable_encoder
from fastapi.responses import PlainTextResponse, StreamingResponse, JSONResponse
from starlette.status import HTTP_200_OK, HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR
from starlette.status import (
HTTP_200_OK,
HTTP_400_BAD_REQUEST,
HTTP_404_NOT_FOUND,
HTTP_500_INTERNAL_SERVER_ERROR,
)
from app.domain import (
Tags,
TagsGenerative,
@@ -35,6 +40,7 @@
PATH_CHAT_COMPLETIONS = "/v1/chat/completions"
PATH_COMPLETIONS = "/v1/completions"
PATH_EMBEDDINGS = "/v1/embeddings"
PATH_MODELS = "/v1/models"

router = APIRouter()
config = get_settings()
@@ -200,7 +206,12 @@ def generate_chat_completions(
max_tokens = request_data.max_tokens
temperature = request_data.temperature
top_p = request_data.top_p
stop_sequences = request_data.stop_sequences
if isinstance(request_data.stop, str):
stop_sequences = [request_data.stop]
elif isinstance(request_data.stop, list):
stop_sequences = request_data.stop
else:
stop_sequences = []
tracking_id = tracking_id or str(uuid.uuid4())

if not messages:
@@ -337,12 +348,11 @@ def generate_text_completions(
max_tokens = request_data.max_tokens
temperature = request_data.temperature
top_p = request_data.top_p
stop = request_data.stop

if isinstance(stop, str):
stop_sequences = [stop]
elif isinstance(stop, list):
stop_sequences = stop
if isinstance(request_data.stop, str):
stop_sequences = [request_data.stop]
elif isinstance(request_data.stop, list):
stop_sequences = request_data.stop
else:
stop_sequences = []

@@ -534,6 +544,81 @@ def embed_texts(
)


@router.get(
PATH_MODELS,
tags=[Tags.OpenAICompatible],
dependencies=[Depends(cms_globals.props.current_active_user)],
description="List available models, similar to OpenAI's /v1/models endpoint",
)
def list_models(
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> JSONResponse:
"""
Lists all available models, mimicking OpenAI's /v1/models endpoint.

Args:
model_service (AbstractModelService): The model service dependency.

Returns:
JSONResponse: A response containing the list of models.
"""
response = {
"object": "list",
"data": [
{
"id": model_service.model_name.replace(" ", "_"),
"object": "model",
"created": 0,
"owned_by": "cms",
}
],
}
return JSONResponse(content=response)


@router.get(
PATH_MODELS + "/{model_name}",
tags=[Tags.OpenAICompatible],
dependencies=[Depends(cms_globals.props.current_active_user)],
description="Get a specific model, similar to OpenAI's /v1/models/{model_id} endpoint",
)
def get_model(
model_name: str,
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> JSONResponse:
"""
Gets a specific model by ID, mimicking OpenAI's /v1/models/{model_id} endpoint.

Args:
model_name (str): The model name to retrieve.
model_service (AbstractModelService): The model service dependency.

Returns:
JSONResponse: A response containing the model details.
"""
if model_name != model_service.model_name.replace(" ", "_"):
error_response = {
"error": {
"message": f"The model `{model_name}` does not exist",
"type": "invalid_request_error",
"param": None,
"code": "model_not_found",
}
}
return JSONResponse(content=error_response, status_code=HTTP_404_NOT_FOUND)
response = {
"id": model_name,
"object": "model",
"created": 0,
"owned_by": "cms",
"permission": [],
"root": model_name,
"parent": None,
}
return JSONResponse(content=response)
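
Since these routes mirror OpenAI's model-listing API, any OpenAI-compatible client should be able to discover the served model. A minimal sketch using the `openai` Python package, assuming a CMS instance at `http://localhost:8000` and a valid access token (both hypothetical here):

```python
from openai import OpenAI

# Hypothetical deployment details; adjust base_url and api_key to your setup.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="<ACCESS_TOKEN>")

models = client.models.list()            # GET /v1/models
model_id = models.data[0].id             # model name with spaces replaced by "_"
print(client.models.retrieve(model_id))  # GET /v1/models/{model_name}
```

Requesting any other model name returns a 404 with an OpenAI-style `model_not_found` error body, as implemented above.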


def _empty_prompt_error() -> Iterable[str]:
yield "ERROR: No prompt text provided\n"

1 change: 1 addition & 0 deletions app/config.py
@@ -38,6 +38,7 @@ class Settings(BaseSettings):  # type: ignore
HF_PIPELINE_AGGREGATION_STRATEGY: str = "simple" # the strategy used for aggregating the predictions of the Hugging Face NER model
LOG_PER_CONCEPT_ACCURACIES: str = "false" # if "true", per-concept accuracies will be exposed to the metrics scraper. Switch this on with caution due to the potentially high number of concepts
MEDCAT2_MAPPED_ONTOLOGIES: str = "" # the comma-separated names of ontologies for MedCAT2 to map to
ENABLE_SPDA_ATTN: str = "true" # if "true", attempt to use SDPA (scaled dot-product attention) for Hugging Face LLM loading
DEBUG: str = "false" # if "true", the debug mode is switched on

class Config:
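
For context, a flag like this usually maps onto the `attn_implementation` argument of Hugging Face model loading. A minimal sketch of the intent, assuming the standard `transformers` API — the actual wiring inside CMS may differ:

```python
import torch
from transformers import AutoModelForCausalLM

from app.config import get_settings

config = get_settings()
# "SPDA" in the setting name refers to PyTorch's scaled dot-product attention (SDPA).
attn_implementation = "sdpa" if config.ENABLE_SPDA_ATTN == "true" else "eager"

model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-llm",  # hypothetical model id
    torch_dtype=torch.float16,
    attn_implementation=attn_implementation,  # falls back to eager attention when disabled
)
```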
7 changes: 5 additions & 2 deletions app/domain.py
@@ -218,7 +218,10 @@ class OpenAIChatCompletionsRequest(BaseModel):
model: str = Field(..., description="The name of the model used for generating the completion")
temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
stop_sequences: Optional[List[str]] = Field(default=None, description="The list of sequences used to stop the generation")
stop: Optional[Union[str, List[str]]] = Field(
default=None,
description="The single sequence or the list of sequences used to stop the generation",
)


class OpenAIChatCompletionsResponse(BaseModel):
@@ -242,7 +245,7 @@ class OpenAICompletionsRequest(BaseModel):
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
stop: Optional[Union[str, List[str]]] = Field(
default=None,
description="The list of sequences used to stop the generation",
description="The single sequence or the list of sequences used to stop the generation",
)


3 changes: 3 additions & 0 deletions app/envs/.env
@@ -79,5 +79,8 @@ TRAINING_HF_TAGGING_SCHEME=flat
# The comma-separated names of ontologies for MedCAT2 to map to
MEDCAT2_MAPPED_ONTOLOGIES=opcs4,icd10

# If "true", attempt to use SPDA attention for Hugging Face LLM loading
ENABLE_SPDA_ATTN=true

# If "true", the debug mode is switched on
DEBUG=false
6 changes: 3 additions & 3 deletions app/mcp/README.md
@@ -92,7 +92,7 @@ cms mcp run --transport sse
"mcp-remote",
"http://127.0.0.1:8080/sse",
"--header",
"X-API-Key:${AUTH_HEADER}"
"AUTHORIZATION:${AUTH_HEADER}"
],
"env": {
"AUTH_HEADER": "Bearer <ACCESS_TOKEN>"
@@ -123,7 +123,7 @@ cms mcp run --transport sse
| `CMS_ACCESS_TOKEN` | Empty | Bearer token for ModelServe API |
| `CMS_API_KEY` | `Bearer` | API key for ModelServe API |
| `CMS_MCP_API_KEYS` | None | Comma-separated API keys for authentication |
| `CMS_MCP_OAUTH_ENABLED` | `false` | Enable OAuth authentication |
| `CMS_MCP_OAUTH_PROVIDER` | Empty | Enable OAuth authentication if set to "github" or "google" |
| `CMS_MCP_BASE_URL` | `http://<host>:<port>` | Base URL for OAuth callback |
| `CMS_MCP_DEV` | `0` | Run in development mode |

@@ -137,7 +137,7 @@ When `CMS_MCP_API_KEYS` is set, clients must authenticate using:
- **Header**: `X-API-Key: your-key`

### 2. OAuth Authentication (SSE Transport)
When `CMS_MCP_OAUTH_ENABLED=true`, the server provides a built-in OAuth 2.0 login flow for SSE transport.
When `CMS_MCP_OAUTH_PROVIDER` is set, the server provides a built-in OAuth 2.0 login flow for SSE transport.

**OAuth Endpoints:**
- `/oauth/login` - Login page with Google and GitHub options