ALCF Inference Endpoints¶
Unlock Powerful AI Inference at Argonne Leadership Computing Facility (ALCF). This service provides API access to a variety of state-of-the-art open-source models running on dedicated ALCF hardware.
Quick Start¶
This guide will walk you through the fastest ways to start using the ALCF Inference Endpoints.
Web UI¶
The easiest way to get started is through the web interface, accessible at https://inference.alcf.anl.gov/
The UI is based on the popular Open WebUI platform. After logging in with your ANL or ALCF credentials, you can:
- Select a model from the dropdown menu at the top of the screen.
- Start a conversation directly in the chat interface.
In the model selection dropdown, you can see the status of each model:
- Live: These models are "hot" and ready for immediate use.
- Starting: A node has been acquired and the model is being loaded into memory.
- Queued: The model is in a queue waiting for resources to become available.
- Offline: The model is available but not currently loaded. It will be queued for loading when a user sends a request.
- All: Lists all available models regardless of their status.
For Advanced UI Features
For a full guide on advanced features like RAG (Retrieval-Augmented Generation), function calling, and more, please refer to the official Open WebUI documentation.
API Access¶
For programmatic access, you can use the API endpoints directly.
1. Setup Your Environment¶
You can run the following setup from anywhere (your local machine, or an ALCF machine).
# Create a new Conda environment
conda create -n globus_env python=3.11.9 -y
conda activate globus_env
# Install necessary packages
pip install openai globus_sdk
2. Authenticate¶
To access the endpoints, you need an authentication token.
# Download the authentication helper script
wget https://raw.githubusercontent.com/argonne-lcf/inference-endpoints/refs/heads/main/inference_auth_token.py
# Authenticate with your Globus account
python inference_auth_token.py authenticate
This will generate and store access and refresh tokens in your home directory.
Token Validity
- Access tokens are valid for 48 hours. The `get_access_token` command will automatically refresh your token if it has expired.
- An internal policy requires re-authentication every 7 days. If you encounter permission errors, log out from Globus at app.globus.org/logout and re-run `python inference_auth_token.py authenticate --force`.
3. Make a Test Call¶
Once authenticated, you can make a test call using cURL or Python.
cURL:
#!/bin/bash
# Get your access token
access_token=$(python inference_auth_token.py get_access_token)
curl -X POST "https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1/chat/completions" \
-H "Authorization: Bearer ${access_token}" \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"messages":[{"role": "user", "content": "Explain quantum computing in simple terms."}]
}'
Python:
from openai import OpenAI
from inference_auth_token import get_access_token
# Get your access token
access_token = get_access_token()
client = OpenAI(
api_key=access_token,
base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)
response = client.chat.completions.create(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
messages=[{"role": "user", "content": "Explain quantum computing in simple terms."}]
)
print(response.choices[0].message.content)
API Usage Examples¶
Querying Endpoint Status¶
You can check the status of models on the cluster and list all available endpoints programmatically.
This endpoint provides information about what is currently live or queued.
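A minimal sketch of such a status query, assuming the `/jobs` route mentioned under Troubleshooting below is served from the Sophia resource server (the exact path is an assumption; adjust it if your deployment differs):
import requests
from inference_auth_token import get_access_token

# Get your access token
access_token = get_access_token()

# Assumed status route: the /jobs endpoint referenced in Troubleshooting
url = "https://inference-api.alcf.anl.gov/resource_server/sophia/jobs"

response = requests.get(url, headers={"Authorization": f"Bearer {access_token}"})
print(response.json())  # shows which models are currently live or queued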
Chat Completions¶
This endpoint is used for conversational AI.
cURL:
#!/bin/bash
access_token=$(python inference_auth_token.py get_access_token)
curl -X POST "https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1/chat/completions" \
-H "Authorization: Bearer ${access_token}" \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"temperature": 0.2,
"max_tokens": 150,
"messages":[{"role": "user", "content": "What are the symptoms of diabetes?"}]
}'
Python:
from openai import OpenAI
from inference_auth_token import get_access_token
access_token = get_access_token()
client = OpenAI(
api_key=access_token,
base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)
response = client.chat.completions.create(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
messages=[{"role": "user", "content": "What are the symptoms of diabetes?"}]
)
print(response.choices[0].message.content)
Vision Language Models¶
Use this endpoint to analyze images with text prompts.
from openai import OpenAI
import base64
from inference_auth_token import get_access_token
access_token = get_access_token()
client = OpenAI(
api_key=access_token,
base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
image_path = "scientific_diagram.png" # Replace with your image
base64_image = encode_image(image_path)
response = client.chat.completions.create(
model="Qwen/Qwen2-VL-72B-Instruct",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Describe the key components in this scientific diagram"},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
]
}
],
max_tokens=300
)
print(response.choices[0].message.content)
Embeddings¶
This endpoint generates vector embeddings from text, currently supported by the infinity framework.
from openai import OpenAI
from inference_auth_token import get_access_token
access_token = get_access_token()
client = OpenAI(
api_key=access_token,
base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)
response = client.embeddings.create(
model="mistralai/Mistral-7B-Instruct-v0.3-embed",
input="The food was delicious and the waiter...",
encoding_format="float"
)
print(response.data[0].embedding)
For more examples, please see the inference-endpoints GitHub repository.
Available Models¶
Models are marked with the following capabilities (shown in parentheses after each model name):
- B - Batch Enabled
- T - Tool Calling Enabled (a tool-calling sketch appears after the model lists below)
- R - Reasoning Enabled
Chat Language Models¶
Qwen Family
- Qwen/Qwen2.5-14B-Instruct (B, T)
- Qwen/Qwen2.5-7B-Instruct (B, T)
- Qwen/QwQ-32B (B, R, T)
- Qwen/Qwen3-235B-A22B (R, T)
- Qwen/Qwen3-32B (B, R)
Meta Llama Family
- meta-llama/Meta-Llama-3-70B-Instruct (B)
- meta-llama/Meta-Llama-3-8B-Instruct (B)
- meta-llama/Meta-Llama-3.1-70B-Instruct (B, T)
- meta-llama/Meta-Llama-3.1-8B-Instruct (B, T)
- meta-llama/Meta-Llama-3.1-405B-Instruct (B, T)
- meta-llama/Llama-3.3-70B-Instruct (B, T)
- meta-llama/Llama-4-Scout-17B-16E-Instruct (B, T)
- meta-llama/Llama-4-Maverick-17B-128E-Instruct (T)
Mistral Family
- mistralai/Mistral-7B-Instruct-v0.3 (B)
- mistralai/Mistral-Large-Instruct-2407 (B)
- mistralai/Mixtral-8x22B-Instruct-v0.1 (B)
Nvidia Nemotron Family
- mgoin/Nemotron-4-340B-Instruct-hf
Aurora GPT Family
- argonne-private/AuroraGPT-IT-v4-0125 (B)
- argonne-private/AuroraGPT-Tulu3-SFT-0125 (B)
- argonne-private/AuroraGPT-DPO-UFB-0225 (B)
- argonne-private/AuroraGPT-7B-OI (B)
Allenai Family
- allenai/Llama-3.1-Tulu-3-405B
OpenAI Family
- openai/gpt-oss-20b (B, R)
- openai/gpt-oss-120b (B, R)
Google Family
- google/gemma-3-27b-it (B, T)
Vision Language Models¶
Qwen Family
- Qwen/Qwen2-VL-72B-Instruct (B)
Meta Llama Family
- meta-llama/Llama-3.2-90B-Vision-Instruct
Embedding Models¶
Mistral Family
- mistralai/Mistral-7B-Instruct-v0.3-embed (B)
- Salesforce/SFR-Embedding-Mistral (B)
Nvidia Family
- nvidia/NV-Embed-v2
Want to add a model?
To request a new model, please contact ALCF Support.
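Models marked with T in the lists above accept the standard OpenAI tools interface. The following is a minimal, illustrative sketch only; the get_current_weather tool definition is hypothetical and not part of the service, and actual tool-call behavior depends on the model.
from openai import OpenAI
from inference_auth_token import get_access_token

access_token = get_access_token()

client = OpenAI(
    api_key=access_token,
    base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)

# Hypothetical tool definition, for illustration only
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
        }
    }
}]

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # a T-marked model
    messages=[{"role": "user", "content": "What is the weather in Chicago?"}],
    tools=tools
)

# The model may answer with a tool call instead of plain text
print(response.choices[0].message.tool_calls or response.choices[0].message.content)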
Batch Processing¶
For large-scale inference, the batch processing service allows you to submit a file with up to 150,000 requests.
Batch Processing Requirements
- You must have an active ALCF allocation.
- Input files and output folders must be located within the `/eagle/argonne_tpc` project space or a world-readable directory.
- Each line in the input file must be a complete JSON request object (JSON Lines format); see the sketch below this list.
- Only models marked with B support batch processing.
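As a rough illustration of the JSON Lines requirement, the sketch below writes a small input file where each line is a chat-completions-style request body. The exact request schema is an assumption here; see the inference-endpoints GitHub repository for authoritative input-file examples.
import json

# Hypothetical example requests; the per-line schema is an assumption
requests_to_run = [
    {"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "messages": [{"role": "user", "content": "Summarize the theory of relativity."}]},
    {"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "messages": [{"role": "user", "content": "Explain photosynthesis in one paragraph."}]},
]

# Write one complete JSON object per line (JSON Lines format)
with open("input.jsonl", "w") as f:
    for req in requests_to_run:
        f.write(json.dumps(req) + "\n")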
Batch API Endpoints¶
Create Batch¶
Create Batch Request
cURL:
#!/bin/bash
# Get your access token
access_token=$(python inference_auth_token.py get_access_token)
# Define the base URL
base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1/batches"
# Submit batch request
curl -X POST "$base_url" \
-H "Authorization: Bearer ${access_token}" \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"input_file": "/eagle/argonne_tpc/path/to/your/input.jsonl"
}'
# Submit batch request with custom output folder
curl -X POST "$base_url" \
-H "Authorization: Bearer ${access_token}" \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"input_file": "/eagle/argonne_tpc/path/to/your/input.jsonl",
"output_folder_path": "/eagle/argonne_tpc/path/to/your/output/folder/"
}'
Python:
import requests
import json
from inference_auth_token import get_access_token
# Get your access token
access_token = get_access_token()
# Define headers and URL
headers = {
'Authorization': f'Bearer {access_token}',
'Content-Type': 'application/json'
}
url = "https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1/batches"
# Submit batch request
data = {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"input_file": "/eagle/argonne_tpc/path/to/your/input.jsonl",
"output_folder_path": "/eagle/argonne_tpc/path/to/your/output/folder/"
}
response = requests.post(url, headers=headers, json=data)
print(response.json())
Retrieve Batch¶
Retrieve Batch Metrics
import requests
from inference_auth_token import get_access_token
# Get your access token
access_token = get_access_token()
# Define headers and URL
headers = {
'Authorization': f'Bearer {access_token}'
}
batch_id = "your-batch-id"
url = f"https://inference-api.alcf.anl.gov/resource_server/v1/batches/{batch_id}/result"
# Get batch results
response = requests.get(url, headers=headers)
print(response.json())
Sample Output:
{
"results_file": "/eagle/argonne_tpc/path/to/your/output/folder/<input-file-name>_<model>_<batch-id>/<input-file-name>_<timestamp>.results.jsonl",
"progress_file": "/eagle/argonne_tpc/path/to/your/output/folder/<input-file-name>_<model>_<batch-id>/<input-file-name>_<timestamp>.progress.json",
"metrics": {
"response_time": 27837.440138816833,
"throughput_tokens_per_second": 3899.833442250346,
"total_tokens": 108561380,
"num_responses": 99985,
"lines_processed": 100000
}
}
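Once a batch completes, the results_file reported above is itself in JSON Lines format. A minimal sketch for loading it follows; the path below is illustrative, and the structure of each result object is not assumed here.
import json

# Illustrative path; use the "results_file" value returned by the retrieve call above
results_path = "/eagle/argonne_tpc/path/to/your/output/folder/results.jsonl"

# Read one JSON object per line
with open(results_path) as f:
    results = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(results)} responses")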
List Batch¶
List All Batches
cURL:
#!/bin/bash
# Get your access token
access_token=$(python inference_auth_token.py get_access_token)
# List all batches
curl -X GET "https://inference-api.alcf.anl.gov/resource_server/v1/batches" \
-H "Authorization: Bearer ${access_token}"
# Optionally filter by status (pending, running, completed, or failed)
curl -X GET "https://inference-api.alcf.anl.gov/resource_server/v1/batches?status=completed" \
-H "Authorization: Bearer ${access_token}"
Python:
import requests
from inference_auth_token import get_access_token
# Get your access token
access_token = get_access_token()
# Define headers and URL
headers = {
'Authorization': f'Bearer {access_token}'
}
url = "https://inference-api.alcf.anl.gov/resource_server/v1/batches"
# List all batches
response = requests.get(url, headers=headers)
print(response.json())
# Optionally filter by status (pending, running, completed, or failed)
params = {'status': 'completed'}
response = requests.get(url, headers=headers, params=params)
print(response.json())
Sample Output:
[
{
"batch_id": "f8fa8efd-1111-476d-a0a0-111111111111",
"cluster": "sophia",
"created_at": "2025-02-20 18:39:58.049584+00:00",
"framework": "vllm",
"input_file": "/eagle/argonne_tpc/path/to/your/output/folder/chunk_a.jsonl",
"status": "pending"
},
{
"batch_id": "4b8a31b8-2222-479f-8c8c-222222222222",
"cluster": "sophia",
"created_at": "2025-02-20 18:40:30.882414+00:00",
"framework": "vllm",
"input_file": "/eagle/argonne_tpc/path/to/your/output/folder/chunk_b.jsonl",
"status": "pending"
}
]
Batch Status¶
Get Batch Status
import requests
from inference_auth_token import get_access_token
# Get your access token
access_token = get_access_token()
# Define headers and URL
headers = {
'Authorization': f'Bearer {access_token}'
}
batch_id = "your-batch-id"
url = f"https://inference-api.alcf.anl.gov/resource_server/v1/batches/{batch_id}"
# Get batch status
response = requests.get(url, headers=headers)
print(response.json())
Batch Status Codes:
- pending: The request was submitted, but the job has not started yet.
- running: The job is currently running on a compute node.
- failed: An error occurred; the error message will be displayed when querying the result.
- completed: The batch finished successfully and the results file is ready to retrieve.
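Building on the status call above, a simple polling loop might look like the sketch below. It assumes the status endpoint returns a JSON object with a status field, as in the List Batch sample output; the sleep interval is arbitrary.
import time
import requests
from inference_auth_token import get_access_token

access_token = get_access_token()
headers = {'Authorization': f'Bearer {access_token}'}

batch_id = "your-batch-id"
url = f"https://inference-api.alcf.anl.gov/resource_server/v1/batches/{batch_id}"

# Poll until the batch leaves the pending/running states
while True:
    status = requests.get(url, headers=headers).json().get("status")
    print(f"Batch status: {status}")
    if status in ("completed", "failed"):
        break
    time.sleep(60)  # arbitrary interval between checks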
Cancel Batch¶
Cancel Submitted Batch
The inference team is currently developing a mechanism for users to cancel submitted batches. In the meantime, please contact us with your `batch_id` if you have a batch to cancel.
System Details¶
Available Clusters¶
| Cluster | Status | Endpoint | Notes |
|---|---|---|---|
| Sophia | Active | https://inference-api.alcf.anl.gov/resource_server/sophia | 8 nodes reserved for inference |
| SambaNova SN40L (Metis) | Coming Soon | https://inference-api.alcf.anl.gov/resource_server/metis | |
| Cerebras CS-3 Inference cluster | Coming Soon | | |
| GH200 Nvidia Cluster | Coming Soon | | |
Endpoints and Frameworks¶
- vLLM: https://inference-api.alcf.anl.gov/resource_server/sophia/vllm
- Infinity: https://inference-api.alcf.anl.gov/resource_server/sophia/infinity
The primary API endpoints follow the OpenAI standard:
- /v1/chat/completions
- /v1/completions
- /v1/embeddings
- /v1/batches
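In other words, a full request URL is the cluster endpoint, followed by the framework, followed by one of the OpenAI-style routes; a trivial sketch of that composition:
# Compose a full request URL from the pieces listed above
cluster = "https://inference-api.alcf.anl.gov/resource_server/sophia"
framework = "vllm"          # or "infinity"
route = "/v1/chat/completions"

url = f"{cluster}/{framework}{route}"
print(url)  # https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1/chat/completions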
Performance and Wait Times¶
- Cold Starts: The first query to an inactive model may take 10-15 minutes to load.
- Queueing: During high demand, your request may be queued until resources are available.
- Payload Limits: Payloads are limited to 10MB. Use batch mode for larger inputs.
On Sophia, from the 8 nodes reserved for inference, 5 nodes are dedicated to serving popular models "hot" for immediate access. The remaining 3 nodes rotate through other models based on user requests. These dynamically loaded models will remain active for up to 24 hours.
Important Notes¶
- The default response format for the API is `text/plain`.
- The Globus backend does not support streaming. Please ensure `stream: False` is set when integrating with RAG applications (see the sketch below).
- If you’re interested in extended model runtimes, reservations, or private model deployments, please contact ALCF Support.
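For example, with the Python client the flag can be passed explicitly. This is a minimal sketch reusing the client setup from the examples above:
from openai import OpenAI
from inference_auth_token import get_access_token

client = OpenAI(
    api_key=get_access_token(),
    base_url="https://inference-api.alcf.anl.gov/resource_server/sophia/vllm/v1"
)

# Streaming is not supported through the Globus backend, so keep it disabled
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Summarize the key points of this document."}],
    stream=False
)
print(response.choices[0].message.content)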
Troubleshooting¶
- Connection Timeout: The model you are requesting may be queued because the cluster has too many pending jobs. You can check model status by querying the `/jobs` endpoint; see Querying Endpoint Status for an example.
- Permission Denied: Your token may have expired. Log out from Globus at app.globus.org/logout and re-authenticate using the `--force` flag.
- Batch Permission Error: Ensure your input/output paths are in a readable location such as `/eagle/argonne_tpc`.
Contact Us¶
For questions or support, please contact ALCF Support.