# LocalLLMClient
A Swift package to interact with local Large Language Models (LLMs) on Apple platforms.
## Features

- Support for GGUF / MLX models and Apple's FoundationModels framework
- Support for iOS, macOS, and Linux
- Streaming API
- Multimodal (experimental)
- Tool calling (experimental)
## Installation

Add the following dependency to your `Package.swift`:

```swift
dependencies: [
    .package(url: "https://github.com/tattn/LocalLLMClient.git", branch: "main")
]
```
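To use a backend, also add its library product to your target's dependencies. A minimal sketch, assuming a target named `MyApp` and that each module imported in the examples below is exposed as a product of the same name:

```swift
.target(
    name: "MyApp", // hypothetical target name
    dependencies: [
        // Add only the backends you need:
        .product(name: "LocalLLMClientLlama", package: "LocalLLMClient"),
        .product(name: "LocalLLMClientMLX", package: "LocalLLMClient")
    ]
)
```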
## Usage

The API documentation is available [here](https://tattn.github.io/LocalLLMClient/documentation/).
### Quick Start
```swift
import LocalLLMClient
import LocalLLMClientLlama
let session = LLMSession(model: .llama(
    id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
    model: "gemma-3-4B-it-QAT-Q4_0.gguf"
))

print(try await session.respond(to: "Tell me a joke."))

for try await text in session.streamResponse(to: "Write a story about cats.") {
    print(text, terminator: "")
}
```
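All of these calls are `async` and can throw (for example, if the model download fails), so in app code you'll usually run them in a task with error handling. A minimal sketch; the error handling here is illustrative, not part of the library:

```swift
Task {
    do {
        let answer = try await session.respond(to: "Tell me a joke.")
        print(answer)
    } catch {
        // Handle download or generation failures
        print("LLM error: \(error)")
    }
}
```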
### Using with Each Backend
<details open>
<summary>Using llama.cpp</summary>

```swift
import LocalLLMClient
import LocalLLMClientLlama

// Create a model
let model = LLMSession.DownloadModel.llama(
    id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
    model: "gemma-3-4B-it-QAT-Q4_0.gguf",
    parameter: .init(
        temperature: 0.7, // Randomness (0.0 to 1.0)
        topK: 40,         // Top-K sampling
        topP: 0.9,        // Top-P (nucleus) sampling
        options: .init(responseFormat: .json) // Response format
    )
)

// You can track download progress
try await model.downloadModel { progress in
    print("Download progress: \(progress)")
}

// Create a session with the downloaded model
let session = LLMSession(model: model)

// Generate a response with a specific prompt
let response = try await session.respond(to: """
Create the beginning of a synopsis for an epic story with a cat as the main character.
Format it in JSON, as shown below.
{
    "title": "<title>",
    "content": "<content>"
}
""")
print(response)

// You can also add system messages before asking questions
session.messages = [.system("You are a helpful assistant.")]
```
</details>
<details>
<summary>Using Apple MLX</summary>

```swift
import LocalLLMClient
import LocalLLMClientMLX

// Create a model
let model = LLMSession.DownloadModel.mlx(
    id: "mlx-community/Qwen3-1.7B-4bit",
    parameter: .init(
        temperature: 0.7, // Randomness (0.0 to 1.0)
        topP: 0.9         // Top-P (nucleus) sampling
    )
)

// You can track download progress
try await model.downloadModel { progress in
    print("Download progress: \(progress)")
}

// Create a session with the downloaded model
let session = LLMSession(model: model)

// Generate text with system and user messages
session.messages = [.system("You are a helpful assistant.")]
let response = try await session.respond(to: "Tell me a story about a cat.")
print(response)
```
</details>
<details>
<summary>Using Apple FoundationModels</summary>

```swift
import LocalLLMClient
import LocalLLMClientFoundationModels

// Available on iOS 26.0+ / macOS 26.0+ and requires Apple Intelligence
let session = LLMSession(model: .foundationModels(
    // Use the system's default model
    model: .default,
    // Configure generation options
    parameter: .init(
        temperature: 0.7
    )
))

// Generate a response with a specific prompt
let response = try await session.respond(to: "Tell me a short story about a clever fox.")
print(response)
```
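
Because this backend requires iOS 26.0+ / macOS 26.0+, you may want to gate it behind a standard Swift availability check and fall back to another backend on older systems. A sketch under that assumption; the fallback model is just the one from the Quick Start:

```swift
if #available(iOS 26.0, macOS 26.0, *) {
    // Apple Intelligence must also be enabled on the device
    let session = LLMSession(model: .foundationModels(
        model: .default,
        parameter: .init(temperature: 0.7)
    ))
    print(try await session.respond(to: "Hello!"))
} else {
    // Fall back to the llama.cpp backend
    let session = LLMSession(model: .llama(
        id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
        model: "gemma-3-4B-it-QAT-Q4_0.gguf"
    ))
    print(try await session.respond(to: "Hello!"))
}
```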
</details>
### Tool Calling
LocalLLMClient supports tool calling for integration with external systems.
> [!IMPORTANT]
> Tool calling is only available with models that support this feature. Each backend has different model compatibility.
>
> Make sure your chosen model explicitly supports tool calling before using this feature.
<details open>
<summary>Using tool calling</summary>

```swift
import LocalLLMClient
import LocalLLMClientLlama

@Tool("get_weather")
struct GetWeatherTool {
    let description = "Get the current weather in a given location"

    @ToolArguments
    struct Arguments {
        @ToolArgument("The city and state, e.g. San Francisco, CA")
        var location: String
        @ToolArgument("Temperature unit")
        var unit: Unit?

        @ToolArgumentEnum
        enum Unit: String {
            case celsius
            case fahrenheit
        }
    }

    func call(arguments: Arguments) async throws -> ToolOutput {
        // In a real implementation, this would call a weather API
        let temp = arguments.unit == .celsius ? "22°C" : "72°F"
        return ToolOutput([
            "location": arguments.location,
            "temperature": temp,
            "condition": "sunny"
        ])
    }
}

// Create the tool
let weatherTool = GetWeatherTool()

// Create a session with a model that supports tool calling and register tools
let session = LLMSession(
    model: .llama(
        id: "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
    ),
    tools: [weatherTool]
)

// Ask a question that requires tool use
let response = try await session.respond(to: "What's the weather like in Tokyo?")
print(response)
// The model will automatically call the weather tool and include the result in its response
```
</details>
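Since `tools` takes an array, multiple tools can be registered on the same session. A sketch assuming a second, hypothetical `GetTimeTool` defined the same way as `GetWeatherTool` above:

```swift
let session = LLMSession(
    model: .llama(
        id: "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
    ),
    tools: [GetWeatherTool(), GetTimeTool()] // GetTimeTool is hypothetical
)
```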
### Multimodal for Image Processing
LocalLLMClient also supports multimodal models for processing images.
<details open>
<summary>Using with llama.cpp</summary>

```swift
import LocalLLMClient
import LocalLLMClientLlama

// Create a session with a multimodal model
let session = LLMSession(model: .llama(
    id: "ggml-org/gemma-3-4b-it-GGUF",
    model: "gemma-3-4b-it-Q8_0.gguf",
    mmproj: "mmproj-model-f16.gguf"
))

// Ask a question about an image
let response = try await session.respond(
    to: "What's in this image?",
    attachments: [.image(.init(resource: .yourImage))]
)
print(response)

// You can also stream the response
for try await text in session.streamResponse(
    to: "Describe this image in detail",
    attachments: [.image(.init(resource: .yourImage))]
) {
    print(text, terminator: "")
}
```
</details>
<details>
<summary>Using with Apple MLX</summary>

```swift
import LocalLLMClient
import LocalLLMClientMLX

// Create a session with a multimodal model
let session = LLMSession(model: .mlx(
    id: "mlx-community/Qwen2.5-VL-3B-Instruct-abliterated-4bit"
))

// Ask a question about an image
let response = try await session.respond(
    to: "What's in this image?",
    attachments: [.image(.init(resource: .yourImage))]
)
print(response)
```
</details>
<details>
<summary><h3>Advanced Usage: Low-Level API</h3></summary>
For more advanced control over model loading and inference, you can use the `LocalLLMClient` APIs directly.
<details>
<summary>Using with llama.cpp</summary>

```swift
import LocalLLMClient
import LocalLLMClientLlama
import LocalLLMClientUtility

// Download model from Hugging Face (Gemma 3)
let ggufName = "gemma-3-4B-it-QAT-Q4_0.gguf"
let downloader = FileDownloader(source: .huggingFace(
    id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
    globs: [ggufName]
))
try await downloader.download { print("Progress: \($0)") }

// Initialize a client with the downloaded model
let modelURL = downloader.destination.appending(component: ggufName)
let client = try await LocalLLMClient.llama(url: modelURL, parameter: .init(
    context: 4096,    // Context size
    temperature: 0.7, // Randomness (0.0 to 1.0)
    topK: 40,         // Top-K sampling
    topP: 0.9,        // Top-P (nucleus) sampling
    options: .init(responseFormat: .json) // Response format
))

let prompt = """
Create the beginning of a synopsis for an epic story with a cat as the main character.
Format it in JSON, as shown below.
{
    "title": "<title>",
    "content": "<content>"
}
"""

// Generate text
let input = LLMInput.chat([
    .system("You are a helpful assistant."),
    .user(prompt)
])

for try await text in try await client.textStream(from: input) {
    print(text, terminator: "")
}
```
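
If you don't need token-by-token streaming, the same client can return the whole response at once, as the multimodal examples below do:

```swift
// One-shot generation without streaming
let text = try await client.generateText(from: input)
print(text)
```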
</details>
<details>
<summary>Using with Apple MLX</summary>

```swift
import LocalLLMClient
import LocalLLMClientMLX
import LocalLLMClientUtility

// Download model from Hugging Face
let downloader = FileDownloader(
    source: .huggingFace(id: "mlx-community/Qwen3-1.7B-4bit", globs: .mlx)
)
try await downloader.download { print("Progress: \($0)") }

// Initialize a client with the downloaded model
let client = try await LocalLLMClient.mlx(url: downloader.destination, parameter: .init(
    temperature: 0.7, // Randomness (0.0 to 1.0)
    topP: 0.9         // Top-P (nucleus) sampling
))

// Generate text
let input = LLMInput.chat([
    .system("You are a helpful assistant."),
    .user("Tell me a story about a cat.")
])

for try await text in try await client.textStream(from: input) {
    print(text, terminator: "")
}
```
</details>
<details>
<summary>Using with Apple FoundationModels</summary>

```swift
import LocalLLMClient
import LocalLLMClientFoundationModels

// Available on iOS 26.0+ / macOS 26.0+ and requires Apple Intelligence
let client = try await LocalLLMClient.foundationModels(
    // Use the system's default model
    model: .default,
    // Configure generation options
    parameter: .init(
        temperature: 0.7
    )
)

// Generate text
let input = LLMInput.chat([
    .system("You are a helpful assistant."),
    .user("Tell me a short story about a clever fox.")
])

for try await text in try await client.textStream(from: input) {
    print(text, terminator: "")
}
```
</details>
<details>
<summary>Advanced Multimodal with llama.cpp</summary>

```swift
import LocalLLMClient
import LocalLLMClientLlama
import LocalLLMClientUtility

// Download model from Hugging Face (Gemma 3)
let model = "gemma-3-4b-it-Q8_0.gguf"
let mmproj = "mmproj-model-f16.gguf"
let downloader = FileDownloader(
    source: .huggingFace(id: "ggml-org/gemma-3-4b-it-GGUF", globs: [model, mmproj])
)
try await downloader.download { print("Download: \($0)") }

// Initialize a client with the downloaded model and its multimodal projector
let client = try await LocalLLMClient.llama(
    url: downloader.destination.appending(component: model),
    mmprojURL: downloader.destination.appending(component: mmproj)
)

let input = LLMInput.chat([
    .user("What's in this image?", attachments: [.image(.init(resource: .yourImage))])
])

// Generate text without streaming
print(try await client.generateText(from: input))
```
</details>
<details>
<summary>Advanced Multimodal with Apple MLX</summary>

```swift
import LocalLLMClient
import LocalLLMClientMLX
import LocalLLMClientUtility

// Download model from Hugging Face (Qwen2.5 VL)
let downloader = FileDownloader(source: .huggingFace(
    id: "mlx-community/Qwen2.5-VL-3B-Instruct-abliterated-4bit",
    globs: .mlx
))
try await downloader.download { print("Progress: \($0)") }

let client = try await LocalLLMClient.mlx(url: downloader.destination)

let input = LLMInput.chat([
    .user("What's in this image?", attachments: [.image(.init(resource: .yourImage))])
])

// Generate text without streaming
print(try await client.generateText(from: input))
```
</details>
</details>
### CLI Tool
You can use LocalLLMClient directly from the terminal with the bundled command-line tool:
```bash
# Run using llama.cpp
swift run LocalLLMCLI --model /path/to/your/model.gguf "Your prompt here"
# Run using MLX
./scripts/run_mlx.sh --model https://huggingface.co/mlx-community/Qwen3-1.7B-4bit "Your prompt here"
```

## Tested Models

- LLaMA 3
- Gemma 3 / 2
- Qwen 3 / 2
- Phi 4
See also the lists of models compatible with the llama.cpp backend and with the MLX backend.
If you have a model that works, please open an issue or PR to add it to the list.
## Requirements

- iOS 17.0+ / macOS 14.0+
- Xcode 16.0+
## Acknowledgements

This package uses llama.cpp, Apple's MLX, and Apple's Foundation Models framework for model inference.