# shimmy/Cargo.toml at main · gustavembende-cell/shimmy · GitHub

# Crate metadata. Key order follows the Rust style guide: name, version,
# the remaining keys alphabetically, and description last.
[package]
name = "shimmy"
version = "1.3.3"
authors = ["Michael A. Kuykendall <michaelallenkuykendall@gmail.com>"]
categories = ["command-line-utilities", "web-programming::http-server"]
edition = "2021"
# Paths left out of the packaged crate: internal docs, test models, build
# output, dotfiles, and helper scripts.
exclude = [
    "docs-internal/*",
    "test-models/*",
    "target/*",
    ".*",
    "*.sh",
    "*.ps1",
    "*.py",
]
homepage = "https://github.com/Michael-A-Kuykendall/shimmy"
keywords = ["llm", "local-ai", "inference", "server", "api"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/Michael-A-Kuykendall/shimmy"
description = "Lightweight 5MB Ollama alternative with native SafeTensors support. No Python dependencies, 2x faster loading."

[features]
# Enabled when no feature flags are given.
# macOS ARM64 i8mm issues are fixed via the forked llama-cpp-2.
default = ["huggingface", "llama"]
# Engine backends.
# huggingface: Python integration, pulls in no additional Rust deps.
huggingface = []
# llama: enables the optional llama-cpp-2 dependency.
llama = ["dep:llama-cpp-2"]
# Convenience feature sets.
# coverage: minimal deps for faster coverage-testing builds.
coverage = ["huggingface"]
# fast: quick compilation, no C++ deps.
fast = ["huggingface"]
# full: everything, including the C++ deps.
full = ["huggingface", "llama"]

# Runtime dependencies, sorted alphabetically by crate name.
[dependencies]
anyhow = "1"
async-trait = "0.1"
axum = { version = "0.7", features = ["http1", "json", "ws"] }
bytes = "1"
chrono = { version = "0.4", features = ["serde"] }
clap = { version = "4", features = ["derive"] }
dirs = "5.0"
futures-util = "0.3"
lazy_static = "1.5"
# llama.cpp bindings, only compiled when the `llama` feature is enabled.
# Uses the forked version with the macOS ARM64 i8mm fix.
llama-cpp-2 = { version = "0.1.118", default-features = false, optional = true }
memmap2 = "0.9"
minijinja = { version = "2", features = ["loader"] }
parking_lot = "0.12"
rand = "0.8"
# Default features are off; TLS comes from rustls via the "rustls-tls" feature.
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
safetensors = "0.4"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# NOTE(review): sys-info and sysinfo are two distinct crates; confirm both are still needed.
sys-info = "0.9"
sysinfo = "0.30"
tempfile = "3"
thiserror = "1"
tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal", "process", "fs"] }
tokio-stream = "0.1"
tracing = "0.1"
tracing-subscriber = { version = "0.3.20", features = ["env-filter"] }
uuid = { version = "1", features = ["v4", "serde"] }

# Replace the crates.io llama-cpp-2 with a fork that carries the macOS ARM64
# i8mm compatibility fix. Cargo only honors [patch] from the workspace root
# manifest, so this override does not propagate to crates that depend on
# shimmy.
# NOTE(review): the override tracks a branch, not a fixed rev; without a
# lockfile the resolved commit follows the branch head.
[patch.crates-io.llama-cpp-2]
git = "https://github.com/Michael-A-Kuykendall/llama-cpp-rs.git"
branch = "fix-macos-arm64-i8mm"
package = "llama-cpp-2"

# Dependencies needed only by tests, examples, and benchmarks.
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
tokio-tungstenite = "0.20"
# rand and tempfile (used for randomized test scenarios and temporary test
# directories) are intentionally not repeated here: both are declared in
# [dependencies], and Cargo makes normal dependencies available to tests,
# examples, and benchmarks as well.

# Release builds favor binary size over compile time.
[profile.release]
# "z" optimizes for size and additionally turns off loop vectorization.
opt-level = "z"
# A single codegen unit gives the optimizer the whole crate at once, at the
# cost of parallel code generation.
codegen-units = 1
# "fat" is the same setting as `lto = true`: LTO across the whole dependency graph.
lto = "fat"

# Development profile: basic optimizations (opt-level 1) with full debug
# info kept for the workspace crates.
[profile.dev]
debug = true
opt-level = 1

# Dev-profile override for every dependency (the "*" package spec): build
# them at opt-level 2 and skip generating their debug info.
[profile.dev.package."*"]
debug = false
opt-level = 2

# Benchmark targets. `harness = false` turns off the built-in libtest
# harness, so each bench file must supply its own `main` (presumably through
# criterion from [dev-dependencies] - confirm against the bench sources).
[[bench]]
name = "generation_performance"
harness = false

[[bench]]
name = "model_loading"
harness = false