forked from lightonai/next-plaid
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.cuda.yml
More file actions
121 lines (120 loc) · 5.04 KB
/
Copy pathdocker-compose.cuda.yml
File metadata and controls
121 lines (120 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# =============================================================================
# Next-Plaid API Docker Compose - CUDA
# =============================================================================
# Standalone configuration with GPU/CUDA support.
# Usage: docker compose -f docker-compose.cuda.yml up -d
#
# Vector Database Storage:
# Indices are persisted at ${NEXT_PLAID_DATA:-~/.local/share/next-plaid}
# Each index is stored as a subdirectory: <data-dir>/<index-name>/
# On container restart, existing indices are automatically loaded.
#
# Model Cache:
# Downloaded HuggingFace models are cached at ${NEXT_PLAID_MODELS:-~/.cache/huggingface/next-plaid}
# Models are only downloaded once and reused on subsequent container starts.
#
# Model Configuration (via command arguments):
# --model <id> HuggingFace model ID or local path
# --cuda Use CUDA (required for GPU)
# --batch-size <N> Batch size per session
# --query-length <N> Max query length in tokens (default: 48)
# --document-length <N> Max document length in tokens (default: 300)
#
# Rate Limiting & Concurrency (via environment variables):
# RATE_LIMIT_ENABLED Enable rate limiting (default: false)
# RATE_LIMIT_PER_SECOND Max requests per second (default: 100, when enabled)
# RATE_LIMIT_BURST_SIZE Burst size for rate limiting (default: 200, when enabled)
# CONCURRENCY_LIMIT Max concurrent in-flight requests (default: 200)
# MAX_QUEUED_TASKS_PER_INDEX Max queued updates/deletes per index (default: 20)
# MAX_BATCH_DOCUMENTS Max documents to batch before processing (default: 500)
# BATCH_CHANNEL_SIZE Buffer size for document batch queue (default: 200)
# MAX_BATCH_TEXTS Max texts to batch for encoding (default: 64)
# ENCODE_BATCH_CHANNEL_SIZE Buffer size for encode batch queue (default: 512)
# MODEL_POOL_SIZE Number of model workers for concurrent encoding (default: 1)
#
# CUDA Defaults (optimized for GPU, high throughput):
# --model lightonai/GTE-ModernColBERT-v1 --cuda --batch-size 128
# (no --int8: GPU is fast enough with FP32, no --parallel: GPU handles parallelism)
#
# Examples:
# # Default CUDA configuration
# docker compose -f docker-compose.cuda.yml up -d
#
# # Custom model (override command in docker-compose.override.yml)
# # Or run directly:
# docker run -p 8080:8080 --gpus all -v ~/.local/share/next-plaid:/data/indices \
# -v ~/.cache/huggingface/next-plaid:/models \
# next-plaid-api:cuda --model my-org/my-model --cuda --batch-size 128
#
# To customize storage locations, create a .env file with:
# NEXT_PLAID_DATA=/path/to/indices
# NEXT_PLAID_MODELS=/path/to/models
# =============================================================================
services:
  # Single service: the Next-Plaid API, built from the `runtime-cuda` stage of
  # next-plaid-api/Dockerfile and published on host port 8080.
  next-plaid-api:
    build:
      context: .
      dockerfile: next-plaid-api/Dockerfile
      target: runtime-cuda
    ports:
      - "8080:8080"
    volumes:
      # Persistent vector database storage
      # Default: ~/.local/share/next-plaid (XDG standard for user data)
      # Override with NEXT_PLAID_DATA environment variable
      - "${NEXT_PLAID_DATA:-~/.local/share/next-plaid}:/data/indices"
      # Persistent model cache (auto-downloaded from HuggingFace)
      # Default: ~/.cache/huggingface/next-plaid
      # (a next-plaid subdirectory under ~/.cache/huggingface)
      # Override with NEXT_PLAID_MODELS environment variable
      - "${NEXT_PLAID_MODELS:-~/.cache/huggingface/next-plaid}:/models"
    environment:
      - RUST_LOG=info
      # NOTE(review): this exposes all GPUs via the environment, while the
      # device reservation under `deploy` requests `count: 1` - confirm which
      # of the two is intended on multi-GPU hosts.
      - NVIDIA_VISIBLE_DEVICES=all
      # Rate limiting (disabled by default, uncomment to enable)
      # - RATE_LIMIT_ENABLED=true
      # - RATE_LIMIT_PER_SECOND=${RATE_LIMIT_PER_SECOND:-100}
      # - RATE_LIMIT_BURST_SIZE=${RATE_LIMIT_BURST_SIZE:-200}
      - CONCURRENCY_LIMIT=${CONCURRENCY_LIMIT:-200}
      # Document processing configuration
      - MAX_QUEUED_TASKS_PER_INDEX=${MAX_QUEUED_TASKS_PER_INDEX:-20}
      - MAX_BATCH_DOCUMENTS=${MAX_BATCH_DOCUMENTS:-500}
      - BATCH_CHANNEL_SIZE=${BATCH_CHANNEL_SIZE:-200}
      # Encode batching configuration
      - MAX_BATCH_TEXTS=${MAX_BATCH_TEXTS:-64}
      - ENCODE_BATCH_CHANNEL_SIZE=${ENCODE_BATCH_CHANNEL_SIZE:-512}
    # CUDA defaults: FP32 model (GPU is fast), large batches, 1 model pool worker
    # MODEL and MODEL_POOL_SIZE are interpolated by Compose (shell environment
    # or .env file); the remaining arguments are fixed in this file.
    command:
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --index-dir
      - /data/indices
      - --model
      - "${MODEL:-lightonai/GTE-ModernColBERT-v1}"
      - --cuda
      - --batch-size
      - "128"
      - --model-pool-size
      - "${MODEL_POOL_SIZE:-1}"
      - --query-length
      - "48"
      - --document-length
      - "300"
    # Probes the /health endpoint on the container's own port 8080 using curl.
    healthcheck:
      test: ["CMD", "curl", "-f", "--max-time", "5", "http://localhost:8080/health"]
      interval: 15s
      timeout: 5s
      retries: 2
      start_period: 120s  # Longer start period for model download + CUDA initialization
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 16G
        reservations:
          memory: 4G
          # Reserve one NVIDIA GPU for this container.
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]