# Source: tori/scripts/embed.py (Python, 67 lines, 2.0 KiB)
# (file-browser header from the original paste, kept here as a comment
# so the script remains valid Python)
#!/usr/bin/env python3
"""Embedding HTTP server. Loads model once at startup, serves requests on port 8199.
POST /embed {"texts": ["text1", "text2", ...]}
Response: {"embeddings": [[0.1, 0.2, ...], ...]}
GET /health -> 200 OK
"""
import json
import sys
from http.server import HTTPServer, BaseHTTPRequestHandler
from sentence_transformers import SentenceTransformer
# Sentence-transformers model identifier passed straight to SentenceTransformer().
MODEL_NAME = "all-MiniLM-L6-v2"
# TCP port the HTTP server listens on.
PORT = 8199
# Load model once at startup (module import time) so every request reuses
# the same in-memory model instead of paying the load cost per call.
print(f"Loading model {MODEL_NAME}...", flush=True)
model = SentenceTransformer(MODEL_NAME)
print(f"Model loaded, serving on port {PORT}", flush=True)
class EmbedHandler(BaseHTTPRequestHandler):
    """Request handler: POST /embed for batch embeddings, GET for health.

    POST body: {"texts": ["text1", ...]} -> {"embeddings": [[...], ...]}.
    Any GET returns 200 "ok" (used as the /health probe).
    """

    def do_POST(self):
        """Embed a batch of texts and reply with JSON.

        Fix: a malformed JSON body (or a non-object body such as a bare
        list/number) previously raised out of the handler, killing the
        request with no HTTP response; it now returns a 400 with an
        error message.
        """
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)
        try:
            data = json.loads(body)
            texts = data.get("texts", [])
        except (json.JSONDecodeError, AttributeError) as exc:
            self._send_json(400, {"error": f"invalid request body: {exc}"})
            return
        if not texts:
            result = {"embeddings": []}
        else:
            # normalize_embeddings=True yields unit-length vectors, so
            # callers can use a plain dot product as cosine similarity.
            embeddings = model.encode(texts, normalize_embeddings=True)
            result = {"embeddings": embeddings.tolist()}
        self._send_json(200, result)

    def do_GET(self):
        """Health check: respond 200 "ok" to any GET path."""
        payload = b"ok"
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        # Fix: Content-Length was missing on this response.
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def _send_json(self, status, payload):
        """Serialize *payload* as JSON and send it with *status*."""
        resp = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(resp)))
        self.end_headers()
        self.wfile.write(resp)

    def log_message(self, format, *args):
        # Suppress BaseHTTPRequestHandler's default per-request stderr log.
        pass
if __name__ == "__main__":
    import socket

    class DualStackHTTPServer(HTTPServer):
        """HTTPServer bound on an IPv6 socket with IPV6_V6ONLY cleared,
        so one listener accepts both IPv6 and (v4-mapped) IPv4 clients."""
        address_family = socket.AF_INET6

        def server_bind(self):
            # Must clear IPV6_V6ONLY before bind() for dual-stack to apply.
            self.socket.setsockopt(socket.IPPROTO_IPV6, socket.IPV6_V6ONLY, 0)
            super().server_bind()

    server = DualStackHTTPServer(("::", PORT), EmbedHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Fix: the listening socket was previously leaked on shutdown.
        server.server_close()