rerank server.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import CrossEncoder
from typing import List
import torch
import gc
import uvicorn
# ASGI application object that the /rerank route is registered on.
app = FastAPI()

# 1. Model loading: configuration tuned for a 16 GB GPU
#    (about 3.8 GB measured as remaining free).
MODEL_PATH = "/opt/models/bge-reranker-v2-m3"
# max_length=256 is the recommended value: it saves VRAM and speeds up responses.
model = CrossEncoder(MODEL_PATH, device="cuda", max_length=256)
model.model.half()  # FP16 half precision, halves the VRAM used
model.model.eval()  # inference mode
class RerankRequest(BaseModel):
    """Request body accepted by the ``/rerank`` endpoint."""

    # Query that every document is paired with for scoring.
    query: str
    # Candidate documents to score against the query.
    documents: List[str]
def clean_text(text: str) -> str:
    """Return *text* without NUL characters or un-encodable code points.

    Intended to strip binary residue so the frontend's JSON parsing does not
    fail on garbled text. Anything that is not a ``str`` yields ``""``.
    """
    if isinstance(text, str):
        # Round-trip through UTF-8, silently dropping whatever cannot be
        # encoded, then remove the null bytes.
        utf8_bytes = text.encode("utf-8", errors="ignore")
        return utf8_bytes.decode("utf-8").replace("\x00", "")
    return ""
@app.post("/rerank")
async def rerank(req: RerankRequest):
    """Score every document in the request against its query.

    Args:
        req: Request body carrying the query and the candidate documents.

    Returns:
        A dict with the model name under ``"model"`` and, under ``"docs"``,
        one ``{"index", "text", "score"}`` entry per input document, in input
        order. ``"text"`` is the cleaned document text.

    Raises:
        HTTPException: status 507 when CUDA runs out of memory, status 500
            for any other failure while cleaning or scoring.
    """
    # Nothing to score: answer right away without touching the GPU.
    if not req.documents:
        return {"model": "bge-reranker-v2-m3", "docs": []}

    try:
        # 2. Data cleaning: the key step that keeps the frontend from erroring.
        safe_query = clean_text(req.query)
        safe_docs = [clean_text(doc) for doc in req.documents]
        pairs = [[safe_query, doc] for doc in safe_docs]

        # 3. VRAM protection: a fixed, conservative batch size.
        #    With 3.8 GB of VRAM left, 16 is a safe value that balances
        #    performance and stability.
        safe_batch_size = 16

        # 4. Run inference under autocast to go with the FP16 weights.
        with torch.no_grad(), torch.amp.autocast('cuda'):
            scores = model.predict(
                pairs,
                batch_size=safe_batch_size,
                show_progress_bar=False
            )

        # Convert numpy output to a plain list so JSON serialization is correct.
        scores_list = scores.tolist() if hasattr(scores, 'tolist') else [float(s) for s in scores]

        # 5. Build the response (using the "docs" field the frontend expects).
        return {
            "model": "bge-reranker-v2-m3",
            "docs": [
                {
                    "index": i,
                    "text": doc,
                    "score": float(score)
                }
                for i, (doc, score) in enumerate(zip(safe_docs, scores_list))
            ]
        }
    except torch.cuda.OutOfMemoryError as exc:
        raise HTTPException(status_code=507, detail="GPU 顯存不足,請嘗試減少文檔數量或重啟服務") from exc
    except Exception as exc:
        # Catch every other failure and report it to the caller.
        raise HTTPException(status_code=500, detail=f"服務端錯誤: {str(exc)}") from exc
    finally:
        # 6. Forced VRAM reclamation (needed to coexist with Ollama).
        #    This runs on success and on every error path. Collect garbage
        #    first so tensors it frees can be released by empty_cache().
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
if __name__ == "__main__":
    # Listens on port 8000 by default, on every network interface.
    # NOTE(review): the nginx snippet in this file proxies to 127.0.0.1:8000;
    # if the service is only meant to be reached through that proxy, binding
    # to 127.0.0.1 would be enough. Confirm before changing.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )
yxbinghe
/etc/nginx/conf.d/ai.conf
# Forward requests under /api/rerank to the service on 127.0.0.1:8000,
# rewriting the /api/rerank prefix to /rerank.
location /api/rerank {
proxy_pass http://127.0.0.1:8000/rerank;
proxy_http_version 1.1;
# Pass the original Host header and the client address to the upstream.
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
# location /api/rerank/health {
# proxy_pass http://127.0.0.1:8000/health;
# }
# Health check answered by nginx itself, without contacting the upstream.
location /api/rerank/health {
    # Use ASCII double quotes: nginx does not treat typographic quotes as
    # string delimiters, so they would be sent as part of the response body.
    return 200 "ok";
}