Spaces:
Running
Running
fahmiaziz98
committed on
Commit
·
66f2fb1
1
Parent(s):
95cd425
add model splade-large-doc
Browse files- requirements.txt +3 -1
- src/config/models.yaml +3 -3
- vectordb/milvus_client.py +0 -0
- vectordb/utils.py +77 -0
requirements.txt
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
fastapi[standard]==0.116.2
|
| 2 |
uvicorn==0.35.0
|
|
|
|
|
|
|
| 3 |
torch==2.8.0
|
| 4 |
sentence-transformers==5.1.1
|
| 5 |
loguru==0.7.3
|
|
@@ -8,4 +10,4 @@ pydantic-settings==2.11.0
|
|
| 8 |
pyyaml==6.0.3
|
| 9 |
ruff==0.14.3
|
| 10 |
pinecone==7.3.0
|
| 11 |
-
ipykernel==7.1.0
|
|
|
|
| 1 |
fastapi[standard]==0.116.2
|
| 2 |
uvicorn==0.35.0
|
| 3 |
+
numpy==2.0.2
|
| 4 |
+
scipy==1.16.3
|
| 5 |
torch==2.8.0
|
| 6 |
sentence-transformers==5.1.1
|
| 7 |
loguru==0.7.3
|
|
|
|
| 10 |
pyyaml==6.0.3
|
| 11 |
ruff==0.14.3
|
| 12 |
pinecone==7.3.0
|
| 13 |
+
ipykernel==7.1.0
|
src/config/models.yaml
CHANGED
|
@@ -7,8 +7,8 @@ models:
|
|
| 7 |
name: "google/embeddinggemma-300M"
|
| 8 |
type: "embeddings"
|
| 9 |
|
| 10 |
-
splade-
|
| 11 |
-
name: "
|
| 12 |
type: "sparse-embeddings"
|
| 13 |
|
| 14 |
splade-large-query:
|
|
@@ -17,4 +17,4 @@ models:
|
|
| 17 |
|
| 18 |
bge-v2-m3:
|
| 19 |
name: "BAAI/bge-reranker-v2-m3"
|
| 20 |
-
type: "rerank"
|
|
|
|
| 7 |
name: "google/embeddinggemma-300M"
|
| 8 |
type: "embeddings"
|
| 9 |
|
| 10 |
+
splade-large-doc:
|
| 11 |
+
name: "naver/efficient-splade-VI-BT-large-doc"
|
| 12 |
type: "sparse-embeddings"
|
| 13 |
|
| 14 |
splade-large-query:
|
|
|
|
| 17 |
|
| 18 |
bge-v2-m3:
|
| 19 |
name: "BAAI/bge-reranker-v2-m3"
|
| 20 |
+
type: "rerank"
|
vectordb/milvus_client.py
ADDED
|
File without changes
|
vectordb/utils.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from scipy.sparse import csr_matrix
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def convert_sparse_to_csr(sparse_dict: Dict[str, List]) -> csr_matrix:
    """
    Convert a single sparse embedding into a one-row scipy CSR matrix.

    API format: {"indices": [10, 25, 42], "values": [0.85, 0.62, 0.91]}
    Milvus format: scipy.sparse.csr_matrix with shape (1, max(indices) + 1),
    or (1, 1) when the embedding has no entries.

    Note: scipy sums the values of duplicate indices when building the
    matrix, so repeated positions are merged rather than rejected.

    Args:
        sparse_dict: Dictionary with 'indices' (column positions) and
            'values' (the weight stored at each position).

    Returns:
        scipy CSR matrix holding the embedding in row 0.

    Raises:
        KeyError: If 'indices' or 'values' is missing from sparse_dict.
        ValueError: If 'indices' and 'values' differ in length.
    """
    indices = sparse_dict["indices"]
    values = sparse_dict["values"]

    # Fail early with a precise message instead of relying on scipy's
    # generic length-mismatch error.
    if len(indices) != len(values):
        raise ValueError(
            "'indices' and 'values' must have the same length, "
            f"got {len(indices)} and {len(values)}"
        )

    # len() rather than truthiness so array-like inputs work as well as lists.
    max_dim = max(indices) + 1 if len(indices) > 0 else 1

    # Shape is (1, max_dim) because this is a single vector:
    # every entry lives in row 0.
    row_indices = [0] * len(indices)

    return csr_matrix(
        (values, (row_indices, indices)),
        shape=(1, max_dim),
    )
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def batch_convert_sparse_to_csr(
    sparse_list: List[Dict[str, List]],
    default_dim: int = 30000,
) -> csr_matrix:
    """
    Convert a batch of sparse embeddings into a single CSR matrix.

    Row i of the result holds the embedding from sparse_list[i]. The number
    of columns is the largest index found in the batch plus one; when no
    embedding in the batch has any entry, default_dim columns are used.

    Args:
        sparse_list: List of sparse dicts, each with 'indices' and 'values'.
        default_dim: Column count used when the whole batch is empty of
            entries. Defaults to 30000, the value previously hard-coded here.
            NOTE(review): BERT-based SPLADE checkpoints commonly use a
            30522-token vocabulary - confirm the intended default.

    Returns:
        scipy CSR matrix with shape (batch_size, max_dim); an empty (0, 0)
        matrix when sparse_list itself is empty.

    Raises:
        KeyError: If a dict is missing 'indices' or 'values'.
        ValueError: If a dict's 'indices' and 'values' differ in length.
    """
    if not sparse_list:
        return csr_matrix((0, 0))

    # Build row indices, column indices, and values in one pass.
    row_indices = []
    col_indices = []
    values = []

    for row_idx, sparse_dict in enumerate(sparse_list):
        indices = sparse_dict["indices"]
        vals = sparse_dict["values"]

        # Validate per row: mismatches that cancel out across rows would
        # otherwise pass scipy's total-length check and silently attach
        # weights to the wrong cells.
        if len(indices) != len(vals):
            raise ValueError(
                f"sparse_list[{row_idx}]: 'indices' and 'values' must have "
                f"the same length, got {len(indices)} and {len(vals)}"
            )

        row_indices.extend([row_idx] * len(indices))
        col_indices.extend(indices)
        values.extend(vals)

    # Width is the largest column index seen, plus one; fall back to the
    # default when the batch carries no usable index at all.
    max_dim = max(col_indices) + 1 if col_indices else 0
    if max_dim <= 0:
        max_dim = default_dim

    return csr_matrix(
        (values, (row_indices, col_indices)),
        shape=(len(sparse_list), max_dim),
    )
|