fahmiaziz98 committed on
Commit
66f2fb1
·
1 Parent(s): 95cd425

add model splade-large-doc

Browse files
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
  fastapi[standard]==0.116.2
2
  uvicorn==0.35.0
 
 
3
  torch==2.8.0
4
  sentence-transformers==5.1.1
5
  loguru==0.7.3
@@ -8,4 +10,4 @@ pydantic-settings==2.11.0
8
  pyyaml==6.0.3
9
  ruff==0.14.3
10
  pinecone==7.3.0
11
- ipykernel==7.1.0
 
1
  fastapi[standard]==0.116.2
2
  uvicorn==0.35.0
3
+ numpy==2.0.2
4
+ scipy==1.16.3
5
  torch==2.8.0
6
  sentence-transformers==5.1.1
7
  loguru==0.7.3
 
10
  pyyaml==6.0.3
11
  ruff==0.14.3
12
  pinecone==7.3.0
13
+ ipykernel==7.1.0
src/config/models.yaml CHANGED
@@ -7,8 +7,8 @@ models:
7
  name: "google/embeddinggemma-300M"
8
  type: "embeddings"
9
 
10
- splade-pp-v2:
11
- name: "prithivida/Splade_PP_en_v2"
12
  type: "sparse-embeddings"
13
 
14
  splade-large-query:
@@ -17,4 +17,4 @@ models:
17
 
18
  bge-v2-m3:
19
  name: "BAAI/bge-reranker-v2-m3"
20
- type: "rerank"
 
7
  name: "google/embeddinggemma-300M"
8
  type: "embeddings"
9
 
10
+ splade-large-doc:
11
+ name: "naver/efficient-splade-VI-BT-large-doc"
12
  type: "sparse-embeddings"
13
 
14
  splade-large-query:
 
17
 
18
  bge-v2-m3:
19
  name: "BAAI/bge-reranker-v2-m3"
20
+ type: "rerank"
vectordb/milvus_client.py ADDED
File without changes
vectordb/utils.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from scipy.sparse import csr_matrix
3
+ from typing import Dict, List
4
+
5
+
6
def convert_sparse_to_csr(sparse_dict: Dict[str, List]) -> csr_matrix:
    """
    Convert a single sparse embedding to a one-row scipy CSR matrix.

    API format: {"indices": [10, 25, 42], "values": [0.85, 0.62, 0.91]}
    Milvus format: scipy.sparse.csr_matrix with shape (1, max_dimension)

    Args:
        sparse_dict: Dictionary with 'indices' (column positions) and
            'values' (the weight stored at each position). Both may be
            lists, tuples or 1-D NumPy arrays, and must have equal length.

    Returns:
        scipy CSR matrix of shape ``(1, max(indices) + 1)``, or ``(1, 1)``
        with no stored entries when 'indices' is empty.

    Raises:
        KeyError: If 'indices' or 'values' is missing from ``sparse_dict``.
        ValueError: If 'indices' and 'values' differ in length.
    """
    indices = sparse_dict["indices"]
    values = sparse_dict["values"]

    # Use len() rather than truthiness: ``if indices`` raises for NumPy
    # arrays holding more than one element.
    nnz = len(indices)
    if nnz != len(values):
        raise ValueError(
            f"'indices' and 'values' must have the same length "
            f"(got {nnz} and {len(values)})"
        )

    # The width is just large enough to hold the highest index; an empty
    # vector still needs at least one column to form a valid matrix.
    max_dim = int(max(indices)) + 1 if nnz else 1

    # Shape is (1, max_dim) because this is a single vector, so every
    # entry lives in row 0.
    row_indices = np.zeros(nnz, dtype=np.int64)

    return csr_matrix(
        (values, (row_indices, indices)),
        shape=(1, max_dim),
    )
35
+
36
+
37
def batch_convert_sparse_to_csr(
    sparse_list: List[Dict[str, List]],
    default_dim: int = 30000,
) -> csr_matrix:
    """
    Convert a batch of sparse embeddings to a single CSR matrix.

    Each input dict becomes one row, in order. The column count is the
    highest index found anywhere in the batch plus one, so it depends on
    the data and can differ between calls.

    Args:
        sparse_list: List of sparse dicts, each with 'indices' and 'values'
            of equal length.
        default_dim: Column count used when no entry in the batch has any
            index. Defaults to 30000, the value previously hard-coded here.
            NOTE(review): BERT-based SPLADE checkpoints usually have a
            30522-token vocabulary - confirm which width callers expect.

    Returns:
        scipy CSR matrix with shape (batch_size, max_dim); an empty
        ``(0, 0)`` matrix when ``sparse_list`` is empty.

    Raises:
        KeyError: If an entry lacks 'indices' or 'values'.
        ValueError: If an entry's 'indices' and 'values' differ in length.
    """
    batch_size = len(sparse_list)
    if batch_size == 0:
        return csr_matrix((0, 0))

    # Flattened COO triplets for the whole batch.
    row_indices = []
    col_indices = []
    values = []
    max_dim = 0

    for row_idx, sparse_dict in enumerate(sparse_list):
        indices = sparse_dict["indices"]
        vals = sparse_dict["values"]

        # Validate per entry: once flattened, opposite mismatches in two
        # entries can cancel out and silently shift weights onto the wrong
        # row/column instead of failing.
        count = len(indices)
        if count != len(vals):
            raise ValueError(
                f"Entry {row_idx}: 'indices' and 'values' must have the "
                f"same length (got {count} and {len(vals)})"
            )
        if count == 0:
            continue

        max_dim = max(max_dim, int(max(indices)) + 1)
        row_indices.extend([row_idx] * count)
        col_indices.extend(indices)
        values.extend(vals)

    # No entry carried an index, so fall back to the configured width.
    if max_dim == 0:
        max_dim = default_dim

    return csr_matrix(
        (values, (row_indices, col_indices)),
        shape=(batch_size, max_dim),
    )