Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from dataclasses import dataclass | |
| from typing import List, Tuple, Dict | |
| import pandas as pd | |
| import numpy as np | |
| from rank_bm25 import BM25Okapi | |
| from sentence_transformers import SentenceTransformer | |
| import json | |
| with open("/app/data.json", "r", encoding="utf-8") as f: | |
| DATA = json.load(f) | |
| class Doc: | |
| id: int | |
| text: str | |
| def normalize_text(s: str) -> str: | |
| return s.lower() | |
| def build_corpus() -> List[Doc]: | |
| corpus = [] | |
| for row in DATA: | |
| text = f"{row['year']}. {row['title']}. {row['abstract']}" | |
| corpus.append(Doc(id=row['id'], text=normalize_text(text))) | |
| return corpus | |
| def bm25_search(corpus: List[Doc], query: str, k: int = 5) -> List[Tuple[int, float]]: | |
| tokenized_corpus = [doc.text.split() for doc in corpus] | |
| bm25 = BM25Okapi(tokenized_corpus) | |
| scores = bm25.get_scores(query.split()) | |
| idxs = np.argsort(scores)[::-1][:k] | |
| return [(corpus[i].id, float(scores[i])) for i in idxs] | |
| def show(results, title="Résultats"): | |
| import pandas as pd | |
| rows = [] | |
| for rank, (doc_id, score) in enumerate(results, start=1): | |
| row = next(item for item in DATA if item['id'] == doc_id) | |
| rows.append({ | |
| 'rank': rank, | |
| 'id': doc_id, | |
| 'title': row['title'], | |
| 'score': round(score, 4) | |
| }) | |
| return pd.DataFrame(rows) | |
| class DenseIndex: | |
| def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): | |
| self.model = SentenceTransformer(model_name) | |
| self.index = None | |
| self.embeddings = None | |
| def encode(self, texts: List[str]): | |
| vecs = self.model.encode(texts, normalize_embeddings=True, show_progress_bar=False) | |
| return np.asarray(vecs, dtype='float32') | |
| def build(self, docs: List[Doc]): | |
| texts = [d.text for d in docs] | |
| X = self.encode(texts) | |
| try: | |
| import faiss # type: ignore | |
| dim = X.shape[1] | |
| index = faiss.IndexFlatIP(dim) | |
| index.add(X) | |
| self.index = index | |
| self.embeddings = None | |
| except Exception: | |
| self.index = None | |
| self.embeddings = X | |
| def search(self, query: str, k: int = 5) -> List[Tuple[int, float]]: | |
| q = self.encode([query])[0:1] | |
| if self.index is not None: | |
| import faiss # type: ignore | |
| D, I = self.index.search(q, k) | |
| scores = D[0].tolist() | |
| idxs = I[0].tolist() | |
| else: | |
| X = self.embeddings | |
| scores = (X @ q[0]) | |
| idxs = np.argsort(scores)[::-1][:k].tolist() | |
| scores = scores[idxs].tolist() | |
| return [(DATA[i]['id'], float(scores[j])) for j, i in enumerate(idxs)] | |
| def rrf_fusion(results: Dict[str, List[Tuple[int, float]]], k: int = 5, K: int = 60) -> List[Tuple[int, float]]: | |
| ranks_by_sys: Dict[str, Dict[int, int]] = {} | |
| for name, lst in results.items(): | |
| ranks = {} | |
| for rank, (doc_id, _score) in enumerate(lst, start=1): | |
| ranks[doc_id] = rank | |
| ranks_by_sys[name] = ranks | |
| all_ids = set() | |
| for ranks in ranks_by_sys.values(): | |
| all_ids |= set(ranks.keys()) | |
| fused = [] | |
| for doc_id in all_ids: | |
| s = 0.0 | |
| for ranks in ranks_by_sys.values(): | |
| if doc_id in ranks: | |
| r = ranks[doc_id] | |
| s += 1.0 / (K + r) | |
| fused.append((doc_id, s)) | |
| fused.sort(key=lambda x: x[1], reverse=True) | |
| return fused[:k] | |
| def rerank_cross_encoder(query: str, doc_ids: List[int]): | |
| pairs = [(query, normalize_text(f"{d['title']}. {d['abstract']}")) for d in DATA if d['id'] in doc_ids] | |
| try: | |
| from FlagEmbedding import FlagReranker | |
| reranker = FlagReranker('BAAI/bge-reranker-base', use_fp16=True) | |
| scores = reranker.compute_score(pairs, normalize=True) | |
| id_order = [d['id'] for d in DATA if d['id'] in doc_ids] | |
| return list(zip(id_order, [float(s) for s in scores])) | |
| except Exception as e: | |
| import numpy as np | |
| print("Reranker indisponible (fallback aléatoire pour la démo).") | |
| rng = np.random.default_rng(123) | |
| scores = rng.random(len(pairs)) | |
| id_order = [d['id'] for d in DATA if d['id'] in doc_ids] | |
| return list(zip(id_order, [float(s) for s in scores])) | |
| def orchestrate_search(query: str, k: int = 5, do_rerank: bool = False): | |
| qn = normalize_text(query) | |
| # 1. BM25 Search | |
| bm25_res = bm25_search(corpus, qn, k) | |
| sparse_df = show(bm25_res, title="BM25") | |
| # 2. Dense Retrieval | |
| dense_res = dense.search(qn, k) | |
| dense_df = show(dense_res, title="Dense (Embeddings)") | |
| # 3. RRF Fusion | |
| hybrid_res = rrf_fusion({"sparse": bm25_res, "dense": dense_res}, k) | |
| hybrid_df = show(hybrid_res, title="Fusion Hybride (RRF)") | |
| # 4. Optional Reranking | |
| rerank_df = pd.DataFrame() | |
| if do_rerank: | |
| doc_ids_for_rerank = [doc_id for doc_id, _ in hybrid_res] | |
| rerank_res = rerank_cross_encoder(qn, doc_ids_for_rerank) | |
| rerank_res.sort(key=lambda x: x[1], reverse=True) | |
| rerank_df = show(rerank_res, title="Reranking (cross-encoder)") | |
| return sparse_df, dense_df, hybrid_df, rerank_df | |
| def gradio_interface(query: str, k: int, do_rerank: bool): | |
| sparse_df, dense_df, hybrid_df, rerank_df = orchestrate_search(query, k, do_rerank) | |
| return sparse_df, dense_df, hybrid_df, rerank_df | |
| with gr.Blocks() as demo: | |
| corpus = build_corpus() | |
| dense = DenseIndex() | |
| dense.build(corpus) | |
| gr.Markdown("# Hybrid Search Pipeline Demo") | |
| with gr.Row(): | |
| query_input = gr.Textbox(label="Query", placeholder="Enter your search query here...") | |
| k_input = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of results (k)") | |
| rerank_checkbox = gr.Checkbox(label="Enable Reranking (cross-encoder)", value=False) | |
| search_button = gr.Button("Run Search") | |
| with gr.Tabs(): | |
| with gr.TabItem("BM25 (Sparse)"): | |
| bm25_output = gr.DataFrame(headers=["rank", "id", "title", "score"], datatype=["number", "number", "str", "number"]) | |
| with gr.TabItem("Dense (Embeddings)"): | |
| dense_output = gr.DataFrame(headers=["rank", "id", "title", "score"], datatype=["number", "number", "str", "number"]) | |
| with gr.TabItem("Hybrid (RRF Fusion)"): | |
| hybrid_output = gr.DataFrame(headers=["rank", "id", "title", "score"], datatype=["number", "number", "str", "number"]) | |
| with gr.TabItem("Reranked Results"): | |
| rerank_output = gr.DataFrame(headers=["rank", "id", "title", "score"], datatype=["number", "number", "str", "number"]) | |
| with gr.TabItem("Corpus Data"): | |
| gr.DataFrame(pd.DataFrame(DATA), label="Original Corpus Data") | |
| search_button.click( | |
| gradio_interface, | |
| inputs=[query_input, k_input, rerank_checkbox], | |
| outputs=[bm25_output, dense_output, hybrid_output, rerank_output] | |
| ) | |
| demo.launch() |