import os
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
import chromadb
from sentence_transformers import SentenceTransformer
import PyPDF2
import fitz  # PyMuPDF
from langchain_text_splitters import CharacterTextSplitter
from config import settings
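# NOTE: `settings` is expected to expose KNOWLEDGE_BASE_PATH, COLLECTION_NAME,
# EMBEDDING_PATH, and EMBEDDING_MODEL_NAME (see the defaults used in __init__ below).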

class RAGSystem:
    """
    Robust Retrieval-Augmented Generation (RAG) system backed by ChromaDB and SentenceTransformers.
    Paths, the collection name, and the embedding model name default to values from config.settings.
    Emoji in log messages are intentional, user-facing progress feedback.
    """
    def __init__(
        self,
        knowledge_base_path: Optional[str] = None,
        collection_name: Optional[str] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        db_path: Optional[str] = None,
        embedding_model_name: Optional[str] = None
    ):
        # Use config if not provided
        self.knowledge_base_path = Path(knowledge_base_path or settings.KNOWLEDGE_BASE_PATH)
        self.collection_name = collection_name or settings.COLLECTION_NAME
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.db_path = str(db_path or settings.EMBEDDING_PATH)
        self.embedding_model_name = embedding_model_name or settings.EMBEDDING_MODEL_NAME

        print(f"๐Ÿ”„ Loading embedding model: {self.embedding_model_name}")
        self.embedding_model = SentenceTransformer(self.embedding_model_name)
        self.embedding_dimension = len(self.embedding_model.encode(["test"], convert_to_tensor=False)[0])
        print(f"๐Ÿ“ Model dimension: {self.embedding_dimension}")

        print("๐Ÿ”„ Initializing ChromaDB...")
        self.chroma_client = chromadb.PersistentClient(path=self.db_path)
        self.collection = self._get_or_create_collection()
        print(f"โœ… Robust RAG System initialized!")

    def _get_or_create_collection(self):
        """Check if collection exists and is compatible, otherwise recreate."""
        try:
            colls = self.chroma_client.list_collections()
            col_exists = any(c.name == self.collection_name for c in colls)
            if col_exists:
                col = self.chroma_client.get_collection(self.collection_name)
                if col.count() > 0:
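                    # Probe the existing collection with one test embedding; a dimension
                    # error here means it was built with a different embedding model.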
                    try:
                        test_vec = self.embedding_model.encode(["test"], convert_to_tensor=False)[0].tolist()
                        col.query(query_embeddings=[test_vec], n_results=1)
                        print("โœ… Existing collection is compatible.")
                        return col
                    except Exception as e:
                        if "dimension" in str(e).lower():
                            print(f"โš ๏ธ Dimension mismatch: {e}")
                print("๐Ÿ—‘๏ธ Recreating collection...")
                self.chroma_client.delete_collection(self.collection_name)
            # Create new collection
            print("๐Ÿ†• Creating new collection...")
            return self.chroma_client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "hnsw:space": "cosine",
                    "model_name": self.embedding_model_name,
                    "dimension": self.embedding_dimension
                }
            )
        except Exception as e:
            print(f"โŒ Error managing collection: {e}")
            raise

    def build_knowledge_base(self) -> None:
        """Create knowledge base from source files."""
        print("๐Ÿš€ Building knowledge base...")
        start = time.time()
        self.clear_database()
        chunks = self._process_documents()
        if not chunks:
            print("โŒ No documents processed!")
            return
        self._create_and_save_embeddings(chunks)
        print(f"๐ŸŽ‰ Knowledge base built in {time.time() - start:.2f}s!")
        self.show_database_stats()

    def _process_documents(self) -> List[Dict[str, Any]]:
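        """Walk the knowledge base directory and split every .md/.txt/.pdf file into chunks."""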
        if not self.knowledge_base_path.exists():
            print(f"โŒ Knowledge base not found: {self.knowledge_base_path}")
            return []
        chunks = []
        exts = ['.md', '.txt', '.pdf']
        for root, _, files in os.walk(self.knowledge_base_path):
            root_path = Path(root)
            category = root_path.relative_to(self.knowledge_base_path).parts[0] if root_path != self.knowledge_base_path else "root"
            for file in files:
                file_path = root_path / file
                if file_path.suffix.lower() in exts:
                    content = self._read_file_content(file_path)
                    if content:
                        new_chunks = self._create_chunks_from_text(content, str(file_path), category, file_path.name)
                        chunks += new_chunks
                        print(f"✅ Processed {file_path.name}: {len(new_chunks)} chunks")
        print(f"📊 Total chunks created: {len(chunks)}")
        return chunks

    def _create_and_save_embeddings(self, chunks: List[Dict[str, Any]]) -> None:
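        """Embed all chunk texts with the SentenceTransformer model and persist them to ChromaDB."""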
        texts = [c['text'] for c in chunks]
        ids = [c['id'] for c in chunks]
        metas = [c['metadata'] for c in chunks]
        print(f"๐Ÿ”ฎ Generating embeddings for {len(texts)} chunks...")
        embeddings = self.embedding_model.encode(texts, batch_size=16, show_progress_bar=True, convert_to_tensor=False)
        assert len(embeddings[0]) == self.embedding_dimension, "Embedding dimension mismatch"
        print("๐Ÿ’พ Saving to ChromaDB...")
        self.collection.add(
            embeddings=[e.tolist() if hasattr(e, "tolist") else list(e) for e in embeddings],
            documents=texts,
            metadatas=metas,
            ids=ids
        )
        print("โœ… Saved to ChromaDB!")

    def _read_file_content(self, file_path: Path) -> str:
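        """Return a file's text content; PDFs use PyMuPDF with a PyPDF2 fallback, other files try several encodings."""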
        try:
            if file_path.suffix.lower() == '.pdf':
                try:
                    with fitz.open(str(file_path)) as doc:
                        return "".join(page.get_text() for page in doc)
                except Exception:
                    with open(file_path, 'rb') as f:
                        reader = PyPDF2.PdfReader(f)
                        return "".join(page.extract_text() for page in reader.pages)
            else:
                for enc in ['utf-8', 'utf-8-sig', 'cp874', 'latin-1']:
                    try:
                        with open(file_path, 'r', encoding=enc) as f:
                            return f.read()
                    except Exception:
                        continue
        except Exception as e:
            print(f"โŒ Error reading {file_path.name}: {e}")
        return ""

    def _create_chunks_from_text(self, text: str, source: str, category: str, filename: str) -> List[Dict[str, Any]]:
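        """Split text on the '---' separator into overlapping chunks and attach source metadata to each one."""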
        splitter = CharacterTextSplitter(
            separator="---",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks_text = splitter.split_text(text)
        return [
            {
                'id': f"{category}_{Path(filename).stem}_{i}",
                'text': chunk.strip(),
                'metadata': {
                    'source': source,
                    'category': category,
                    'chunk_id': f"{category}_{Path(filename).stem}_{i}",
                    'chunk_number': i,
                    'filename': filename
                }
            }
            for i, chunk in enumerate(chunks_text)
        ]

    def query(self, query_text: str, top_k: int = 5, category_filter: Optional[str] = None) -> str:
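        """Embed the query, retrieve the top_k most similar chunks (optionally filtered by category), and format them as context."""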
        print(f"๐Ÿ” Querying: '{query_text[:50]}...'")
        query_embedding = self.embedding_model.encode([query_text], convert_to_tensor=False)[0]
        where_clause = {"category": category_filter} if category_filter else None
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist() if hasattr(query_embedding, "tolist") else list(query_embedding)],
            n_results=top_k,
            where=where_clause
        )
        if not results['documents'][0]:
            return "No relevant information found."
        return "\n---\n".join(
            f"[Context {i+1}]\nSource: {m['filename']} (Category: {m['category']})\nContent: {doc.strip()}\n"
            for i, (doc, m) in enumerate(zip(results['documents'][0], results['metadatas'][0]))
        )

    def show_database_stats(self) -> None:
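        """Print chunk count, embedding model info, and per-category chunk counts for the current collection."""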
        try:
            count = self.collection.count()
            meta = self.collection.metadata or {}
            print("\n📊 Database Stats:")
            print(f"   Total chunks: {count}")
            print(f"   Model: {meta.get('model_name', 'Unknown')}")
            print(f"   Dimension: {meta.get('dimension', 'Unknown')}")
            if count > 0:
                all_results = self.collection.get()
                categories = {}
                for m in all_results['metadatas']:
                    categories[m['category']] = categories.get(m['category'], 0) + 1
                print(f"   Categories: {len(categories)}")
        except Exception as e:
            print(f"โŒ Error getting stats: {e}")

    def clear_database(self) -> None:
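        """Delete the current collection and recreate it empty."""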
        print("๐Ÿ—‘๏ธ Clearing database...")
        try:
            self.chroma_client.delete_collection(self.collection_name)
            self.collection = self._get_or_create_collection()
            print("โœ… Database cleared!")
        except Exception as e:
            print(f"โŒ Error clearing database: {e}")

if __name__ == "__main__":
    rag = RAGSystem()  # Uses config.settings by default
    rag.build_knowledge_base()
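
    # Example retrieval (illustrative only: the question text and top_k value are
    # arbitrary and depend on what the knowledge base actually contains).
    context = rag.query("What topics does the knowledge base cover?", top_k=3)
    print(context)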