# https://python.langchain.com/docs/tutorials/rag/
import gradio as gr
import requests
import bs4
from pathlib import Path
from urllib.parse import urljoin
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.1,    # MistralAI free tier: roughly one request every 10 seconds
    check_every_n_seconds=0.01, # wake up every 10 ms to check whether a request is allowed
    max_bucket_size=10,         # controls the maximum burst size
)
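# With these settings the limiter's token bucket refills at 0.1 tokens per second,
# i.e. a sustained rate of one request every 10 seconds, with bursts of up to
# max_bucket_size=10 requests. As an aside (not used directly in this app), a
# token can also be requested by hand:
#
#     rate_limiter.acquire()  # blocks until a token is available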
# Download the documentation PDF
url = "https://camels.readthedocs.io/_/downloads/en/latest/pdf/"
r = requests.get(url)
r.raise_for_status()
document_path = Path("data.pdf")
document_path.write_bytes(r.content)
# document_path = "camels-readthedocs-io-en-latest.pdf"
loader = PyPDFLoader(str(document_path))
docs = loader.load()
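# Optional sanity check (an addition, not in the original Space): PyPDFLoader
# yields one Document per PDF page, so an empty list means the download or
# parsing step silently failed.
assert len(docs) > 0, "No pages were loaded from data.pdf"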
# # Load, chunk and index the contents of the blog.
# url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
# loader = WebBaseLoader(
#     web_paths=(url,),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer(
#             class_=("post-content", "post-title", "post-header")
#         )
#     ),
# )
# loader = WebBaseLoader(url)
# docs = loader.load()
# def get_subpages(base_url):
#     visited_urls = []
#     urls_to_visit = [base_url]
#     while urls_to_visit:
#         url = urls_to_visit.pop(0)
#         if url in visited_urls:
#             continue
#         visited_urls.append(url)
#         response = requests.get(url)
#         soup = bs4.BeautifulSoup(response.content, "html.parser")
#         for link in soup.find_all("a", href=True):
#             full_url = urljoin(base_url, link["href"])
#             if base_url in full_url and not full_url.endswith(".html") and full_url not in visited_urls:
#                 urls_to_visit.append(full_url)
#     visited_urls = visited_urls[1:]
#     return visited_urls

# base_url = "https://camels.readthedocs.io/en/latest/"
# # base_url = "https://carla.readthedocs.io/en/latest/"
# # urls = get_subpages(base_url)
# tokenfile = open("urls.txt")
# urls = tokenfile.readlines()
# urls = [url.replace("\n", "") for url in urls]
# tokenfile.close()
# print(urls)

# # Load, chunk and index the contents of the blog.
# loader = WebBaseLoader(urls)
# docs = loader.load()
def format_docs(docs):
    """Concatenate retrieved document chunks into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)
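# For example, two retrieved chunks with page_content "A" and "B" become the
# single string "A\n\nB", which fills the {context} slot of the RAG prompt below.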
def RAG(llm, docs, embeddings):
    """Build a retrieval-augmented generation chain over the given documents."""
    # Split text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()
    # Standard prompt template for RAG systems
    prompt = hub.pull("rlm/rag-prompt")
    # Compose the chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain
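# Illustrative usage of the returned chain (the question below is hypothetical,
# not part of the original app):
#
#     chain = RAG(llm, docs, embeddings)
#     print(chain.invoke("What is CAMELS?"))         # one-shot answer
#     for chunk in chain.stream("What is CAMELS?"):  # token-by-token streaming
#         print(chunk, end="")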
# LLM model
llm = ChatMistralAI(model="mistral-large-latest", rate_limiter=rate_limiter)
# Embeddings: a sentence-transformers retrieval model
embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
# embed_model = "nvidia/NV-Embed-v2"
embeddings = HuggingFaceEmbeddings(model_name=embed_model)
# embeddings = MistralAIEmbeddings()
# RAG chain
rag_chain = RAG(llm, docs, embeddings)
def handle_prompt(message, history):
    try:
        # Stream the answer token by token
        out = ""
        for chunk in rag_chain.stream(message):
            out += chunk
            yield out
    except Exception:
        raise gr.Error("Requests rate limit exceeded")
greetingsmessage = "Hi, I'm the CAMELS DocBot. I'm here to assist you with any question related to the CAMELS simulations documentation."
example_questions = [
    "How can I read a halo file?",
    "Which simulation suites are included in CAMELS?",
    "Which are the largest volumes in CAMELS simulations?",
    "How can I get the power spectrum of a simulation?",
]
demo = gr.ChatInterface(
    handle_prompt,
    type="messages",
    title="CAMELS DocBot",
    examples=example_questions,
    theme=gr.themes.Soft(),
    description=greetingsmessage,
)
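# On a Hugging Face Space the platform exposes the app automatically; when run
# locally, launch() serves on http://127.0.0.1:7860 by default.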
demo.launch()