Spaces:

Shankarm08
/

pdfcsvdatarag

Sleeping

App Files Files Community

Shankarm08 committed on Oct 6, 2024

Commit

4995935

verified ·

1 Parent(s): 930f177

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -53

app.py CHANGED Viewed

@@ -1,87 +1,120 @@
 import streamlit as st
-import torch
-from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
-from datasets import load_dataset
 import pandas as pd
 import pdfplumber
 import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
-# Load RAG model, tokenizer, and retriever
-tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
-model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
-# Function to get RAG embeddings
-def get_rag_embeddings(question, context):
-    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
-    with torch.no_grad():
-        output = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
-    return tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-# Extract text from PDF
 def extract_text_from_pdf(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
-        text = ""
         for page in pdf.pages:
             page_text = page.extract_text()
-            if page_text:  # Check if the page has extractable text
                 text += page_text + "\n"
     return text
-# Load dataset (using SQuAD v2 as a placeholder)
-def load_squad_v2():
-    return load_dataset('squad_v2')
-# Store the PDF text and embeddings
-pdf_text = ""
-pdf_embeddings = None
-csv_data = None
-# Streamlit app UI
-st.title("RAG-Powered PDF & CSV Chatbot")
-# CSV file upload
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if csv_file:
     csv_data = pd.read_csv(csv_file)
-    st.write("CSV file loaded successfully!")
     st.write(csv_data)
-# PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
     if pdf_text.strip():
-        st.success("PDF loaded successfully!")
-        st.text_area("Extracted Text from PDF", pdf_text, height=200)
     else:
         st.warning("No extractable text found in the PDF.")
-# Load the SQuAD v2 dataset as an example for RAG retrieval
-dataset = load_squad_v2()
-# User input for chatbot
-user_input = st.text_input("Ask a question related to the PDF or CSV:")
-# Get response on button click
-if st.button("Get Response"):
-    if not pdf_text and csv_data is None:
-        st.warning("Please upload a PDF or CSV file first.")
-    else:
-        # Combine PDF text and CSV content for context in RAG
-        combined_context = ""
-        if pdf_text:
-            combined_context += pdf_text
-        if csv_data is not None:
-            combined_context += "\n" + csv_data.to_string()
-        # Get RAG-generated response
-        try:
-            response = get_rag_embeddings(user_input, combined_context)
             st.write("### Response:")
-            st.write(response)
-        except Exception as e:
-            st.error(f"Error while processing the question: {e}")

 import streamlit as st
 import pandas as pd
 import pdfplumber
+import torch
+import faiss
 import numpy as np
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
# Sentence Transformer model used to embed both text chunks and questions
@st.cache_resource
def load_embedder():
    """Return the SentenceTransformer embedding model.

    Decorated with ``st.cache_resource`` so the model object is created once
    and reused instead of being reloaded on every Streamlit script rerun.
    """
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)


embedder = load_embedder()
# Generative model used to produce the final answer text
@st.cache_resource
def load_generator():
    """Return a GPT-2 ``text-generation`` pipeline.

    The pipeline is placed on GPU 0 when CUDA is available and on the CPU
    (device ``-1``) otherwise. Decorated with ``st.cache_resource`` so the
    pipeline is built once and reused across script reruns.
    """
    if torch.cuda.is_available():
        device = 0
    else:
        device = -1
    return pipeline(
        'text-generation',
        model='gpt2',
        tokenizer='gpt2',
        device=device,
    )


generator = load_generator()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in ``pdf_file``, one page per line group.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (here, the object
            returned by the Streamlit file uploader).

    Returns:
        The extracted page texts, each followed by a newline, concatenated
        in page order. Pages that yield no text are skipped, so the result
        is an empty string when nothing could be extracted.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract while the document is still open.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
# Function to split text into chunks
def split_text(text, chunk_size=500):
    """Split ``text`` into sentence-aligned chunks of at most ``chunk_size`` chars.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed
    into the same chunk (joined by a single space) for as long as the chunk
    stays within ``chunk_size`` characters.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters. A single sentence
            longer than this is not broken up; it becomes its own
            (oversized) chunk.

    Returns:
        A list of non-empty chunk strings in document order. Empty or
        whitespace-only input yields an empty list.
    """
    pieces = text.split('. ')
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = ""
    for position, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            # Blank fragments (e.g. from ". . ." or empty input) carry no
            # content and must not produce empty or "."-only chunks.
            continue
        if position != last_index:
            # The split consumed this sentence's period; put it back. The
            # final piece is kept verbatim so no extra period is invented.
            sentence += "."

        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= chunk_size:
            current_chunk = candidate
        else:
            # Flush only when there is something to flush; otherwise an
            # oversized first sentence would emit an empty chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Function to build FAISS index
def build_faiss_index(chunks):
    """Embed ``chunks`` and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: List of text chunks to embed with the module-level
            ``embedder``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix that was added to it (one row per
        chunk, in the same order as ``chunks``).
    """
    # The matrix is converted to float32 before being handed to FAISS.
    vectors = np.array(embedder.encode(chunks), dtype='float32')
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index, vectors
# Streamlit app: upload a CSV and/or PDF, index the text, answer questions.
st.title("PDF and CSV Chatbot with RAG")

# Upload CSV file
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
csv_text = ""
if csv_file:
    csv_data = pd.read_csv(csv_file)
    st.write("### CSV Data:")
    st.write(csv_data)
    csv_text = csv_data.to_csv(index=False)

# Upload PDF file
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
pdf_text = ""
if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.strip():
        st.write("### PDF Text:")
        st.write(pdf_text)
    else:
        st.warning("No extractable text found in the PDF.")

# Combine texts
combined_text = csv_text + "\n" + pdf_text
if combined_text.strip():
    # Split text into chunks; drop blank ones so they are never embedded
    # or retrieved as context.
    chunks = [chunk for chunk in split_text(combined_text) if chunk.strip()]
    # Build FAISS index (the raw embedding matrix is not needed here)
    index, _ = build_faiss_index(chunks)
    # Prepare for user input
    user_input = st.text_input("Ask a question about the uploaded data:")
    if st.button("Get Response"):
        if user_input.strip():
            # Get embedding of user question
            question_embedding = np.array(
                embedder.encode([user_input])
            ).astype('float32')
            # Never ask for more neighbours than there are chunks: FAISS pads
            # missing results with label -1, which would index chunks[-1].
            k = min(3, len(chunks))
            _, indices = index.search(question_embedding, k)
            # Retrieve the most relevant chunks, ignoring any -1 padding
            retrieved_chunks = [chunks[idx] for idx in indices[0] if idx >= 0]
            # Combine retrieved chunks
            context = " ".join(retrieved_chunks)
            # Generate answer
            prompt = context + "\n\nQuestion: " + user_input + "\nAnswer:"
            # max_new_tokens budgets only the generated continuation, whereas
            # max_length would also count the (often long) prompt tokens.
            # return_full_text=False returns just the continuation, so the
            # answer does not have to be cut out by searching for "Answer:",
            # which breaks when the retrieved context contains that string.
            response = generator(
                prompt,
                max_new_tokens=200,
                num_return_sequences=1,
                return_full_text=False,
            )
            # Display response
            st.write("### Response:")
            st.write(response[0]['generated_text'].strip())
        else:
            st.warning("Please enter a question.")
else:
    st.info("Please upload a CSV file or a PDF file to proceed.")