Commit 6c93f12
Parent(s): 215eca5
added support for docs
rag.py CHANGED

@@ -16,6 +16,7 @@ import json
 import base64
 from bs4 import BeautifulSoup
 import re
+from urllib.parse import urljoin, urlparse

 def github_to_raw(url):
     """Convert GitHub URL to raw content URL"""

@@ -144,144 +145,218 @@ class GitHubLoader(WebBaseLoader):
     text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
     return text.strip()

-    def …
-        """…
-        …
-            repo = parts[1]
-            branch = parts[3] # usually 'main' or 'master'
-            path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-            # Construct API URL
-            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
-            api_response = requests.get(api_url)
-            api_response.raise_for_status()
-            …
-            if isinstance(contents, list):
-                # Format directory contents
-                files = [f"{item['name']} ({item['type']})" for item in contents]
-                return "Directory contents:\n" + "\n".join(files)
-            else:
-                return f"Error: Unexpected API response for {url}"
-        …
-        # For other content, get main content
-        main_content = soup.find('main')
-        if main_content:
-            return self.clean_text(main_content.get_text())
-        …
-# Load documentation from urls
 def load_docs():
     # Get urls
-    …
-    urlsfile.close()
-
-    # Load documents from URLs
     docs = []

     for url in urls:
-        …
-                print(f"Error loading {url}: {str(e)}")
-        # Handle directory listings
-        elif '/tree/' in url and 'github.com' in url:
-            print(f"Loading directory: {url}")
-            try:
-                # Parse URL components
-                parts = url.replace("https://github.com/", "").split("/")
-                owner = parts[0]
-                repo = parts[1]
-                branch = parts[3] # usually 'main' or 'master'
-                path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-                # Construct API URL
-                api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
-                response = requests.get(api_url)
-                response.raise_for_status()
-                …
-                    # Format directory contents
-                    content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
-                    docs.append(Document(page_content=content, metadata={'source': url}))
-                else:
-                    print(f"Error: Unexpected API response for {url}")
-            except Exception as e:
-                print(f"Error loading directory {url}: {str(e)}")
-        else:
-            print(f"Loading web page: {url}")
-            try:
-                loader = GitHubLoader([url]) # Use custom loader
-                web_docs = loader.load()
-                docs.extend(web_docs)
-            except Exception as e:
-                print(f"Error loading {url}: {str(e)}")
-
-    # Add source URLs as document names for reference
-    for i, doc in enumerate(docs):
-        if 'source' in doc.metadata:
-            doc.metadata['name'] = doc.metadata['source']
-        else:
-            doc.metadata['name'] = f"Document {i+1}"

-    print(f"Loaded {len(docs)} documents:")
-    for doc in docs:
-        print(f" - {doc.metadata.get('name')}")
-
+    def lazy_load(self) -> list[Document]:
+        """Override lazy_load instead of _scrape to handle both BeautifulSoup and string returns."""
+        for url in self.web_paths:
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+
+                # For directory listings (tree URLs), use the API
+                if '/tree/' in url:
+                    # Parse URL components
+                    parts = url.replace("https://github.com/", "").split("/")
+                    owner = parts[0]
+                    repo = parts[1]
+                    branch = parts[3] # usually 'main' or 'master'
+                    path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+                    # Construct API URL
+                    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+                    api_response = requests.get(api_url)
+                    api_response.raise_for_status()
+
+                    # Parse directory listing
+                    contents = api_response.json()
+                    if isinstance(contents, list):
+                        # Format directory contents
+                        content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
+                        yield Document(
+                            page_content=self.clean_text(content),
+                            metadata={'source': url, 'type': 'github_directory'}
+                        )
+                    continue
+
+                # For regular files, parse HTML
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # For README and markdown files
+                readme_content = soup.find('article', class_='markdown-body')
+                if readme_content:
+                    yield Document(
+                        page_content=self.clean_text(readme_content.get_text()),
+                        metadata={'source': url, 'type': 'github_markdown'}
+                    )
+                    continue
+
+                # For code files
+                code_content = soup.find('table', class_='highlight')
+                if code_content:
+                    yield Document(
+                        page_content=self.clean_text(code_content.get_text()),
+                        metadata={'source': url, 'type': 'github_code'}
+                    )
+                    continue
+
+                # For other content, get main content
+                main_content = soup.find('main')
+                if main_content:
+                    yield Document(
+                        page_content=self.clean_text(main_content.get_text()),
+                        metadata={'source': url, 'type': 'github_other'}
+                    )
+                    continue
+
+                # Fallback to whole page content
+                yield Document(
+                    page_content=self.clean_text(soup.get_text()),
+                    metadata={'source': url, 'type': 'github_fallback'}
+                )
+
+            except Exception as e:
+                print(f"Error processing {url}: {str(e)}")
+                continue
+
+    def load(self) -> list[Document]:
+        """Load method that returns a list of documents."""
+        return list(self.lazy_load())
+
+class ReadTheDocsLoader(WebBaseLoader):
+    """Custom loader for ReadTheDocs pages"""
+
+    def __init__(self, base_url: str):
+        """Initialize with base URL of the documentation."""
+        super().__init__([])
+        self.base_url = base_url.rstrip('/')

+    def clean_text(self, text: str) -> str:
+        """Clean text content from ReadTheDocs pages."""
+        # Remove excessive whitespace and newlines
+        text = re.sub(r'\s{2,}', ' ', text)
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        # Remove common ReadTheDocs boilerplate
+        text = re.sub(r'View page source|Next|Previous|©.*?\.', '', text)
+        return text.strip()

+    def normalize_url(self, base_url: str, href: str) -> str:
+        """Normalize relative URLs to absolute URLs."""
+        # If it's already an absolute URL, return it
+        if href.startswith(('http://', 'https://')):
+            return href

+        # Handle relative URLs
+        return urljoin(base_url, href)

+    def get_all_pages(self) -> list[str]:
+        """Get all documentation pages starting from the base URL."""
+        visited = set()
+        to_visit = {self.base_url}
+        docs_urls = set()

+        while to_visit:
+            url = to_visit.pop()
+            if url in visited:
+                continue
+
+            visited.add(url)
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Add current page if it's a documentation page
+                if url.startswith(self.base_url):
+                    docs_urls.add(url)
+
+                # Find all links
+                for link in soup.find_all('a'):
+                    href = link.get('href')
+                    if not href:
+                        continue
+
+                    # Skip anchor links and external links
+                    if href.startswith('#') or href.startswith(('http://', 'https://')) and not href.startswith(self.base_url):
+                        continue
+
+                    # Normalize the URL
+                    full_url = self.normalize_url(url, href)
+
+                    # Only follow links within the documentation domain
+                    if full_url.startswith(self.base_url):
+                        to_visit.add(full_url)
+
+            except Exception as e:
+                print(f"Error fetching {url}: {str(e)}")
+
+        return list(docs_urls)

+    def load(self) -> list[Document]:
+        """Load all documentation pages."""
+        urls = self.get_all_pages()
+        docs = []

+        for url in urls:
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Get main content
+                main_content = soup.find('div', {'role': 'main'})
+                if not main_content:
+                    main_content = soup.find('main')
+                if not main_content:
+                    continue
+
+                # Clean content
+                content = self.clean_text(main_content.get_text())
+                if content:
+                    docs.append(Document(
+                        page_content=content,
+                        metadata={'source': url, 'type': 'readthedocs'}
+                    ))
+
+            except Exception as e:
+                print(f"Error processing {url}: {str(e)}")
+
+        return docs

 def load_docs():
+    """Load all documentation."""
     # Get urls
+    with open("urls.txt", "r") as f:
+        urls = [line.strip() for line in f.readlines()]
+
     docs = []

+    # Load GitHub content
     for url in urls:
+        if "github.com" in url or "raw.githubusercontent.com" in url:
+            if "/blob/" in url and url.endswith(".ipynb"):
+                # Handle Jupyter notebooks
+                notebook_docs = load_github_notebook(url)
+                docs.extend(notebook_docs)
+            elif "raw.githubusercontent.com" in url:
+                # Handle raw GitHub content directly
+                try:
+                    response = requests.get(url)
+                    response.raise_for_status()
+                    content = response.text
+                    docs.append(Document(
+                        page_content=content,
+                        metadata={'source': url, 'type': 'github_raw'}
+                    ))
+                except Exception as e:
+                    print(f"Error loading raw content from {url}: {str(e)}")
+            else:
+                # Handle other GitHub content
+                loader = GitHubLoader([url])
+                docs.extend(loader.load())

+    # Load ReadTheDocs content
+    rtd_loader = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
+    docs.extend(rtd_loader.load())

     return docs

 def extract_reference(url):
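For orientation, a minimal usage sketch of the loaders this commit introduces follows. It assumes rag.py is importable as a module named `rag` and that urls.txt sits in the working directory (as load_docs() expects); neither assumption is part of the diff itself.

# Hypothetical usage sketch; the module name `rag` and the working-directory
# urls.txt file are assumptions, not part of this commit.
from rag import GitHubLoader, ReadTheDocsLoader, load_docs

# Scrape a single GitHub page with the custom loader (tree URLs go through the API).
gh_docs = GitHubLoader(["https://github.com/CosmoStatGW/gwfast"]).load()

# Crawl a ReadTheDocs site starting from its base URL.
rtd_docs = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest").load()

# Or load everything listed in urls.txt plus the ReadTheDocs site in one call.
all_docs = load_docs()
print(f"Loaded {len(all_docs)} documents")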
urls.txt CHANGED

@@ -1,10 +1,10 @@
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/gwfast_tutorial.ipynb
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/new_features_tutorial.ipynb
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/Stochastic_tutorial.ipynb
-https://…
-https://…
-https://…
-https://…
-https://…
-https://…
-https://…
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/fisherTools.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/gwfastGlobals.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/gwfastUtils.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/network.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/signal.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/waveforms.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/stochastic/stochasticTools.py
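The rewritten entries point directly at raw.githubusercontent.com, so load_docs() can fetch them with a plain requests.get() instead of scraping GitHub's HTML view. The body of github_to_raw() is not shown in this diff; a minimal sketch of that kind of blob-to-raw conversion, consistent with its docstring but not necessarily the project's implementation, could look like:

# Hypothetical sketch of a blob-URL to raw-URL conversion; the actual
# github_to_raw() implementation is not shown in this commit.
def blob_to_raw(url: str) -> str:
    """Convert a github.com blob URL to its raw.githubusercontent.com equivalent."""
    return (url.replace("https://github.com/", "https://raw.githubusercontent.com/")
               .replace("/blob/", "/"))

# blob_to_raw("https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/network.py")
# -> "https://raw.githubusercontent.com/CosmoStatGW/gwfast/master/gwfast/network.py"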