PabloVD committed on
Commit
35d69cc
1 Parent(s): 3ad9a49

Revert to reading the URLs from a local file, since requesting URLs from within HuggingFace Spaces does not work properly

Browse files
Files changed (2) hide show
  1. app.py +4 -27
  2. urls.txt +42 -0
app.py CHANGED
@@ -9,11 +9,8 @@ from langchain_core.runnables import RunnablePassthrough
9
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain_mistralai import ChatMistralAI
12
- import requests
13
  from langchain_community.document_loaders import WebBaseLoader
14
- import bs4
15
  from langchain_core.rate_limiters import InMemoryRateLimiter
16
- from urllib.parse import urljoin
17
 
18
  # Define a limiter to avoid rate limit issues with MistralAI
19
  rate_limiter = InMemoryRateLimiter(
@@ -22,31 +19,11 @@ rate_limiter = InMemoryRateLimiter(
22
  max_bucket_size=10, # Controls the maximum burst size.
23
  )
24
 
25
- # Function to get all the subpages from a base url
26
- def get_subpages(base_url):
27
- visited_urls = []
28
- urls_to_visit = [base_url]
29
-
30
- while urls_to_visit:
31
- url = urls_to_visit.pop(0)
32
- if url in visited_urls:
33
- continue
34
-
35
- visited_urls.append(url)
36
- response = requests.get(url)
37
- soup = bs4.BeautifulSoup(response.content, "html.parser")
38
-
39
- for link in soup.find_all("a", href=True):
40
- full_url = urljoin(base_url, link['href'])
41
- if base_url in full_url and full_url.endswith(".html") and full_url not in visited_urls:
42
- urls_to_visit.append(full_url)
43
- visited_urls = visited_urls[1:]
44
-
45
- return visited_urls
46
-
47
  # Get urls
48
- base_url = "https://camels.readthedocs.io/en/latest/"
49
- urls = get_subpages(base_url)
 
 
50
 
51
  # Load, chunk and index the contents of the blog.
52
  loader = WebBaseLoader(urls)
 
9
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain_mistralai import ChatMistralAI
 
12
  from langchain_community.document_loaders import WebBaseLoader
 
13
  from langchain_core.rate_limiters import InMemoryRateLimiter
 
14
 
15
  # Define a limiter to avoid rate limit issues with MistralAI
16
  rate_limiter = InMemoryRateLimiter(
 
19
  max_bucket_size=10, # Controls the maximum burst size.
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # Get urls
23
+ urlsfile = open("urls.txt")
24
+ urls = urlsfile.readlines()
25
+ urls = [url.replace("\n","") for url in urls]
26
+ urlsfile.close()
27
 
28
  # Load, chunk and index the contents of the blog.
29
  loader = WebBaseLoader(urls)
urls.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://camels.readthedocs.io/en/latest/
2
+ https://camels.readthedocs.io/en/latest/news.html
3
+ https://camels.readthedocs.io/en/latest/goals.html
4
+ https://camels.readthedocs.io/en/latest/publications.html
5
+ https://camels.readthedocs.io/en/latest/data_access.html
6
+ https://camels.readthedocs.io/en/latest/citation.html
7
+ https://camels.readthedocs.io/en/latest/description.html
8
+ https://camels.readthedocs.io/en/latest/suites_sets.html
9
+ https://camels.readthedocs.io/en/latest/codes.html
10
+ https://camels.readthedocs.io/en/latest/parameters.html
11
+ https://camels.readthedocs.io/en/latest/organization.html
12
+ https://camels.readthedocs.io/en/latest/snapshots.html
13
+ https://camels.readthedocs.io/en/latest/subfind.html
14
+ https://camels.readthedocs.io/en/latest/SubLink.html
15
+ https://camels.readthedocs.io/en/latest/rockstar.html
16
+ https://camels.readthedocs.io/en/latest/ahf.html
17
+ https://camels.readthedocs.io/en/latest/caesar.html
18
+ https://camels.readthedocs.io/en/latest/Pk.html
19
+ https://camels.readthedocs.io/en/latest/Bk.html
20
+ https://camels.readthedocs.io/en/latest/pdf.html
21
+ https://camels.readthedocs.io/en/latest/VIDE.html
22
+ https://camels.readthedocs.io/en/latest/Lya.html
23
+ https://camels.readthedocs.io/en/latest/Xrays.html
24
+ https://camels.readthedocs.io/en/latest/Profiles.html
25
+ https://camels.readthedocs.io/en/latest/CMD.html
26
+ https://camels.readthedocs.io/en/latest/SAM.html
27
+ https://camels.readthedocs.io/en/latest/zoomGZ.html
28
+ https://camels.readthedocs.io/en/latest/tutorials.html
29
+ https://camels.readthedocs.io/en/latest/images.html
30
+ https://camels.readthedocs.io/en/latest/camels_library.html
31
+ https://camels.readthedocs.io/en/latest/pylians3.html
32
+ https://camels.readthedocs.io/en/latest/team.html
33
+ https://camels.readthedocs.io/en/latest/contact.html
34
+ https://camels.readthedocs.io/en/latest/logo.html
35
+ https://camels.readthedocs.io/en/latest/examples/Reading_Manipulating_Snapshots.html
36
+ https://camels.readthedocs.io/en/latest/examples/Pk.html
37
+ https://camels.readthedocs.io/en/latest/examples/Images.html
38
+ https://camels.readthedocs.io/en/latest/examples/particles_subhalos.html
39
+ https://camels.readthedocs.io/en/latest/index.html
40
+ https://camels.readthedocs.io/en/latest/Images.html
41
+ https://camels.readthedocs.io/en/latest/particles_subhalos.html
42
+ https://camels.readthedocs.io/en/latest/Reading_Manipulating_Snapshots.html