web-sge-agent / tools /fetch_page.py
watanabe3tipapa's picture
Upload 4 files
6b7ef92
import requests
import html2text
from readability import Document
from langchain.agents import Tool
from urllib.parse import urlparse, parse_qs, urlunparse
from langchain.text_splitter import RecursiveCharacterTextSplitter
def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
    """Tool to fetch the content of a web page from a given URL.

    - This returns `title`, `content`, and `has_next` indicator. `content` is returned in markdown format.
    - The page text is split into chunks of up to 1,000 tokens and one chunk is returned per call.
    - If there is more content available on the page, the `has_next` value will be True.
    - To read the continuation, you can increment the `page` parameter with the same URL and input them again.

    Parameters
    ----------
    url : str
        Address of the page. An optional `page` query parameter (1-based)
        selects which chunk to return; it is removed before the request is
        sent, while every other query parameter is forwarded unchanged.
    model_name : str
        Model name handed to the tiktoken-based splitter to count tokens.
    timeout_sec : int or float
        Timeout passed to `requests.get`.

    Returns
    -------
    Dict[str, Any]:
    - status: int (200 on success, the HTTP status code or 500 on failure)
    - page_content
        - title: str
        - content: str
        - has_next: bool
      or, on failure,
        - error_message: str
    """
    invalid_page_response = {
        "status": 500,
        "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
    }

    # page parameter (1-based in the URL, 0-based internally)
    parsed_url = urlparse(url)
    raw_page = parse_qs(parsed_url.query).get("page", ["1"])[0]
    try:
        page = int(raw_page) - 1
    except ValueError:
        # Non-numeric value such as `?page=abc`.
        return invalid_page_response
    if page < 0:
        # A negative index would silently return a chunk counted from the end.
        return invalid_page_response

    # Drop only the `page` pair. The remaining pairs are kept as raw text so
    # their original percent-encoding reaches the server untouched.
    remaining_query = "&".join(
        pair
        for pair in parsed_url.query.split("&")
        if pair and pair.split("=", 1)[0] != "page"
    )
    url = urlunparse(
        (
            parsed_url.scheme,
            parsed_url.netloc,
            parsed_url.path,
            parsed_url.params,
            remaining_query,
            "",
        )
    )

    try:
        response = requests.get(url, timeout=timeout_sec)
    except requests.exceptions.Timeout:
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
        }
    except requests.exceptions.RequestException:
        # Connection, DNS, SSL, invalid URL, ...: report it like any other
        # download failure instead of crashing the calling agent.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    # Decode the body as UTF-8 regardless of the charset the server declared.
    # NOTE(review): pages served in another encoding will be garbled - confirm
    # whether this override is intentional.
    response.encoding = 'utf-8'

    if response.status_code != 200:
        return {
            "status": response.status_code,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    try:
        doc = Document(response.text)
        title = doc.title()
        html_content = doc.summary()
        content = html2text.html2text(html_content)
    except Exception:
        # Any parser failure is reported to the caller; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except`.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
        }

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=1000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(content)

    # Also covers a page with no extractable text (no chunks at all).
    if page >= len(chunks):
        return invalid_page_response

    return {
        "status": 200,
        "page_content": {
            "title": title,
            "content": chunks[page],
            "has_next": page < len(chunks) - 1
        }
    }
def get_fetch_page_tool():
    """Build the LangChain `Tool` that exposes `fetch_page` to an agent.

    The description is the text the agent reads to decide how to call the
    tool. The token count it states must match the chunk size used by
    `fetch_page` (1,000 tokens per page).

    Returns
    -------
    Tool
        A tool named ``fetch_page`` whose callable is `fetch_page`.
    """
    # The lines of this literal are runtime text sent to the agent and are
    # kept flush-left so no extra indentation ends up in the prompt.
    fetch_page_tool_description = """
Tool to fetch the content of a web page from a given URL.
This returns `status` and `page_content` (`title`, `content` and `has_next` indicator).
If status is not 200, there was some error of fetching page. (Try fetch other pages.)
If a status code other than 200 is returned, please don't give up and make sure to check other pages.
By default, only up to 1,000 tokens of content are retrieved. If there is more content available on the page, the `has_next` value will be True.
To read the continuation, you can increment the `page` parameter with the same URL and input them again. (paging is start with 1, so next page is 2)
e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
"""
    return Tool(
        name='fetch_page',
        func=fetch_page,
        description=fetch_page_tool_description
    )