Spaces:

DataPrism
/

GPT-auto-webscraping

Runtime error

App Files Community

GianJSX committed on Jun 15, 2023

Commit

3505899

•

1 Parent(s): ce0b952

Upload 7 files

Browse files

Files changed (7) hide show

.gitignore +2 -0
AssistantService.py +22 -0
ExcecuteFunction.py +9 -0
LICENSE +21 -0
app.py +60 -0
config.ini.example +2 -0
requirements.txt +73 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ config.ini
2	+ __pycache__/

AssistantService.py ADDED Viewed

	@@ -0,0 +1,22 @@

"""Service layer that wires a ChatOpenAI model into the project's LLM chains."""
import os

from langchain.chat_models import ChatOpenAI

from chains.code_generator.base import chain_code_generator
from chains.output_format.base import chain_output_format


class GPTAssistant:
    """Hold one chat model and expose the two chains the app runs on it."""

    def __init__(self, api_key: str):
        """Create the chat model used by both chains.

        Args:
            api_key: OpenAI API key for the requests made through this
                instance.
        """
        # Give the key to the model directly instead of exporting it through
        # os.environ: the environment is shared by the whole process, so a key
        # written there would remain visible to every later session served by
        # the same interpreter.
        self.llm = ChatOpenAI(
            temperature=0,
            model_name='gpt-3.5-turbo',
            request_timeout=120,
            client=None,
            openai_api_key=api_key,
        )

    def chain_response_format(self, html_content):
        """Run the output-format chain on an HTML snippet.

        Args:
            html_content: HTML passed to the chain as ``html_content``.

        Returns:
            Whatever the chain's ``run`` call returns (the app displays it as
            JSON -- presumably a string; confirm against the chain definition).
        """
        output_format_chain = chain_output_format(self.llm)
        return output_format_chain.run(html_content=html_content)

    def chain_code_generator(self, output_format, html_content):
        """Run the code-generator chain for a given output format and HTML.

        Args:
            output_format: Passed to the chain as ``output_format``.
            html_content: HTML passed to the chain as ``html_content``.

        Returns:
            Whatever the chain's ``run`` call returns (the app treats it as
            Python source text).
        """
        # Inside a method body this name resolves to the imported module-level
        # factory, not to this method.
        script_chain = chain_code_generator(self.llm)
        return script_chain.run(output_format=output_format, html_content=html_content)

ExcecuteFunction.py ADDED Viewed

	@@ -0,0 +1,9 @@

import importlib
from typing import Any, Callable


def execute_function(
    module_name: str = "output",
    function_name: str = "extract_info",
) -> Callable[..., Any]:
    """Import a module by name and return one of its attributes.

    Called with no arguments it behaves as before: it imports ``output`` and
    returns its ``extract_info`` attribute.

    Args:
        module_name: Dotted name of the module to import.
        function_name: Name of the attribute to fetch from that module.

    Returns:
        The attribute found on the imported module; it is returned, not called.

    Raises:
        ModuleNotFoundError: If the module cannot be imported.
        AttributeError: If the module has no attribute ``function_name``.
    """
    # A module file created after the interpreter started may be missed by the
    # import system's cached directory listings; refresh them before importing.
    importlib.invalidate_caches()
    module = importlib.import_module(module_name)
    function = getattr(module, function_name)
    print("returning function")
    return function

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Tomas Bourgeois
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py ADDED Viewed

	@@ -0,0 +1,60 @@

"""Streamlit page for the GPT web-scraping assistant.

The page has three steps: derive an output format from an HTML snippet,
generate a Python ``extract_info`` function for that format, and run the
generated function against a complete HTML document.
"""
import configparser

import streamlit as st
from openai.error import AuthenticationError

from AssistantService import GPTAssistant

# Optional API key from config.ini. ConfigParser.read() ignores a missing
# file and the DEFAULT section always exists, so this is '' when no key is
# configured.
config = configparser.ConfigParser()
config.read('config.ini')
assistant_api_key = config['DEFAULT'].get('API-KEY', '')

st.title("Web Scraping Assistant")
st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you.")

if assistant_api_key == '':
    # Masked input so the key is not displayed in clear text; strip stray
    # whitespace picked up when pasting.
    assistant_api_key = st.text_input("Paste your API key here:", type="password").strip()

# None means "no key supplied yet"; the button handlers check for it
# explicitly instead of relying on a NameError from an undefined variable.
gpt_assistant = GPTAssistant(assistant_api_key) if assistant_api_key else None

# Step 1: infer the output format from an HTML snippet.
html_content = st.text_input("Paste your piece of HTML here:")
if html_content and st.button("Extract data format"):
    if gpt_assistant is None:
        st.write("Complete the API key field")
    else:
        try:
            st.session_state['output_format'] = gpt_assistant.chain_response_format(html_content)
        except AuthenticationError:
            st.write("Invalid API key")

# Step 2: generate the extraction code for that format.
if 'output_format' in st.session_state:
    st.code(st.session_state['output_format'], language="json")
    if st.button("Generate the code"):
        if gpt_assistant is None:
            st.write("Complete the API key field")
        else:
            try:
                python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
            except AuthenticationError:
                st.write("Invalid API key")
            else:
                st.session_state['code_generated'] = python_code
                # Executable variant: the generated source plus a call that
                # stores the function's return value in ``result``.
                st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"

# Step 3: run the generated code against a complete HTML document.
if 'code_generated' in st.session_state:
    st.write("Here is your python function:")
    st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    if full_content and st.button("Test the code"):
        # SECURITY: this executes model-generated Python with the full
        # privileges of the app process. It runs in its own namespace so it
        # cannot overwrite this script's variables, but that is NOT a sandbox.
        namespace = {'html_data': full_content}
        try:
            exec(st.session_state['code_generated_exec'], namespace)
        except Exception as exc:
            # Generated code can fail in arbitrary ways; report the failure
            # on the page instead of crashing the script run.
            st.write("error extracting data")
            st.exception(exc)
        else:
            result = namespace.get('result')
            if result:
                st.write("data extracted successfully")
                # show data in table
                st.table(result)
            else:
                st.write("error extracting data")

config.ini.example ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [DEFAULT]
2	+ API-KEY=OpenAI API KEY HERE

requirements.txt ADDED Viewed

	@@ -0,0 +1,73 @@

+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+async-timeout==4.0.2
+attrs==23.1.0
+beautifulsoup4==4.12.2
+blinker==1.6.2
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+colorama==0.4.6
+dataclasses-json==0.5.7
+decorator==5.1.1
+entrypoints==0.4
+frozenlist==1.3.3
+gitdb==4.0.10
+GitPython==3.1.31
+greenlet==2.0.2
+idna==3.4
+importlib-metadata==6.6.0
+Jinja2==3.1.2
+jsonschema==4.17.3
+langchain==0.0.167
+lxml==4.9.2
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+mdurl==0.1.2
+MechanicalSoup==1.2.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+numexpr==2.8.4
+numpy==1.24.3
+openai==0.27.6
+openapi-schema-pydantic==1.2.4
+packaging==23.1
+pandas==2.0.1
+Pillow==9.5.0
+protobuf==3.20.3
+pyarrow==12.0.0
+pydantic==1.10.7
+pydeck==0.8.1b0
+Pygments==2.15.1
+Pympler==1.0.1
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.30.0
+rich==13.3.5
+six==1.16.0
+smmap==5.0.0
+soupsieve==2.4.1
+SQLAlchemy==2.0.13
+streamlit==1.22.0
+streamlit-ace==0.1.1
+tenacity==8.2.2
+toml==0.10.2
+toolz==0.12.0
+tornado==6.3.1
+tqdm==4.65.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+urllib3==2.0.2
+validators==0.20.0
+watchdog==3.0.0
+yarl==1.9.2
+zipp==3.15.0