GianJSX committed on
Commit
3505899
1 Parent(s): ce0b952

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitignore +2 -0
  2. AssistantService.py +22 -0
  3. ExcecuteFunction.py +9 -0
  4. LICENSE +21 -0
  5. app.py +60 -0
  6. config.ini.example +2 -0
  7. requirements.txt +73 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ config.ini
2
+ __pycache__/
AssistantService.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chat_models import ChatOpenAI
2
+ from chains.output_format.base import chain_output_format
3
+ from chains.code_generator.base import chain_code_generator
4
+ import os
5
+
6
class GPTAssistant:
    """Wrapper around a ChatOpenAI model that runs the two project chains.

    One chain is built by ``chain_output_format`` and the other by
    ``chain_code_generator``; both receive the same ``self.llm`` instance.
    """

    def __init__(self, api_key: str):
        """Store the key in the process environment and create the model.

        Args:
            api_key: Value written to the ``OPENAI_API_KEY`` environment
                variable before the ChatOpenAI client is constructed.
        """
        os.environ['OPENAI_API_KEY'] = api_key
        llm_settings = {
            'temperature': 0,
            'model_name': 'gpt-3.5-turbo',
            'request_timeout': 120,
            'client': None,
        }
        self.llm = ChatOpenAI(**llm_settings)

    def chain_response_format(self, html_content):
        """Run the output-format chain on ``html_content``.

        Returns whatever the chain's ``run`` call returns.
        """
        return chain_output_format(self.llm).run(html_content=html_content)

    def chain_code_generator(self, output_format, html_content):
        """Run the code-generator chain.

        Args:
            output_format: Forwarded to the chain as ``output_format``.
            html_content: Forwarded to the chain as ``html_content``.

        Returns whatever the chain's ``run`` call returns.
        """
        # Inside the method body this name resolves to the module-level
        # factory function, not to this method.
        return chain_code_generator(self.llm).run(
            output_format=output_format,
            html_content=html_content,
        )
ExcecuteFunction.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+
3
def execute_function(module: str = "output", function: str = "extract_info"):
    """Import a module by name and return one of its attributes.

    Args:
        module: Importable module name. Defaults to ``"output"``.
        function: Name of the attribute to fetch from that module.
            Defaults to ``"extract_info"``.

    Returns:
        The attribute object looked up on the imported module.

    Raises:
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute named ``function``.
    """
    # A module file written after the interpreter started may be invisible
    # to the path finders until their caches are invalidated; the call is
    # harmless when the module already existed.
    importlib.invalidate_caches()
    loaded_module = importlib.import_module(module)
    target = getattr(loaded_module, function)
    print("returning function")
    return target
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tomas Bourgeois
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from AssistantService import GPTAssistant
2
+ from openai.error import AuthenticationError
3
+ import streamlit as st
4
+ import configparser
5
+
6
# Streamlit script: asks for an HTML snippet, has the assistant propose an
# output format and a Python extractor, then lets the user try the generated
# extractor on a full HTML document.

# config.read() silently skips a missing file, and the DEFAULT section always
# exists on a ConfigParser, so this yields '' when no key is configured.
config = configparser.ConfigParser()
config.read('config.ini')
assistant_api_key = config['DEFAULT'].get('API-KEY', '')

st.title("Web Scraping Assistant")
st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you.")

# Ask the user for a key only when config.ini did not provide one.
if assistant_api_key == '':
    assistant_api_key = st.text_input("Paste your API key here:")

# None means "no key available yet"; the buttons below check for it explicitly
# instead of relying on a NameError from an undefined variable.
gpt_assistant = GPTAssistant(assistant_api_key) if assistant_api_key else None

html_content = st.text_input("Paste your piece of HTML here:")

if html_content:
    if st.button("Extract data format"):
        if gpt_assistant is None:
            st.write("Complete the API key field")
        else:
            try:
                st.session_state['output_format'] = gpt_assistant.chain_response_format(html_content)
            except AuthenticationError:
                st.write("Invalid API key")

    if 'output_format' in st.session_state:
        st.code(st.session_state['output_format'], language="json")

        if st.button("Generate the code"):
            if gpt_assistant is None:
                st.write("Complete the API key field")
            else:
                try:
                    python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
                except AuthenticationError:
                    st.write("Invalid API key")
                else:
                    st.session_state['code_generated'] = python_code
                    # The trailing call stores the extractor's return value
                    # under the name `result` when the code is executed.
                    st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"


if 'code_generated' in st.session_state:
    st.write("Here is your python function:")
    st.code(st.session_state['code_generated'], language="python")
    full_content = st.text_input("Paste your complete HTML here:")
    if full_content and st.button("Test the code"):
        # SECURITY: this executes model-generated code with the privileges of
        # the app process. It runs in its own namespace so it cannot overwrite
        # this script's globals, but that is NOT a sandbox.
        exec_namespace = {'html_data': full_content}
        try:
            exec(st.session_state['code_generated_exec'], exec_namespace)
        except Exception as exc:  # UI boundary: generated code may fail in any way
            result = None
            st.exception(exc)
        else:
            result = exec_namespace.get('result')

        # Use a length check where possible: objects such as DataFrames raise
        # on plain truth-testing.
        if result is None:
            has_data = False
        elif hasattr(result, '__len__'):
            has_data = len(result) > 0
        else:
            has_data = bool(result)

        if has_data:
            st.write("data extracted successfully")
            # show data in table
            st.table(result)
        else:
            st.write("error extracting data")
config.ini.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [DEFAULT]
2
+ API-KEY=OpenAI API KEY HERE
requirements.txt ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.4
2
+ aiosignal==1.3.1
3
+ altair==4.2.2
4
+ async-timeout==4.0.2
5
+ attrs==23.1.0
6
+ beautifulsoup4==4.12.2
7
+ blinker==1.6.2
8
+ cachetools==5.3.0
9
+ certifi==2023.5.7
10
+ charset-normalizer==3.1.0
11
+ click==8.1.3
12
+ colorama==0.4.6
13
+ dataclasses-json==0.5.7
14
+ decorator==5.1.1
15
+ entrypoints==0.4
16
+ frozenlist==1.3.3
17
+ gitdb==4.0.10
18
+ GitPython==3.1.31
19
+ greenlet==2.0.2
20
+ idna==3.4
21
+ importlib-metadata==6.6.0
22
+ Jinja2==3.1.2
23
+ jsonschema==4.17.3
24
+ langchain==0.0.167
25
+ lxml==4.9.2
26
+ markdown-it-py==2.2.0
27
+ MarkupSafe==2.1.2
28
+ marshmallow==3.19.0
29
+ marshmallow-enum==1.5.1
30
+ mdurl==0.1.2
31
+ MechanicalSoup==1.2.0
32
+ multidict==6.0.4
33
+ mypy-extensions==1.0.0
34
+ numexpr==2.8.4
35
+ numpy==1.24.3
36
+ openai==0.27.6
37
+ openapi-schema-pydantic==1.2.4
38
+ packaging==23.1
39
+ pandas==2.0.1
40
+ Pillow==9.5.0
41
+ protobuf==3.20.3
42
+ pyarrow==12.0.0
43
+ pydantic==1.10.7
44
+ pydeck==0.8.1b0
45
+ Pygments==2.15.1
46
+ Pympler==1.0.1
47
+ pyrsistent==0.19.3
48
+ python-dateutil==2.8.2
49
+ pytz==2023.3
50
+ pytz-deprecation-shim==0.1.0.post0
51
+ PyYAML==6.0
52
+ requests==2.30.0
53
+ rich==13.3.5
54
+ six==1.16.0
55
+ smmap==5.0.0
56
+ soupsieve==2.4.1
57
+ SQLAlchemy==2.0.13
58
+ streamlit==1.22.0
59
+ streamlit-ace==0.1.1
60
+ tenacity==8.2.2
61
+ toml==0.10.2
62
+ toolz==0.12.0
63
+ tornado==6.3.1
64
+ tqdm==4.65.0
65
+ typing-inspect==0.8.0
66
+ typing_extensions==4.5.0
67
+ tzdata==2023.3
68
+ tzlocal==4.3
69
+ urllib3==2.0.2
70
+ validators==0.20.0
71
+ watchdog==3.0.0
72
+ yarl==1.9.2
73
+ zipp==3.15.0