Spaces:

auto-academic
/

auto-draft

Running

App Files Files Community

shaocongma committed on May 7, 2023

Commit

8ef9348

•

1 Parent(s): 3afc671

Fix some TeX compiler errors.

Browse files

Files changed (4) hide show

.idea/.gitignore +2 -0
app.py +32 -26
auto_backgrounds.py +1 -1
utils/references.py +29 -15

.idea/.gitignore CHANGED Viewed

@@ -6,3 +6,5 @@
 /dataSources.local.xml
 # Editor-based HTTP Client requests
 /httpRequests/

 /dataSources.local.xml
 # Editor-based HTTP Client requests
 /httpRequests/
+**/__pycache__
+**/.idea

app.py CHANGED Viewed

@@ -6,18 +6,18 @@ from utils.file_operations import hash_name
 # note: App白屏bug：允许第三方cookie
 # todo:
-#   4. add auto_polishing function
-#   5. Use some simple method for simple tasks (including: writing abstract, conclusion, generate keywords, generate figures...)
 #       5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
 #       5.2 Use local LLM to generate keywords, figures, ...
 #       5.3 Use embedding to find most related papers (find a paper dataset)
 #   6. get logs when the procedure is not completed.
 #   7. 自己的文件库； 更多的prompts
-#   8. Change prompts to langchain
-#   9. some references include &: journal={IEEE Power & Energy Society General Meeting}. Check them when generating it.
-#   10. some paper ids have : or - in the first word of title; remove them when generating paper id.
 #   11. distinguish citep and citet
-#   12. Change link to more appealing color
 openai_key = os.getenv("OPENAI_API_KEY")
 access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
@@ -40,14 +40,13 @@ else:
         IS_OPENAI_API_KEY_AVAILABLE = False
 def clear_inputs(text1, text2):
     return "", ""
-def wrapped_generator(title, description, openai_key = None,
-                      template = "ICLR2022",
-                      cache_mode = IS_CACHE_AVAILABLE, generator=None):
     # if `cache_mode` is True, then follow the following steps:
     #        check if "title"+"description" have been generated before
     #        if so, download from the cloud storage, return it
@@ -57,15 +56,16 @@ def wrapped_generator(title, description, openai_key = None,
         # generator = generate_backgrounds
         generator = generate_draft
         # generator = fake_generator
-    if openai_key is not None:
-        openai.api_key = openai_key
         openai.Model.list()
     if cache_mode:
         from utils.storage import list_all_files, download_file, upload_file
         # check if "title"+"description" have been generated before
-        input_dict = {"title": title, "description": description, "generator": "generate_draft"} #todo: modify here also
         file_name = hash_name(input_dict) + ".zip"
         file_list = list_all_files()
         # print(f"{file_name} will be generated. Check the file list {file_list}")
@@ -75,21 +75,23 @@ def wrapped_generator(title, description, openai_key = None,
             return file_name
         else:
             # generate the result.
-            # output = fake_generate_backgrounds(title, description, openai_key) # todo: use `generator` to control which function to use.
-            output = generator(title, description,  template, "gpt-4")
             upload_file(output)
             return output
     else:
         # output = fake_generate_backgrounds(title, description, openai_key)
-        output = generator(title, description,  template, "gpt-4")
         return output
-theme = gr.themes.Monochrome(font=gr.themes.GoogleFont("Questrial")).set(
-    background_fill_primary='#E5E4E2',
-    background_fill_secondary = '#F6F6F6',
-    button_primary_background_fill="#281A39"
-)
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
@@ -107,16 +109,20 @@ with gr.Blocks(theme=theme) as demo:
     ''')
     with gr.Row():
         with gr.Column(scale=2):
-            key =  gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key", visible=not IS_OPENAI_API_KEY_AVAILABLE)
-            # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结", label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
-            title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1, label="Title", info="论文标题")
             description = gr.Textbox(lines=5, label="Description (Optional)", visible=False)
             with gr.Row():
                 clear_button = gr.Button("Clear")
-                submit_button = gr.Button("Submit")
         with gr.Column(scale=1):
-            style_mapping = {True: "color:white;background-color:green", False: "color:white;background-color:red"} #todo: to match website's style
             availability_mapping = {True: "AVAILABLE", False: "NOT AVAILABLE"}
             gr.Markdown(f'''## Huggingface Space Status
              当`OpenAI API`显示AVAILABLE的时候这个Space可以直接使用.

 # note: App白屏bug：允许第三方cookie
 # todo:
+#   5. Use some simple method for simple tasks
+#   (including: writing abstract, conclusion, generate keywords, generate figures...)
 #       5.1 Use GPT 3.5 for abstract, conclusion, ... (or may not)
 #       5.2 Use local LLM to generate keywords, figures, ...
 #       5.3 Use embedding to find most related papers (find a paper dataset)
 #   6. get logs when the procedure is not completed.
 #   7. 自己的文件库； 更多的prompts
 #   11. distinguish citep and citet
+# future:
+#   8. Change prompts to langchain
+#   4. add auto_polishing function
+#   12. Change link to more appealing color # after the website is built;
 openai_key = os.getenv("OPENAI_API_KEY")
 access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
         IS_OPENAI_API_KEY_AVAILABLE = False
 def clear_inputs(text1, text2):
     return "", ""
+def wrapped_generator(paper_title, paper_description, openai_api_key=None,
+                      template="ICLR2022",
+                      cache_mode=IS_CACHE_AVAILABLE, generator=None):
     # if `cache_mode` is True, then follow the following steps:
     #        check if "title"+"description" have been generated before
     #        if so, download from the cloud storage, return it
         # generator = generate_backgrounds
         generator = generate_draft
         # generator = fake_generator
+    if openai_api_key is not None:
+        openai.api_key = openai_api_key
         openai.Model.list()
     if cache_mode:
         from utils.storage import list_all_files, download_file, upload_file
         # check if "title"+"description" have been generated before
+        input_dict = {"title": paper_title, "description": paper_description,
+                      "generator": "generate_draft"}  # todo: modify here also
         file_name = hash_name(input_dict) + ".zip"
         file_list = list_all_files()
         # print(f"{file_name} will be generated. Check the file list {file_list}")
             return file_name
         else:
             # generate the result.
+            # output = fake_generate_backgrounds(title, description, openai_key)
+            # todo: use `generator` to control which function to use.
+            output = generator(paper_title, paper_description, template, "gpt-4")
             upload_file(output)
             return output
     else:
         # output = fake_generate_backgrounds(title, description, openai_key)
+        output = generator(paper_title, paper_description, template, "gpt-4")
         return output
+theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
+# .set(
+#     background_fill_primary='#E5E4E2',
+#     background_fill_secondary = '#F6F6F6',
+#     button_primary_background_fill="#281A39"
+# )
 with gr.Blocks(theme=theme) as demo:
     gr.Markdown('''
     ''')
     with gr.Row():
         with gr.Column(scale=2):
+            key = gr.Textbox(value=openai_key, lines=1, max_lines=1, label="OpenAI Key",
+                             visible=not IS_OPENAI_API_KEY_AVAILABLE)
+            # generator = gr.Dropdown(choices=["学术论文", "文献总结"], value="文献总结",
+            # label="Selection", info="目前支持生成'学术论文'和'文献总结'.", interactive=True)
+            title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
+                               label="Title", info="论文标题")
             description = gr.Textbox(lines=5, label="Description (Optional)", visible=False)
             with gr.Row():
                 clear_button = gr.Button("Clear")
+                submit_button = gr.Button("Submit", variant="primary")
         with gr.Column(scale=1):
+            style_mapping = {True: "color:white;background-color:green",
+                             False: "color:white;background-color:red"}  # todo: to match website's style
             availability_mapping = {True: "AVAILABLE", False: "NOT AVAILABLE"}
             gr.Markdown(f'''## Huggingface Space Status
              当`OpenAI API`显示AVAILABLE的时候这个Space可以直接使用.

auto_backgrounds.py CHANGED Viewed

@@ -91,7 +91,7 @@ def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
     return make_archive("sample-output.pdf", filename)
-def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=12):
     paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
     # todo: `list_of_methods` failed to be generated; find a solution ...

     return make_archive("sample-output.pdf", filename)
+def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=14):
     paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
     # todo: `list_of_methods` failed to be generated; find a solution ...

utils/references.py CHANGED Viewed

@@ -8,6 +8,7 @@
 import requests
 import re
 #########################################################
 # Some basic tools
 #########################################################
@@ -18,6 +19,7 @@ def remove_newlines(serie):
     serie = serie.replace('  ', ' ')
     return serie
 #########################################################
 # Semantic Scholar (SS) API
 #########################################################
@@ -35,10 +37,10 @@ def ss_search(keywords, limit=20, fields=None):
     return response.json()
 def _collect_papers_ss(keyword, counts=3, tldr=False):
     def externalIds2link(externalIds):
-        # externalIds is similar to "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
         if externalIds:
             # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
             # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
@@ -58,7 +60,10 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
             return ""
     def extract_paper_id(last_name, year_str, title):
-        return last_name + year_str + title.split(' ', 1)[0]
     def extract_author_info(raw_authors):
         authors = [author['name'] for author in raw_authors]
@@ -67,17 +72,18 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
         last_name = authors[0].split()[-1]
         return authors_str, last_name
-    def parse_search_results(search_results):
         # turn the search result to a list of paper dictionary.
         papers = []
-        for raw_paper in search_results:
             if raw_paper["abstract"] is None:
                 continue
             authors_str, last_name = extract_author_info(raw_paper['authors'])
             year_str = str(raw_paper['year'])
             title = raw_paper['title']
-            journal = raw_paper['venue']
             if not journal:
                 journal = "arXiv preprint"
             paper_id = extract_paper_id(last_name, year_str, title).lower()
@@ -97,6 +103,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
             }
             papers.append(result)
         return papers
     raw_results = ss_search(keyword, limit=counts)
     if raw_results is not None:
         search_results = raw_results['data']
@@ -105,6 +112,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
     results = parse_search_results(search_results)
     return results
 #########################################################
 # ArXiv API
 #########################################################
@@ -174,9 +182,14 @@ def _collect_papers_arxiv(keyword, counts=3, tldr=False):
     results = parse_results(content)
     return results
 # Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
 class References:
-    def __init__(self, load_papers = ""):
         if load_papers:
             # todo: read a json file from the given path
             #       this could be used to support pre-defined references
@@ -192,7 +205,7 @@ class References:
         """
         match method:
             case "arxiv":
-                process =_collect_papers_arxiv
             case "ss":
                 process = _collect_papers_ss
             case _:
@@ -246,16 +259,17 @@ class References:
             prompts[paper["paper_id"]] = paper["abstract"]
         return prompts
 if __name__ == "__main__":
     refs = References()
     keywords_dict = {
-  "Deep Q-Networks": 15,
-  "Policy Gradient Methods": 24,
-  "Actor-Critic Algorithms": 4,
-  "Model-Based Reinforcement Learning": 13,
-  "Exploration-Exploitation Trade-off": 7
-}
     refs.collect_papers(keywords_dict, method="ss", tldr=True)
     for p in refs.papers:
         print(p["paper_id"])
-    print(len(refs.papers))

 import requests
 import re
 #########################################################
 # Some basic tools
 #########################################################
     serie = serie.replace('  ', ' ')
     return serie
 #########################################################
 # Semantic Scholar (SS) API
 #########################################################
     return response.json()
 def _collect_papers_ss(keyword, counts=3, tldr=False):
     def externalIds2link(externalIds):
+        # Sample externalIds:
+        #   "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
         if externalIds:
             # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
             # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
             return ""
     def extract_paper_id(last_name, year_str, title):
         """Build a paper id from the author's last name, the year and the title's first word.

         Only word characters (letters, digits, underscore) of the first word are
         kept, so a title such as "Q-learning: a survey" contributes just "Q" and
         characters like ':' or '-' never end up in the id.

         :param last_name: last name of the first author.
         :param year_str: publication year as a string.
         :param title: paper title; may be empty or start with punctuation.
         :return: ``last_name + year_str + first_word``; the word part is empty
             when the title contains no word characters at all.
         """
         # Search anywhere in the title instead of anchoring at position 0: an
         # anchored pattern finds nothing for titles that begin with a quote,
         # bracket or space, and indexing the empty result raised IndexError.
         first_word = re.search(r'\w+', title or "")
         return last_name + year_str + (first_word.group(0) if first_word else "")
     def extract_author_info(raw_authors):
         authors = [author['name'] for author in raw_authors]
         last_name = authors[0].split()[-1]
         return authors_str, last_name
+    def parse_search_results(search_results_ss):
         # turn the search result to a list of paper dictionary.
         papers = []
+        for raw_paper in search_results_ss:
             if raw_paper["abstract"] is None:
                 continue
             authors_str, last_name = extract_author_info(raw_paper['authors'])
             year_str = str(raw_paper['year'])
             title = raw_paper['title']
+            # some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
+            journal = raw_paper['venue'].replace("&", "\\&")
             if not journal:
                 journal = "arXiv preprint"
             paper_id = extract_paper_id(last_name, year_str, title).lower()
             }
             papers.append(result)
         return papers
     raw_results = ss_search(keyword, limit=counts)
     if raw_results is not None:
         search_results = raw_results['data']
     results = parse_search_results(search_results)
     return results
 #########################################################
 # ArXiv API
 #########################################################
     results = parse_results(content)
     return results
+#########################################################
+# References Class
+#########################################################
 # Each `paper` is a dictionary containing (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
 class References:
+    def __init__(self, load_papers=""):
         if load_papers:
             # todo: read a json file from the given path
             #       this could be used to support pre-defined references
         """
         match method:
             case "arxiv":
+                process = _collect_papers_arxiv
             case "ss":
                 process = _collect_papers_ss
             case _:
             prompts[paper["paper_id"]] = paper["abstract"]
         return prompts
 if __name__ == "__main__":
     refs = References()
     keywords_dict = {
+        "Deep Q-Networks": 15,
+        "Policy Gradient Methods": 24,
+        "Actor-Critic Algorithms": 4,
+        "Model-Based Reinforcement Learning": 13,
+        "Exploration-Exploitation Trade-off": 7
+    }
     refs.collect_papers(keywords_dict, method="ss", tldr=True)
     for p in refs.papers:
         print(p["paper_id"])
+    print(len(refs.papers))