CCCBora commited on
Commit
c2b489f
2 Parent(s): 22325f3 b1f41c8

Merge pull request #5 from CCCBora/pre-defined-references

Browse files
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import os
3
  import openai
4
- from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
5
  from utils.file_operations import hash_name
6
 
7
  # note: App白屏bug:允许第三方cookie
@@ -9,19 +9,21 @@ from utils.file_operations import hash_name
9
  # 6. get logs when the procedure is not completed. *
10
  # 7. 自己的文件库; 更多的prompts
11
  # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
12
- # 9. Load .bibtex file to generate a pre-defined references list. *
13
  # 1. 把paper改成纯JSON?
14
  # 2. 实现别的功能
15
  # 3. Check API Key GPT-4 Support.
16
  # 8. Re-build some components using `langchain`
17
- # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interation`, use LLM
 
19
  # future:
20
  # 4. add auto_polishing function
21
  # 12. Change link to more appealing color # after the website is built;
22
  # 1. Check if there are any duplicated citations
23
  # 2. Remove potential thebibliography and bibitem in .tex file
24
 
 
 
 
25
  openai_key = os.getenv("OPENAI_API_KEY")
26
  access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
27
  secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
@@ -43,22 +45,19 @@ else:
43
  IS_OPENAI_API_KEY_AVAILABLE = False
44
 
45
 
46
- def clear_inputs(text1, text2):
47
  return "", ""
48
 
49
 
50
  def wrapped_generator(paper_title, paper_description, openai_api_key=None,
51
- template="ICLR2022",
52
- cache_mode=IS_CACHE_AVAILABLE, generator=None):
53
  # if `cache_mode` is True, then follow the following steps:
54
  # check if "title"+"description" have been generated before
55
  # if so, download from the cloud storage, return it
56
  # if not, generate the result.
57
- if generator is None:
58
- # todo: add a Dropdown to select which generator to use.
59
- # generator = generate_backgrounds
60
- generator = generate_draft
61
- # generator = fake_generator
62
  if openai_api_key is not None:
63
  openai.api_key = openai_api_key
64
  openai.Model.list()
@@ -66,9 +65,8 @@ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
66
  if cache_mode:
67
  from utils.storage import list_all_files, download_file, upload_file
68
  # check if "title"+"description" have been generated before
69
-
70
  input_dict = {"title": paper_title, "description": paper_description,
71
- "generator": "generate_draft"} # todo: modify here also
72
  file_name = hash_name(input_dict) + ".zip"
73
  file_list = list_all_files()
74
  # print(f"{file_name} will be generated. Check the file list {file_list}")
@@ -79,13 +77,17 @@ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
79
  else:
80
  # generate the result.
81
  # output = fake_generate_backgrounds(title, description, openai_key)
82
- # todo: use `generator` to control which function to use.
83
- output = generator(paper_title, paper_description, template, "gpt-4")
 
 
84
  upload_file(output)
85
  return output
86
  else:
87
  # output = fake_generate_backgrounds(title, description, openai_key)
88
- output = generator(paper_title, paper_description, template, "gpt-4")
 
 
89
  return output
90
 
91
 
@@ -96,21 +98,25 @@ theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
96
  # button_primary_background_fill="#281A39"
97
  # )
98
 
 
 
 
 
 
 
 
99
  with gr.Blocks(theme=theme) as demo:
100
  gr.Markdown('''
101
  # Auto-Draft: 文献整理辅助工具
102
 
103
- 本Demo提供对[Auto-Draft](https://github.com/CCCBora/auto-draft)的auto_draft功能的测试。通过输入想要生成的论文名称(比如Playing atari with deep reinforcement learning),即可由AI辅助生成论文模板.
 
104
 
105
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
106
 
107
- 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/organizations/auto-academic/share/HPjgazDSlkwLNCWKiAiZoYtXaJIatkWDYM).
108
 
109
- 如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
110
-
111
- ## 用法
112
-
113
- 输入想要生成的论文名称(比如Playing Atari with Deep Reinforcement Learning), 点击Submit, 等待大概十分钟, 下载.zip格式的输出,在Overleaf上编译浏览.
114
  ''')
115
 
116
  with gr.Row():
@@ -123,6 +129,8 @@ with gr.Blocks(theme=theme) as demo:
123
 
124
  # 每个功能做一个tab
125
  with gr.Tab("学术论文"):
 
 
126
  title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
127
  label="Title", info="论文标题")
128
 
@@ -130,33 +138,41 @@ with gr.Blocks(theme=theme) as demo:
130
  description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
131
  info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
132
 
133
- interactive = False
134
- gr.Markdown('''
135
- ## 下面的功能我只做了UI, 还没来得及实现功能.
136
- ''')
137
  with gr.Row():
138
  with gr.Column():
 
 
 
 
 
 
 
 
139
  gr.Markdown('''
140
- Upload .bib file (Optional)
141
-
142
- 通过上传.bib文件来控制GPT-4模型必须参考哪些文献.
143
  ''')
144
  bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
145
- interactive=interactive)
 
 
 
 
146
  with gr.Column():
147
  search_engine = gr.Dropdown(label="Search Engine",
148
  choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
149
- value= "Semantic Scholar",
150
- interactive=interactive,
151
- info="用于决定GPT-4用什么搜索引擎来搜索文献. 选择None的时候仅参考给定文献.")
152
- tldr = gr.Checkbox(value=True, label="TLDR;",
153
- info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
154
- interactive = interactive),
155
- use_cache = gr.Checkbox(label="总是重新生成",
156
- info="选择此筐表示将不会读取已经生成好的文章.",
157
- interactive = interactive)
158
- slider = gr.Slider(minimum=1, maximum=30, value=20, label="最大参考文献数目",
159
- info="过多参考文献会超出Token数限制导致报错,这里限制最大参考文献数目.")
 
 
160
 
161
  with gr.Row():
162
  clear_button_pp = gr.Button("Clear")
@@ -195,7 +211,11 @@ with gr.Blocks(theme=theme) as demo:
195
  file_output = gr.File(label="Output")
196
 
197
  clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
198
- submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key], outputs=file_output)
 
 
 
 
199
 
200
  demo.queue(concurrency_count=1, max_size=5, api_open=False)
201
  demo.launch()
 
1
  import gradio as gr
2
  import os
3
  import openai
4
+ from auto_backgrounds import generate_backgrounds, generate_draft
5
  from utils.file_operations import hash_name
6
 
7
  # note: App白屏bug:允许第三方cookie
 
9
  # 6. get logs when the procedure is not completed. *
10
  # 7. 自己的文件库; 更多的prompts
11
  # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
 
12
  # 1. 把paper改成纯JSON?
13
  # 2. 实现别的功能
14
  # 3. Check API Key GPT-4 Support.
15
  # 8. Re-build some components using `langchain`
 
16
  # - in `gpt_interation`, use LLM
17
+ # 5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
18
  # future:
19
  # 4. add auto_polishing function
20
  # 12. Change link to more appealing color # after the website is built;
21
  # 1. Check if there are any duplicated citations
22
  # 2. Remove potential thebibliography and bibitem in .tex file
23
 
24
+ #######################################################################################################################
25
+ # Check if openai and cloud storage available
26
+ #######################################################################################################################
27
  openai_key = os.getenv("OPENAI_API_KEY")
28
  access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
29
  secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
 
45
  IS_OPENAI_API_KEY_AVAILABLE = False
46
 
47
 
48
+ def clear_inputs(*args):
49
  return "", ""
50
 
51
 
52
  def wrapped_generator(paper_title, paper_description, openai_api_key=None,
53
+ paper_template="ICLR2022", tldr=True, max_num_refs=50, selected_sections=None, bib_refs=None, model="gpt-4",
54
+ cache_mode=IS_CACHE_AVAILABLE):
55
  # if `cache_mode` is True, then follow the following steps:
56
  # check if "title"+"description" have been generated before
57
  # if so, download from the cloud storage, return it
58
  # if not, generate the result.
59
+ if bib_refs is not None:
60
+ bib_refs = bib_refs.name
 
 
 
61
  if openai_api_key is not None:
62
  openai.api_key = openai_api_key
63
  openai.Model.list()
 
65
  if cache_mode:
66
  from utils.storage import list_all_files, download_file, upload_file
67
  # check if "title"+"description" have been generated before
 
68
  input_dict = {"title": paper_title, "description": paper_description,
69
+ "generator": "generate_draft"}
70
  file_name = hash_name(input_dict) + ".zip"
71
  file_list = list_all_files()
72
  # print(f"{file_name} will be generated. Check the file list {file_list}")
 
77
  else:
78
  # generate the result.
79
  # output = fake_generate_backgrounds(title, description, openai_key)
80
+ output = generate_draft(paper_title, paper_description, template=paper_template,
81
+ tldr=tldr, max_num_refs=max_num_refs,
82
+ sections=selected_sections, bib_refs=bib_refs, model=model)
83
+ # output = generate_draft(paper_title, paper_description, template, "gpt-4")
84
  upload_file(output)
85
  return output
86
  else:
87
  # output = fake_generate_backgrounds(title, description, openai_key)
88
+ output = generate_draft(paper_title, paper_description, template=paper_template,
89
+ tldr=tldr, max_num_refs=max_num_refs,
90
+ sections=selected_sections, bib_refs=bib_refs, model=model)
91
  return output
92
 
93
 
 
98
  # button_primary_background_fill="#281A39"
99
  # )
100
 
101
+ ACADEMIC_PAPER = """## 一键生成论文初稿
102
+
103
+ 1. 在Title文本框中输入想要生成的论文名称(比如Playing Atari with Deep Reinforcement Learning).
104
+ 2. 点击Submit. 等待大概十分钟.
105
+ 3. 在右侧下载.zip格式的输出,在Overleaf上编译浏览.
106
+ """
107
+
108
  with gr.Blocks(theme=theme) as demo:
109
  gr.Markdown('''
110
  # Auto-Draft: 文献整理辅助工具
111
 
112
+ 本Demo提供对[Auto-Draft](https://github.com/CCCBora/auto-draft)的auto_draft功能的测试.
113
+ 通过输入想要生成的论文名称(比如Playing atari with deep reinforcement learning),即可由AI辅助生成论文模板.
114
 
115
  ***2023-05-03 Update***: 在公开版本中为大家提供了输入OpenAI API Key的地址, 如果有GPT-4的API KEY的话可以在这里体验!
116
 
117
+ 在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
118
 
119
+ 如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
 
 
 
 
120
  ''')
121
 
122
  with gr.Row():
 
129
 
130
  # 每个功能做一个tab
131
  with gr.Tab("学术论文"):
132
+ gr.Markdown(ACADEMIC_PAPER)
133
+
134
  title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
135
  label="Title", info="论文标题")
136
 
 
138
  description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
139
  info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
140
 
 
 
 
 
141
  with gr.Row():
142
  with gr.Column():
143
+ with gr.Row():
144
+ template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
145
+ interactive=False,
146
+ info="生成论文的参考模板. (暂不支持修改)")
147
+ model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"],
148
+ value="gpt-4",
149
+ interactive=True,
150
+ info="生成论文用到的语言模型.")
151
  gr.Markdown('''
152
+ 上传.bib文件提供AI需要参考的文献.
 
 
153
  ''')
154
  bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
155
+ interactive=True)
156
+ gr.Examples(
157
+ examples=["latex_templates/example_references.bib"],
158
+ inputs=bibtex_file
159
+ )
160
  with gr.Column():
161
  search_engine = gr.Dropdown(label="Search Engine",
162
  choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
163
+ value="Semantic Scholar",
164
+ interactive=False,
165
+ info="用于决定GPT-4用什么搜索引擎来搜索文献. (暂不支持修改)")
166
+ tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
167
+ info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
168
+ interactive=True)
169
+ sections = gr.CheckboxGroup(
170
+ choices=["introduction", "related works", "backgrounds", "methodology", "experiments",
171
+ "conclusion", "abstract"],
172
+ type="value", label="生成章节", interactive=True,
173
+ value=["introduction", "related works"])
174
+ slider = gr.Slider(minimum=1, maximum=100, value=50, step=1,
175
+ interactive=True, label="最大参考文献数目")
176
 
177
  with gr.Row():
178
  clear_button_pp = gr.Button("Clear")
 
211
  file_output = gr.File(label="Output")
212
 
213
  clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
214
+ # submit_button_pp.click(fn=wrapped_generator,
215
+ # inputs=[title, description_pp, key, template, tldr, slider, sections, bibtex_file], outputs=file_output)
216
+ submit_button_pp.click(fn=wrapped_generator,
217
+ inputs=[title, description_pp, key, template, tldr_checkbox, slider, sections, bibtex_file,
218
+ model_selection], outputs=file_output)
219
 
220
  demo.queue(concurrency_count=1, max_size=5, api_open=False)
221
  demo.launch()
auto_backgrounds.py CHANGED
@@ -1,5 +1,5 @@
1
  import os.path
2
-
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
@@ -25,16 +25,35 @@ def log_usage(usage, generating_target, print_out=True):
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
- f"{TOTAL_TOKENS} tokens have been used in total."
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
- def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
- search_engine="ss", tldr=False, max_kw_refs=10):
35
- '''
36
- todo: use `model` to control which model to use; may use another method to generate keywords or collect references
37
- '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  paper = {}
39
  paper_body = {}
40
 
@@ -45,13 +64,17 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
45
  # Generate keywords and references
46
  print("Initialize the paper information ...")
47
  input_dict = {"title": title, "description": description}
48
- keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
49
- print(f"keywords: {keywords}")
50
  log_usage(usage, "keywords")
51
 
52
- ref = References(load_papers="")
53
- ref.collect_papers(keywords, method=search_engine, tldr=tldr)
54
- all_paper_ids = ref.to_bibtex(bibtex_path) # todo: this will used to check if all citations are in this list
 
 
 
 
55
 
56
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
57
 
@@ -60,11 +83,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
60
  paper["references"] = ref.to_prompts()
61
  paper["body"] = paper_body
62
  paper["bibtex"] = bibtex_path
63
- return paper, destination_folder, all_paper_ids
64
 
65
 
66
 
67
  def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
 
68
  paper, destination_folder, _ = _generation_setup(title, description, template, model)
69
 
70
  for section in ["introduction", "related works", "backgrounds"]:
@@ -82,54 +106,40 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
82
  return make_archive(destination_folder, filename)
83
 
84
 
85
- def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
86
- """
87
- This function is used to test the whole pipeline without calling OpenAI API.
88
- """
89
- input_dict = {"title": title, "description": description, "generator": "generate_draft"}
90
- filename = hash_name(input_dict) + ".zip"
91
- return make_archive("sample-output.pdf", filename)
92
-
93
-
94
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4", search_engine="ss", tldr=True, max_kw_refs=10):
95
- paper, destination_folder, _ = _generation_setup(title, description, template, model, search_engine, tldr, max_kw_refs)
96
-
97
- # todo: `list_of_methods` failed to be generated; find a solution ...
98
- # print("Generating figures ...")
99
- # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
100
- # log_usage(usage, "figures")
101
-
102
- # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
103
- for section in ["introduction", "related works", "backgrounds", "abstract"]:
104
- try:
105
- usage = section_generation(paper, section, destination_folder, model=model)
106
- log_usage(usage, section)
107
- except Exception as e:
108
- message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
109
- print(message)
110
- logging.info(message)
111
- max_attempts = 2
112
- # todo: make this part more compact
113
- # re-try `max_attempts` time
114
- for i in range(max_attempts):
115
  time.sleep(20)
116
- try:
117
- usage = section_generation(paper, section, destination_folder, model=model)
118
- log_usage(usage, section)
119
- e = None
120
- except Exception as e:
121
- pass
122
- if e is None:
123
- break
124
-
125
 
126
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
127
  filename = hash_name(input_dict) + ".zip"
 
128
  return make_archive(destination_folder, filename)
129
 
130
 
131
  if __name__ == "__main__":
 
 
 
132
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
133
  description = ""
134
- output = generate_draft(title, description, search_engine="ss", tldr=True, max_kw_refs=10)
135
  print(output)
 
1
  import os.path
2
+ import json
3
  from utils.references import References
4
  from utils.file_operations import hash_name, make_archive, copy_templates
5
  from section_generator import section_generation_bg, keywords_generation, figures_generation, section_generation
 
25
  TOTAL_COMPLETION_TOKENS += completion_tokens
26
 
27
  message = f"For generating {generating_target}, {total_tokens} tokens have been used ({prompts_tokens} for prompts; {completion_tokens} for completion). " \
28
+ f"{TOTAL_TOKENS} tokens have been used in total.\n\n"
29
  if print_out:
30
  print(message)
31
  logging.info(message)
32
 
33
+ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
34
+ max_kw_refs=10, max_num_refs=50, bib_refs=None):
35
+ """
36
+ This function handles the setup process for paper generation; it contains three folds
37
+ 1. Copy the template to the outputs folder. Create the log file `generation.log`
38
+ 2. Collect references based on the given `title` and `description`
39
+ 3. Generate the basic `paper` object (a dictionary)
40
+
41
+ Parameters:
42
+ title (str): The title of the paper.
43
+ description (str, optional): A short description or abstract for the paper. Defaults to an empty string.
44
+ template (str, optional): The template to be used for paper generation. Defaults to "ICLR2022".
45
+ tldr (bool, optional): A flag indicating whether a TL;DR (Too Long; Didn't Read) summary should be generated for the collected papers. Defaults to False.
46
+ max_kw_refs (int, optional): The maximum number of references that can be associated with each keyword. Defaults to 10.
47
+ max_num_refs (int, optional): The maximum number of references that can be included in the paper. Defaults to 50.
48
+ bib_refs (list, optional): A list of pre-existing references in BibTeX format. Defaults to None.
49
+
50
+ Returns:
51
+ tuple: A tuple containing the following elements:
52
+ - paper (dict): A dictionary containing the generated paper information.
53
+ - destination_folder (str): The path to the destination folder where the generation log is saved.
54
+ - all_paper_ids (list): A list of all paper IDs collected for the references.
55
+ """
56
+ print("Generation setup...")
57
  paper = {}
58
  paper_body = {}
59
 
 
64
  # Generate keywords and references
65
  print("Initialize the paper information ...")
66
  input_dict = {"title": title, "description": description}
67
+ # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
68
+ keywords, usage = keywords_generation(input_dict)
69
  log_usage(usage, "keywords")
70
 
71
+ # generate keywords dictionary
72
+ keywords = {keyword:max_kw_refs for keyword in keywords}
73
+ print(f"keywords: {keywords}\n\n")
74
+
75
+ ref = References(title, bib_refs)
76
+ ref.collect_papers(keywords, tldr=tldr)
77
+ all_paper_ids = ref.to_bibtex(bibtex_path, max_num_refs) #todo: max_num_refs has not implemented yet
78
 
79
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
80
 
 
83
  paper["references"] = ref.to_prompts()
84
  paper["body"] = paper_body
85
  paper["bibtex"] = bibtex_path
86
+ return paper, destination_folder, all_paper_ids #todo: use `all_paper_ids` to check if all citations are in this list
87
 
88
 
89
 
90
  def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
91
+ # todo: to match the current generation setup
92
  paper, destination_folder, _ = _generation_setup(title, description, template, model)
93
 
94
  for section in ["introduction", "related works", "backgrounds"]:
 
106
  return make_archive(destination_folder, filename)
107
 
108
 
109
+ def generate_draft(title, description="", template="ICLR2022",
110
+ tldr=True, max_kw_refs=10, max_num_refs=30, sections=None, bib_refs=None, model="gpt-4"):
111
+ # pre-processing `sections` parameter;
112
+ if sections is None:
113
+ sections = ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]
114
+
115
+ # todo: add more parameters; select which section to generate; select maximum refs.
116
+ paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, max_num_refs, bib_refs)
117
+ for section in sections:
118
+ max_attempts = 4
119
+ attempts_count = 0
120
+ while attempts_count < max_attempts:
121
+ try:
122
+ usage = section_generation(paper, section, destination_folder, model=model)
123
+ log_usage(usage, section)
124
+ break
125
+ except Exception as e:
126
+ message = f"Failed to generate {section}. {type(e).__name__} was raised: {e}"
127
+ print(message)
128
+ logging.info(message)
129
+ attempts_count += 1
 
 
 
 
 
 
 
 
 
130
  time.sleep(20)
 
 
 
 
 
 
 
 
 
131
 
132
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
133
  filename = hash_name(input_dict) + ".zip"
134
+ print("\nMission completed.\n")
135
  return make_archive(destination_folder, filename)
136
 
137
 
138
  if __name__ == "__main__":
139
+ import openai
140
+ openai.api_key = os.getenv("OPENAI_API_KEY")
141
+
142
  title = "Using interpretable boosting algorithms for modeling environmental and agricultural data"
143
  description = ""
144
+ output = generate_draft(title, description, tldr=True, max_kw_refs=10)
145
  print(output)
latex_templates/ICLR2022/fig.png ADDED
latex_templates/ICLR2022/template.tex CHANGED
@@ -6,7 +6,8 @@
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
- \usepackage{algorithmicx}
 
10
 
11
  \title{TITLE}
12
  \author{GPT-4}
 
6
  \input{math_commands.tex}
7
  \usepackage{hyperref}
8
  \usepackage{url}
9
+ \usepackage{algorithm}
10
+ \usepackage{algorithmic}
11
 
12
  \title{TITLE}
13
  \author{GPT-4}
latex_templates/example_references.bib ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @inproceedings{ma2020understanding,
2
+ title={Understanding the impact of model incoherence on convergence of incremental sgd with random reshuffle},
3
+ author={Ma, Shaocong and Zhou, Yi},
4
+ booktitle={International Conference on Machine Learning},
5
+ pages={6565--6574},
6
+ year={2020},
7
+ organization={PMLR}
8
+ }
9
+
10
+ @inproceedings{ma2020variance,
11
+ author = {Ma, Shaocong and Zhou, Yi and Zou, Shaofeng},
12
+ booktitle = {Advances in Neural Information Processing Systems},
13
+ editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
14
+ pages = {14796--14806},
15
+ publisher = {Curran Associates, Inc.},
16
+ title = {Variance-Reduced Off-Policy TDC Learning: Non-Asymptotic Convergence Analysis},
17
+ url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/a992995ef4f0439b258f2360dbb85511-Paper.pdf},
18
+ volume = {33},
19
+ year = {2020}
20
+ }
latex_templates/pre_refs.bib DELETED
@@ -1,17 +0,0 @@
1
-
2
- @article{1512.07669,
3
- title = {Reinforcement Learning: Stochastic Approximation Algorithms for Markov
4
- Decision Processes},
5
- author = {Vikram Krishnamurthy},
6
- journal={arXiv preprint arXiv:1512.07669},
7
- year = {2015},
8
- url = {http://arxiv.org/abs/1512.07669v1}
9
- }
10
-
11
- @article{1511.02377,
12
- title = {The Value Functions of Markov Decision Processes},
13
- author = {Ehud Lehrer , Eilon Solan , Omri N. Solan},
14
- journal={arXiv preprint arXiv:1511.02377},
15
- year = {2015},
16
- url = {http://arxiv.org/abs/1511.02377v1}
17
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
section_generator.py CHANGED
@@ -3,6 +3,9 @@ from utils.gpt_interaction import get_responses, extract_responses, extract_keyw
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
 
 
 
6
 
7
  # three GPT-based content generator:
8
  # 1. section_generation: used to generate main content of the paper
@@ -23,7 +26,7 @@ def section_generation_bg(paper, section, save_to_path, model):
23
  print(f"Generating {section}...")
24
  prompts = generate_bg_summary_prompts(paper, section)
25
  gpt_response, usage = get_responses(prompts, model)
26
- output = extract_responses(gpt_response)
27
  paper["body"][section] = output
28
  tex_file = os.path.join(save_to_path, f"{section}.tex")
29
  # tex_file = save_to_path + f"/{section}.tex"
@@ -56,36 +59,46 @@ def section_generation(paper, section, save_to_path, model):
56
  print(f"Generating {section}...")
57
  prompts = generate_paper_prompts(paper, section)
58
  gpt_response, usage = get_responses(prompts, model)
59
- output = extract_responses(gpt_response)
60
  paper["body"][section] = output
61
  tex_file = os.path.join(save_to_path, f"{section}.tex")
62
  # tex_file = save_to_path + f"/{section}.tex"
63
  if section == "abstract":
64
  with open(tex_file, "w") as f:
65
- f.write(r"\begin{abstract}")
66
- with open(tex_file, "a") as f:
67
  f.write(output)
68
- with open(tex_file, "a") as f:
69
- f.write(r"\end{abstract}")
70
  else:
71
  with open(tex_file, "w") as f:
72
- f.write(f"\section{{{section.upper()}}}\n")
73
- with open(tex_file, "a") as f:
74
  f.write(output)
75
  time.sleep(5)
76
  print(f"{section} has been generated. Saved to {tex_file}.")
77
  return usage
78
 
79
- def keywords_generation(input_dict, model, max_kw_refs = 10):
 
 
 
 
 
 
 
 
 
 
 
80
  title = input_dict.get("title")
81
- description = input_dict.get("description", "")
82
- if title is not None:
83
- prompts = generate_keywords_prompts(title, description, max_kw_refs)
84
- gpt_response, usage = get_responses(prompts, model)
85
- keywords = extract_keywords(gpt_response)
86
- return keywords, usage
87
- else:
88
- raise ValueError("`input_dict` must include the key 'title'.")
 
 
 
 
 
89
 
90
  def figures_generation(paper, save_to_path, model):
91
  prompts = generate_experiments_prompts(paper)
 
3
  from utils.figures import generate_random_figures
4
  import time
5
  import os
6
+ from utils.prompts import KEYWORDS_SYSTEM
7
+ from utils.gpt_interaction import get_gpt_responses
8
+ import json
9
 
10
  # three GPT-based content generator:
11
  # 1. section_generation: used to generate main content of the paper
 
26
  print(f"Generating {section}...")
27
  prompts = generate_bg_summary_prompts(paper, section)
28
  gpt_response, usage = get_responses(prompts, model)
29
+ output = gpt_response # extract_responses(gpt_response)
30
  paper["body"][section] = output
31
  tex_file = os.path.join(save_to_path, f"{section}.tex")
32
  # tex_file = save_to_path + f"/{section}.tex"
 
59
  print(f"Generating {section}...")
60
  prompts = generate_paper_prompts(paper, section)
61
  gpt_response, usage = get_responses(prompts, model)
62
+ output = gpt_response # extract_responses(gpt_response)
63
  paper["body"][section] = output
64
  tex_file = os.path.join(save_to_path, f"{section}.tex")
65
  # tex_file = save_to_path + f"/{section}.tex"
66
  if section == "abstract":
67
  with open(tex_file, "w") as f:
 
 
68
  f.write(output)
 
 
69
  else:
70
  with open(tex_file, "w") as f:
 
 
71
  f.write(output)
72
  time.sleep(5)
73
  print(f"{section} has been generated. Saved to {tex_file}.")
74
  return usage
75
 
76
+ # def keywords_generation(input_dict, model, max_kw_refs = 10):
77
+ # title = input_dict.get("title")
78
+ # description = input_dict.get("description", "")
79
+ # if title is not None:
80
+ # prompts = generate_keywords_prompts(title, description, max_kw_refs)
81
+ # gpt_response, usage = get_responses(prompts, model)
82
+ # keywords = extract_keywords(gpt_response)
83
+ # return keywords, usage
84
+ # else:
85
+ # raise ValueError("`input_dict` must include the key 'title'.")
86
+
87
+ def keywords_generation(input_dict):
88
  title = input_dict.get("title")
89
+ max_attempts = 10
90
+ attempts_count = 0
91
+ while attempts_count < max_attempts:
92
+ try:
93
+ keywords, usage= get_gpt_responses(KEYWORDS_SYSTEM.format(min_refs_num=3, max_refs_num=5), title,
94
+ model="gpt-3.5-turbo", temperature=0.4)
95
+ print(keywords)
96
+ output = json.loads(keywords)
97
+ return output, usage
98
+ except json.decoder.JSONDecodeError:
99
+ attempts_count += 1
100
+ time.sleep(20)
101
+ raise RuntimeError("Fail to generate keywords.")
102
 
103
  def figures_generation(paper, save_to_path, model):
104
  prompts = generate_experiments_prompts(paper)
utils/gpt_interaction.py CHANGED
@@ -76,6 +76,22 @@ def get_responses(user_message, model="gpt-4", temperature=0.4, openai_key=None)
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if __name__ == "__main__":
81
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
 
76
  log.info(assistant_message)
77
  return assistant_message, usage
78
 
79
+ def get_gpt_responses(systems, prompts, model="gpt-4", temperature=0.4):
80
+ conversation_history = [
81
+ {"role": "system", "content": systems},
82
+ {"role": "user", "content": prompts}
83
+ ]
84
+ response = openai.ChatCompletion.create(
85
+ model=model,
86
+ messages=conversation_history,
87
+ n=1, # Number of responses you want to generate
88
+ temperature=temperature, # Controls the creativity of the generated response
89
+ )
90
+ assistant_message = response['choices'][0]["message"]["content"]
91
+ usage = response['usage']
92
+ log.info(assistant_message)
93
+ return assistant_message, usage
94
+
95
 
96
  if __name__ == "__main__":
97
  test_strings = [r"f.write(r'hello world')", r"f.write(r'''hello world''')", r"f.write(r'''hello world",
utils/prompts.py CHANGED
@@ -1,24 +1,13 @@
1
  import logging
2
- log = logging.getLogger(__name__)
3
 
4
- INSTRUCTIONS = {"introduction": "Please include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related work for context. Explain the main differences from your work. ",
5
- "related works": r"Please discuss key publications, methods, and techniques in your research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
6
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",
7
- "methodology": "Please read the paper I have written and write the methodology section with three subsections: Concisely describe the techniques, algorithms, and procedures employed to address the research problem (use as many as formulas written in LaTeX). Explain the rationale behind choosing these methods, and provide sufficient detail for replication (use as many as formulas written in LaTeX). Do not make any list steps; instead, just put them in the same paragraph with sufficient explainations. Do not include \section{...} but you can have \subsection{...}. ",
8
- "results": "Please write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
9
- "experiments": "Please write the experiment section using LaTeX. Include a table to compare with other methods and bold our method. Include one figure comparison.png; this figure compares the loss curve with other methods. Do not include \section{...}. ",
10
- "conclusion": "Please read the paper I have written and write the conclusion section.",
11
- "abstract": "Please read the paper I have written and write the abstract."}
12
-
13
- INSTRUCTIONS["related works"] = r"Please discuss three to five main related fields to this paper. For each field, select " \
14
- r"five to ten key publications from references. For each reference, analyze its strengths and weaknesses in one or two sentences. " \
15
- r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "
16
 
 
17
 
18
- BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
19
- "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
20
- "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
21
 
 
 
 
22
  def generate_keywords_prompts(title, description="", num_refs=5):
23
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
24
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
@@ -39,6 +28,83 @@ def generate_experiments_prompts(paper_info):
39
 
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def generate_paper_prompts(paper_info, section):
43
  title = paper_info["title"]
44
  description = paper_info["description"]
@@ -47,34 +113,57 @@ def generate_paper_prompts(paper_info, section):
47
 
48
  # fundamental_subprompt - describe the basic information of paper
49
  # instruction_subprompt - tell AI what to do
50
- # references_subprompt - give AI references
51
  # self_subprompt - give AI existing written parts
52
  # output_subprompt - tell AI how to output
53
-
54
- fundamental_subprompt = f"I am writing a machine learning paper with the title '{title}'. {description}\n"
55
- instruction_subprompt = f"You need to write the {section} section. {INSTRUCTIONS[section]}\n"
56
- # references_subprompt = f"Please read the following references: \n{references}\n"\
57
- # f"Every time you use information from the references, you need to cite its id after the sentence; " \
58
- # f"for example, the sentence where you use information from 1905.09788 \cite{{1905.09788}}. " \
59
- # f"Please avoid citing the same reference in the same paragraph. \n"
60
- references_subprompt = f"Please read the following references: \n{references}\n"\
61
- f"Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
62
- f"For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
63
- f"For example of \citet, \citet{{lei2022adaptive}} claims some information. \n" \
64
- f"Please avoid citing the same reference in the same paragraph. \n"
65
- self_subprompt = f"Here is the paper that I have written: {paper}.\n"
66
- output_subprompt = r"Put your response (do not include \section{...}) in the following Python script:" \
67
- f"with open(\"{section}.tex\", \"w\") as f: f.write(r'''your_response''')"
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if section in ["introduction", "related works", "backgrounds"]:
70
  # title + references + instruction
71
- prompts = fundamental_subprompt + instruction_subprompt + references_subprompt + output_subprompt
72
- elif section in ["experiments"]:
73
- # only title and instruction
74
- prompts = fundamental_subprompt + instruction_subprompt + output_subprompt
75
- elif section in ["methodology", "abstract", "conclusion"]:
 
 
 
 
 
 
 
 
76
  # title + instruction + paper
77
- prompts = fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt
 
 
 
 
78
  else:
79
  raise NotImplementedError
80
 
@@ -82,6 +171,16 @@ def generate_paper_prompts(paper_info, section):
82
  return prompts
83
 
84
 
 
 
 
 
 
 
 
 
 
 
85
  def generate_bg_summary_prompts(paper_info, section):
86
  title = paper_info["title"]
87
  description = paper_info["description"]
 
1
  import logging
2
+ from langchain import PromptTemplate
3
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ log = logging.getLogger(__name__)
6
 
 
 
 
7
 
8
+ ######################################################################################################################
9
+ # Some basic functions
10
+ ######################################################################################################################
11
  def generate_keywords_prompts(title, description="", num_refs=5):
12
  prompts = f"I am writing a machine learning paper with the title '{title}'. {description}\n" \
13
  f"Generate three to five keywords. For each keyword, rate it from 1 to {num_refs}; the larger number means more important." \
 
28
 
29
 
30
 
31
######################################################################################################################
# System Message
######################################################################################################################
# LangChain PromptTemplates for the system messages sent to the chat model.
# Each template is instantiated below and filled in at the call site via
# `.format(...)` (e.g. KEYWORDS_SYSTEM.format(min_refs_num=3, max_refs_num=5)).

# two parameters: min_refs_num, max_refs_num
# keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
# Instructions
# - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
# - The length of list should between {min_refs_num} and {max_refs_num}
# - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.\n
Instructions:\n
- Your response should follow the following output format: ["field1", "field2", "field3", "field4"]\n
- The length of this Python list should between {min_refs_num} and {max_refs_num}."""

# two parameters: min_exps_num, max_exps_num
exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.
Instructions
- Your response should always be a Python list; e.g. ["method_name_1", "method_name_2", "method_name_3"]
- The length of list should between {min_exps_num} and {max_exps_num}
- Use abbreviation to make each method's name have 5 characters or less."""

# one parameter: research_field
section_generation_system_template = r"""You are an assistant designed to write academic papers in the field of {research_field} using LaTeX.
Instructions
- Your response should be professional and in academic tone.
- Always give a high-level overview at the beginning of each section or subsection.
"""

# System prompt asking the model for 3-5 searching keywords as a Python list.
KEYWORDS_SYSTEM = PromptTemplate(input_variables=["min_refs_num", "max_refs_num"],
                                 template=keywords_system_template)
# System prompt asking for related algorithm/method names (abbreviated).
EXP_METHODS_SYSTEM = PromptTemplate(input_variables=["min_exps_num", "max_exps_num"],
                                    template=exp_methods_system_template)
# System prompt that frames the model as a LaTeX paper writer for a field.
SECTION_GENERATION_SYSTEM = PromptTemplate(input_variables=["research_field"],
                                           template=section_generation_system_template)
66
+
67
+
68
######################################################################################################################
# Academic Paper
######################################################################################################################
# Per-section writing instructions. `generate_paper_prompts` looks up
# INSTRUCTIONS[section] and inserts it into the generation prompt.
# NOTE(review): the doubled braces (e.g. \\begin{{algorithm}}) suggest these
# strings are expected to pass through str.format-style templating — confirm
# whether PromptTemplate re-formats inserted values, otherwise the doubling
# reaches the model verbatim.

INSTRUCTIONS = {"introduction":
                    "- Include five paragraph: Establishing the motivation for the research. Explaining its importance and relevance to the AI community. Clearly state the problem you're addressing, your proposed solution, and the specific research questions or objectives. Briefly mention key related works for context and explain the main differences from this work. List three novel contributions of this paper.",
                "results":
                    "Write the theoretical results section using LaTeX. Include theorem and corollary to support this paper (with formulas). Explain what assumptions are used and why they are standard and necessary. Do not include \section{...}. ",
                "conclusion":
                    "- Read the existing parts of paper and write the conclusion section.",
                "abstract":
                    "- Read the existing parts of paper and write the abstract."}

# Backgrounds: one problem-statement paragraph followed by 2-3 subsections
# of foundational concepts/notation written in LaTeX.
INSTRUCTIONS["backgrounds"] = "- Start from one high-level paragraph to state the central problem in this field with detailed examples in industrial applications and theoretical challenges. \n" \
                              "- Followed by two to three subsections: Explain the foundational concepts and notations that underpin your research using as many as mathematical formulas (written in LaTeX). " \
                              "Introduce more necessary mathematical notations, equations, or algorithms that are connected to this work. Present detailed discussions on how these concepts are applied in this paper."

# Related works: grouped by field using \paragraph{...}, citing references.
INSTRUCTIONS["related works"] = r"- Discuss three to five main related fields to this paper. " \
                                r"For each field, select five to ten key publications from references. " \
                                r"For each reference, analyze its strengths and weaknesses in one or two sentences. " \
                                r"Present the related works in a logical manner, often chronologically. " \
                                r"Consider using a taxonomy or categorization to structure the discussion. " \
                                r"Do not use \section{...} or \subsection{...}; use \paragraph{...} to list related fields. "

# Methodology: overview + figures + a formulation subsection (optionally with
# an algorithm environment) + a key-concepts subsection.
INSTRUCTIONS["methodology"] = "- Provide a high-level overview of the proposed method at the beginning of this section. \n " \
                              "- Assume you have some figures ('fig1.png', 'fig2.png', ...); they can be any figures you need (e.g. flow chart, model architecture, sample output, simulation result, or others you need). Insert figures you need with informative caption. \n" \
                              "- Use one subsection to give a detailed formulation of the proposed method and explain how it overcomes the weakness of existing methods mentioned in this paper. " \
                              " If necessary, write pseudo codes wrapped by \\begin{{algorithm}} ... \\end{{algorithm}} to explain the detailed steps instead of simply listing them. \n" \
                              "- Use one follow-up subsection to highlight the key concepts in the proposed method. " \
                              " Elaborate the novelty of these key concepts using formulas and inserting appropriate figures. \n" \
                              "- Ensure the name of each subsection to be specific. \n"

# Experiments: overview, comparison table (bold our method), and figures.
INSTRUCTIONS["experiments"] = "- Provide a high-level overview at the beginning of this section.\n " \
                              "- If necessary, include a table to compare with other methods and bold our method.\n" \
                              "- Assume you have some figures ('exp1.png', 'exp2.png', ...); they can be any figures you need (e.g. loss curves, comparison with other methods, visualization, or others you need). Insert figures you need with informative caption. \n" \
                              "- If necessary, use different subsections to distinguish different experimental setup."
106
+
107
+
108
  def generate_paper_prompts(paper_info, section):
109
  title = paper_info["title"]
110
  description = paper_info["description"]
 
113
 
114
  # fundamental_subprompt - describe the basic information of paper
115
  # instruction_subprompt - tell AI what to do
116
+ # ref_instruction_subprompt - give AI references
117
  # self_subprompt - give AI existing written parts
118
  # output_subprompt - tell AI how to output
119
+ fundamental_subprompt = "Your task is to write the {section} section of the machine learning paper with the title '{title}'. {description}\n"
120
+ instruction_subprompt = "\n" \
121
+ "Your response should follow the following instructions:\n" \
122
+ "{instruction}\n" \
123
+ "- Start with \section{{{section}}}\n"
124
+ ref_instruction_subprompt = "- Read references. " \
125
+ "Every time you use information from the references, you need to appropriately cite it (using \citep or \citet)." \
126
+ "For example of \citep, the sentence where you use information from lei2022adaptive \citep{{lei2022adaptive}}. " \
127
+ "For example of \citet, \citet{{lei2022adaptive}} claims some information.\n" \
128
+ "- Avoid citing the same reference in a same paragraph.\n" \
129
+ "\n" \
130
+ "References:\n" \
131
+ "{references}"
132
+ self_subprompt = "The existing parts of this paper is provided here: {paper}.\n"
133
+ output_subprompt = "Your response should start with \section{{{section}}}. Ensure that it can be directly compiled by LeTaX."
134
+ abstract_output_subprompt = "Your response should start with \\begin{{abstract}} and should end with \\end{{abstract}}. Ensure that it can be directly compiled by LeTaX."
135
+
136
+ reivew_prompts = PromptTemplate(
137
+ input_variables=["title", "description", "instruction", "section", "references"],
138
+ template=fundamental_subprompt + instruction_subprompt + ref_instruction_subprompt + output_subprompt)
139
+ summarization_prompts = PromptTemplate(
140
+ input_variables=["title", "description", "instruction", "section", "paper"],
141
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + output_subprompt)
142
+ abstract_prompts = PromptTemplate(
143
+ input_variables=["title", "description", "instruction", "section", "paper"],
144
+ template=fundamental_subprompt + instruction_subprompt + self_subprompt + abstract_output_subprompt)
145
 
146
  if section in ["introduction", "related works", "backgrounds"]:
147
  # title + references + instruction
148
+ prompts = reivew_prompts.format(title=title,
149
+ description=description,
150
+ instruction=INSTRUCTIONS[section],
151
+ section=section,
152
+ references=references)
153
+ elif section in ["abstract"]:
154
+ # title + instruction + paper
155
+ prompts = abstract_prompts.format(title=title,
156
+ description=description,
157
+ instruction=INSTRUCTIONS[section],
158
+ section=section,
159
+ paper=paper)
160
+ elif section in ["methodology", "experiments", "conclusion"]:
161
  # title + instruction + paper
162
+ prompts = summarization_prompts.format(title=title,
163
+ description=description,
164
+ instruction=INSTRUCTIONS[section],
165
+ section=section,
166
+ paper=paper)
167
  else:
168
  raise NotImplementedError
169
 
 
171
  return prompts
172
 
173
 
174
######################################################################################################################
# Literature Review
######################################################################################################################
# Per-section instructions for the survey/background-summary flow
# (consumed by `generate_bg_summary_prompts`).

BG_INSTRUCTIONS = {"introduction": "Please include four paragraph: Establishing the motivation for this survey. Explaining its importance and relevance to the AI community. Clearly state the coverage of this survey and the specific research questions or objectives. Briefly mention key related work for context. ",
                   "related works": r"Please discuss key publications, methods, and techniques in related research area. Analyze the strengths and weaknesses of existing methods, and present the related works in a logical manner, often chronologically. Consider using a taxonomy or categorization to structure the discussion. Do not use \section{...} or \subsection{...}; use \paragraph{...} instead. ",
                   "backgrounds": r"Please clearly state the central problem in this field. Explain the foundational theories, concepts, and principles that underpin your research using as many as mathematical formulas or equations (written in LaTeX). Introduce any necessary mathematical notations, equations, or algorithms that are central to this field (written them in LaTeX). Do not include \section{...} but you can have \subsection{...}. ",}
181
+
182
+
183
+
184
  def generate_bg_summary_prompts(paper_info, section):
185
  title = paper_info["title"]
186
  description = paper_info["description"]
utils/references.py CHANGED
@@ -1,17 +1,27 @@
1
  # Each `paper` is a dictionary containing:
2
- # (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal
3
  #
4
  # Generate references:
5
  # `Reference` class:
6
  # 1. Read a given .bib file to collect papers; use `search_paper_abstract` method to fill missing abstract.
7
- # 2. Given some keywords; use ArXiv or Semantic Scholar API to find papers.
8
  # 3. Generate bibtex from the selected papers. --> to_bibtex()
9
  # 4. Generate prompts from the selected papers: --> to_prompts()
10
  # A sample prompt: {"paper_id": "paper summary"}
11
 
 
 
 
 
 
 
 
 
 
12
  import requests
13
  import re
14
  import bibtexparser
 
15
  from scholarly import scholarly
16
  from scholarly import ProxyGenerator
17
 
@@ -31,11 +41,14 @@ def remove_newlines(serie):
31
  def search_paper_abstract(title):
32
  pg = ProxyGenerator()
33
  success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
34
- scholarly.use_proxy(pg)
35
- # input the title of a paper, return its abstract
36
- search_query = scholarly.search_pubs(title)
37
- paper = next(search_query)
38
- return remove_newlines(paper['bib']['abstract'])
 
 
 
39
 
40
 
41
  def load_papers_from_bibtex(bib_file_path):
@@ -46,6 +59,7 @@ def load_papers_from_bibtex(bib_file_path):
46
  else:
47
  bib_papers = []
48
  for bibitem in bib_database.entries:
 
49
  paper_id = bibitem.get("ID")
50
  title = bibitem.get("title")
51
  if title is None:
@@ -68,7 +82,6 @@ def load_papers_from_bibtex(bib_file_path):
68
  bib_papers.append(result)
69
  return bib_papers
70
 
71
-
72
  ######################################################################################################################
73
  # Semantic Scholar (SS) API
74
  ######################################################################################################################
@@ -124,6 +137,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
124
  authors_str = " and ".join(authors)
125
  try:
126
  last_name = authors[0].split()[-1]
 
127
  except IndexError:
128
  last_name = "ma"
129
  # pattern = r'^\w+'
@@ -131,6 +145,9 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
131
  return authors_str, last_name
132
 
133
  def parse_search_results(search_results_ss):
 
 
 
134
  # turn the search result to a list of paper dictionary.
135
  papers_ss = []
136
  for raw_paper in search_results_ss:
@@ -140,16 +157,26 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
140
  authors_str, last_name = extract_author_info(raw_paper['authors'])
141
  year_str = str(raw_paper['year'])
142
  title = raw_paper['title']
 
143
  # some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
144
  journal = raw_paper['venue'].replace("&", "\\&")
145
  if not journal:
146
  journal = "arXiv preprint"
 
147
  paper_id = extract_paper_id(last_name, year_str, title).lower()
148
  link = externalIds2link(raw_paper['externalIds'])
 
149
  if tldr and raw_paper['tldr'] is not None:
150
  abstract = raw_paper['tldr']['text']
151
  else:
152
  abstract = remove_newlines(raw_paper['abstract'])
 
 
 
 
 
 
 
153
  result = {
154
  "paper_id": paper_id,
155
  "title": title,
@@ -157,134 +184,64 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
157
  "link": link,
158
  "authors": authors_str,
159
  "year": year_str,
160
- "journal": journal
 
161
  }
162
  papers_ss.append(result)
163
  return papers_ss
164
 
165
  raw_results = ss_search(keyword, limit=counts)
166
  if raw_results is not None:
167
- search_results = raw_results['data']
 
 
168
  else:
169
  search_results = []
170
  results = parse_search_results(search_results)
171
  return results
172
 
173
-
174
- ######################################################################################################################
175
- # ArXiv API
176
- ######################################################################################################################
177
- def _collect_papers_arxiv(keyword, counts=3, tldr=False):
178
- # Build the arXiv API query URL with the given keyword and other parameters
179
- def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"):
180
- base_url = "http://export.arxiv.org/api/query?"
181
- query = f"search_query=all:{keyword}&start=0&max_results={results_limit}"
182
- query += f"&sortBy={sort_by}&sortOrder={sort_order}"
183
- return base_url + query
184
-
185
- # Fetch search results from the arXiv API using the constructed URL
186
- def fetch_search_results(query_url):
187
- response = requests.get(query_url)
188
- return response.text
189
-
190
- # Parse the XML content of the API response to extract paper information
191
- def parse_results(content):
192
- from xml.etree import ElementTree as ET
193
-
194
- root = ET.fromstring(content)
195
- namespace = "{http://www.w3.org/2005/Atom}"
196
- entries = root.findall(f"{namespace}entry")
197
-
198
- results = []
199
- for entry in entries:
200
- title = entry.find(f"{namespace}title").text
201
- link = entry.find(f"{namespace}id").text
202
- summary = entry.find(f"{namespace}summary").text
203
- summary = remove_newlines(summary)
204
-
205
- # Extract the authors
206
- authors = entry.findall(f"{namespace}author")
207
- author_list = []
208
- for author in authors:
209
- name = author.find(f"{namespace}name").text
210
- author_list.append(name)
211
- authors_str = " and ".join(author_list)
212
-
213
- # Extract the year
214
- published = entry.find(f"{namespace}published").text
215
- year = published.split("-")[0]
216
-
217
- founds = re.search(r'\d+\.\d+', link)
218
- if founds is None:
219
- # some links are not standard; such as "https://arxiv.org/abs/cs/0603127v1".
220
- # will be solved in the future.
221
- continue
222
- else:
223
- arxiv_id = founds.group(0)
224
- journal = f"arXiv preprint arXiv:{arxiv_id}"
225
- result = {
226
- "paper_id": arxiv_id,
227
- "title": title,
228
- "link": link,
229
- "abstract": summary,
230
- "authors": authors_str,
231
- "year": year,
232
- "journal": journal
233
- }
234
- results.append(result)
235
-
236
- return results
237
-
238
- query_url = build_query_url(keyword, counts)
239
- content = fetch_search_results(query_url)
240
- results = parse_results(content)
241
- return results
242
-
243
-
244
  ######################################################################################################################
245
  # References Class
246
  ######################################################################################################################
247
 
248
  class References:
249
- def __init__(self, load_papers=""):
250
- if load_papers:
251
- # todo: (1) too large bibtex may make have issues on token limitations; may truncate to 5 or 10
252
- # (2) google scholar didn't give a full abstract for some papers ...
253
- # (3) may use langchain to support long input
254
- self.papers = load_papers_from_bibtex(load_papers)
255
  else:
256
- self.papers = []
 
 
 
 
257
 
258
- def collect_papers(self, keywords_dict, method="arxiv", tldr=False):
 
 
 
 
 
 
259
  """
260
  keywords_dict:
261
  {"machine learning": 5, "language model": 2};
262
  the first is the keyword, the second is how many references are needed.
263
  """
264
- match method:
265
- case "arxiv":
266
- process = _collect_papers_arxiv
267
- case "ss":
268
- process = _collect_papers_ss
269
- case _:
270
- raise NotImplementedError("Other sources have not been not supported yet.")
271
  for key, counts in keywords_dict.items():
272
- self.papers = self.papers + process(key, counts, tldr)
273
 
274
- seen = set()
275
- papers = []
276
- for paper in self.papers:
277
- paper_id = paper["paper_id"]
278
- if paper_id not in seen:
279
- seen.add(paper_id)
280
- papers.append(paper)
281
- self.papers = papers
282
-
283
- def to_bibtex(self, path_to_bibtex="ref.bib"):
284
  """
285
  Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
286
  """
287
- papers = self.papers
 
 
 
 
 
288
 
289
  # clear the bibtex file
290
  with open(path_to_bibtex, "w", encoding="utf-8") as file:
@@ -292,7 +249,12 @@ class References:
292
 
293
  bibtex_entries = []
294
  paper_ids = []
 
295
  for paper in papers:
 
 
 
 
296
  bibtex_entry = f"""@article{{{paper["paper_id"]},
297
  title = {{{paper["title"]}}},
298
  author = {{{paper["authors"]}}},
@@ -308,31 +270,69 @@ class References:
308
  file.write("\n\n")
309
  return paper_ids
310
 
311
- def to_prompts(self):
 
 
 
 
 
 
 
 
 
312
  # `prompts`:
313
  # {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
314
  # this will be used to instruct GPT model to cite the correct bibtex entry.
 
315
  prompts = {}
316
- for paper in self.papers:
317
  prompts[paper["paper_id"]] = paper["abstract"]
318
  return prompts
319
 
 
 
 
 
 
 
 
 
320
 
321
  if __name__ == "__main__":
322
- # refs = References()
 
 
 
 
 
 
 
323
  # keywords_dict = {
324
- # "Deep Q-Networks": 15,
325
- # "Policy Gradient Methods": 24,
326
  # "Actor-Critic Algorithms": 4,
327
- # "Model-Based Reinforcement Learning": 13,
328
- # "Exploration-Exploitation Trade-off": 7
329
  # }
330
- # refs.collect_papers(keywords_dict, method="ss", tldr=True)
331
- # for p in refs.papers:
332
- # print(p["paper_id"])
333
- # print(len(refs.papers))
334
-
335
- bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
336
- papers = load_papers_from_bibtex(bib)
337
- for paper in papers:
338
- print(paper)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Each `paper` is a dictionary containing:
2
+ # (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings
3
  #
4
  # Generate references:
5
  # `Reference` class:
6
  # 1. Read a given .bib file to collect papers; use `search_paper_abstract` method to fill missing abstract.
7
+ # 2. Given some keywords; use Semantic Scholar API to find papers.
8
  # 3. Generate bibtex from the selected papers. --> to_bibtex()
9
  # 4. Generate prompts from the selected papers: --> to_prompts()
10
  # A sample prompt: {"paper_id": "paper summary"}
11
 
12
+ # todo: (1) citations & citedby of provided papers:
13
+ # load the pre-defined papers; use S2 to find all related works
14
+ # add all citations to `bib_papers`
15
+ # add all citedby to `bib_papers`
16
+ # use Semantic Scholar to find their embeddings
17
+ # (2) separate references:
18
+ # divide references into different groups to reduce the tokens count
19
+ # for generating different paragraph of related works, use different set of references
20
+
21
  import requests
22
  import re
23
  import bibtexparser
24
+ import random
25
  from scholarly import scholarly
26
  from scholarly import ProxyGenerator
27
 
 
41
  def search_paper_abstract(title):
42
  pg = ProxyGenerator()
43
  success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
44
+ if success:
45
+ scholarly.use_proxy(pg)
46
+ # input the title of a paper, return its abstract
47
+ search_query = scholarly.search_pubs(title)
48
+ found_paper = next(search_query)
49
+ else:
50
+ raise RuntimeError("ScraperAPI fails.")
51
+ return remove_newlines(found_paper['bib']['abstract'])
52
 
53
 
54
  def load_papers_from_bibtex(bib_file_path):
 
59
  else:
60
  bib_papers = []
61
  for bibitem in bib_database.entries:
62
+ # Add each paper to `bib_papers`
63
  paper_id = bibitem.get("ID")
64
  title = bibitem.get("title")
65
  if title is None:
 
82
  bib_papers.append(result)
83
  return bib_papers
84
 
 
85
  ######################################################################################################################
86
  # Semantic Scholar (SS) API
87
  ######################################################################################################################
 
137
  authors_str = " and ".join(authors)
138
  try:
139
  last_name = authors[0].split()[-1]
140
+ last_name = last_name.replace("'", "")
141
  except IndexError:
142
  last_name = "ma"
143
  # pattern = r'^\w+'
 
145
  return authors_str, last_name
146
 
147
  def parse_search_results(search_results_ss):
148
+ if len(search_results_ss) == 0:
149
+ return []
150
+
151
  # turn the search result to a list of paper dictionary.
152
  papers_ss = []
153
  for raw_paper in search_results_ss:
 
157
  authors_str, last_name = extract_author_info(raw_paper['authors'])
158
  year_str = str(raw_paper['year'])
159
  title = raw_paper['title']
160
+
161
  # some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
162
  journal = raw_paper['venue'].replace("&", "\\&")
163
  if not journal:
164
  journal = "arXiv preprint"
165
+
166
  paper_id = extract_paper_id(last_name, year_str, title).lower()
167
  link = externalIds2link(raw_paper['externalIds'])
168
+
169
  if tldr and raw_paper['tldr'] is not None:
170
  abstract = raw_paper['tldr']['text']
171
  else:
172
  abstract = remove_newlines(raw_paper['abstract'])
173
+
174
+ # some papers have no embeddings; handle this case
175
+ embeddings_dict = raw_paper.get('embedding')
176
+ if embeddings_dict is None:
177
+ continue
178
+ else:
179
+ embeddings = raw_paper['embedding']['vector']
180
  result = {
181
  "paper_id": paper_id,
182
  "title": title,
 
184
  "link": link,
185
  "authors": authors_str,
186
  "year": year_str,
187
+ "journal": journal,
188
+ "embeddings": embeddings
189
  }
190
  papers_ss.append(result)
191
  return papers_ss
192
 
193
  raw_results = ss_search(keyword, limit=counts)
194
  if raw_results is not None:
195
+ search_results = raw_results.get("data")
196
+ if search_results is None:
197
+ search_results = []
198
  else:
199
  search_results = []
200
  results = parse_search_results(search_results)
201
  return results
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  ######################################################################################################################
204
  # References Class
205
  ######################################################################################################################
206
 
207
  class References:
208
+ def __init__(self, title, load_papers):
209
+ if load_papers is not None:
210
+ self.papers = {}
211
+ self.papers["customized_refs"] = load_papers_from_bibtex(load_papers)
 
 
212
  else:
213
+ self.papers = {}
214
+ self.title = title
215
+
216
+ def load_papers(self, bibtex, keyword):
217
+ self.papers[keyword] = load_papers_from_bibtex(bibtex)
218
 
219
+ def generate_keywords_dict(self):
220
+ keywords_dict = {}
221
+ for k in self.papers:
222
+ keywords_dict[k] = len(self.papers[k])
223
+ return keywords_dict
224
+
225
+ def collect_papers(self, keywords_dict, tldr=False):
226
  """
227
  keywords_dict:
228
  {"machine learning": 5, "language model": 2};
229
  the first is the keyword, the second is how many references are needed.
230
  """
 
 
 
 
 
 
 
231
  for key, counts in keywords_dict.items():
232
+ self.papers[key] = _collect_papers_ss(key, counts, tldr)
233
 
234
+
235
+ def to_bibtex(self, path_to_bibtex="ref.bib", max_num_refs=50):
 
 
 
 
 
 
 
 
236
  """
237
  Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
238
  """
239
+ # todo:
240
+ # use embeddings to evaluate; keep top k relevant references in papers
241
+ # send (title, .bib file) to evaluate embeddings; recieve truncated papers
242
+ papers = self._get_papers(keyword = "_all")
243
+ random.shuffle(papers)
244
+ papers = papers[:max_num_refs]
245
 
246
  # clear the bibtex file
247
  with open(path_to_bibtex, "w", encoding="utf-8") as file:
 
249
 
250
  bibtex_entries = []
251
  paper_ids = []
252
+ seen = set()
253
  for paper in papers:
254
+ if paper["paper_id"] in seen:
255
+ continue
256
+ else:
257
+ seen.add(paper["paper_id"])
258
  bibtex_entry = f"""@article{{{paper["paper_id"]},
259
  title = {{{paper["title"]}}},
260
  author = {{{paper["authors"]}}},
 
270
  file.write("\n\n")
271
  return paper_ids
272
 
273
+ def _get_papers(self, keyword = "_all"):
274
+ if keyword == "_all":
275
+ papers = []
276
+ for k, v in self.papers.items():
277
+ papers = papers + v
278
+ else:
279
+ papers = self.papers["keyword"]
280
+ return papers
281
+
282
+ def to_prompts(self, keyword = "_all"):
283
  # `prompts`:
284
  # {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
285
  # this will be used to instruct GPT model to cite the correct bibtex entry.
286
+ papers = self._get_papers(keyword)
287
  prompts = {}
288
+ for paper in papers:
289
  prompts[paper["paper_id"]] = paper["abstract"]
290
  return prompts
291
 
292
+ def to_json(self, keyword = "_all"):
293
+ papers = self._get_papers(keyword)
294
+ papers_json = {}
295
+ for paper in papers:
296
+ papers_json[paper["paper_id"]] = paper
297
+ return papers_json
298
+
299
+
300
 
301
  if __name__ == "__main__":
302
+ # testing search results
303
+ r = ss_search("Deep Q-Networks", limit=1) # a list of raw papers
304
+ if r['total'] > 0:
305
+ paper = r['data'][0]
306
+ # print(paper)
307
+
308
+ # resting References
309
+ refs = References()
310
  # keywords_dict = {
311
+ # "Deep Q-Networks": 5,
 
312
  # "Actor-Critic Algorithms": 4,
313
+ # "Exploration-Exploitation Trade-off": 3
 
314
  # }
315
+ # refs.collect_papers(keywords_dict, tldr=True)
316
+ # for k in refs.papers:
317
+ # papers = refs.papers[k] # for each keyword, there is a list of papers
318
+ # print("keyword: ", k)
319
+ # for paper in papers:
320
+ # print(paper["paper_id"])
321
+ #
322
+ # refs.to_bibtex()
323
+ # papers_json = refs.to_json() # this json can be used to find the most relevant papers
324
+ # with open("papers.json", "w", encoding='utf-8') as text_file:
325
+ # text_file.write(f"{papers_json}")
326
+ #
327
+ # prompts = refs.to_prompts()
328
+ # print(prompts)
329
+
330
+ bib = "test.bib"
331
+ refs.load_papers(bib, "variance-reduction rl")
332
+ print(refs.papers)
333
+
334
+ prompts = refs.to_prompts()
335
+ for k in prompts:
336
+ print(f"{k}: {prompts[k]}\n")
337
+ # for paper in papers:
338
+ # print(paper)
utils/tex_processing.py CHANGED
@@ -19,10 +19,11 @@ def replace_title(save_to_path, title):
19
 
20
  # check if citations are in bibtex.
21
 
22
-
23
  # replace citations
24
 
25
  # sometimes the output may include thebibliography and bibitem . remove all of it.
26
 
 
 
27
 
28
 
 
19
 
20
  # check if citations are in bibtex.
21
 
 
22
  # replace citations
23
 
24
  # sometimes the output may include thebibliography and bibitem . remove all of it.
25
 
26
+ # return all .png and replace it using placeholder.
27
+
28
 
29