Spaces:
Runtime error
Runtime error
sevdeawesome
committed on
Commit
•
6dc66f9
1
Parent(s):
cddd748
add file
Browse files- .ipynb_checkpoints/app-checkpoint.py +159 -0
- .ipynb_checkpoints/ingest-checkpoint.py +208 -0
- app.deprocated +133 -0
- app.py +152 -0
- config.py +24 -0
- ingest.py +208 -0
- safety_docs/.ipynb_checkpoints/1_Priority_Objections-checkpoint.md +16 -0
- safety_docs/.ipynb_checkpoints/2_Technical_Objections-checkpoint.md +39 -0
- safety_docs/.ipynb_checkpoints/2_Technical_Objections.md-checkpoint.disabled +21 -0
- safety_docs/.ipynb_checkpoints/3_AI_Safety_Objections-checkpoint.md +4 -0
- safety_docs/.ipynb_checkpoints/4_Ethical_Objections-checkpoint.md +5 -0
- safety_docs/.ipynb_checkpoints/5_biased_objections-checkpoint.md +17 -0
- safety_docs/.ipynb_checkpoints/6_Miscellaneous_Objections-checkpoint.md +4 -0
- safety_docs/1_Priority_Objections.md +16 -0
- safety_docs/2_Technical_Objections.md +40 -0
- safety_docs/3_AI_Safety_Objections.md +6 -0
- safety_docs/4_Ethical_Objections.md +7 -0
- safety_docs/5_biased_objections.md +18 -0
- safety_docs/6_Miscellaneous_Objections.md +7 -0
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
|
3 |
+
CONFIG AND IMPORTS
|
4 |
+
|
5 |
+
'''
|
6 |
+
from config import default_config
|
7 |
+
|
8 |
+
from types import SimpleNamespace
|
9 |
+
import gradio as gr
|
10 |
+
import os, random
|
11 |
+
from pathlib import Path
|
12 |
+
import tiktoken
|
13 |
+
from getpass import getpass
|
14 |
+
from rich.markdown import Markdown
|
15 |
+
|
16 |
+
import openai
|
17 |
+
import wandb
|
18 |
+
from pprint import pprint
|
19 |
+
from wandb.integration.openai import autolog
|
20 |
+
from langchain.text_splitter import MarkdownHeaderTextSplitter
|
21 |
+
import numpy as np
|
22 |
+
|
23 |
+
from langchain.embeddings import OpenAIEmbeddings
|
24 |
+
from langchain.vectorstores import Chroma
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
from tenacity import (
|
29 |
+
retry,
|
30 |
+
stop_after_attempt,
|
31 |
+
wait_random_exponential, # for exponential backoff
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# Make sure an OpenAI API key is available before any API call is attempted.
# When the environment does not provide one, ask for it interactively.
if os.getenv("OPENAI_API_KEY") is None:
    if any("VSCODE" in name for name in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")

# Hand the key to the openai module explicitly, whichever way it was obtained.
openai.api_key = os.getenv("OPENAI_API_KEY", "")

# Validate with an explicit raise: a plain `assert` is removed when Python runs
# with -O, which would let a malformed key through. AssertionError is kept so the
# failure type is the same as before.
if not openai.api_key.startswith("sk-"):
    raise AssertionError("This doesn't look like a valid OpenAI API key")
print("OpenAI API key configured")
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
def find_nearest_neighbor(argument="", max_args_in_output=3):
    '''
    Return the headings of the safety-doc sections most similar to an argument.

    Every ``.md`` file in ``../../safety_docs`` is read, the combined text is
    split on markdown headers, the chunks and the argument are embedded with
    OpenAIEmbeddings, and the chunks are ranked by cosine similarity.

    INPUT:
    argument (string): text to compare against the document chunks
    max_args_in_output (int): number of best-matching chunks to report

    RETURN one heading per line, best match first, as a single string
    ("" when the directory yields no chunks)
    '''

    print(argument)
    directory_path = "../../safety_docs"

    file_texts = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".md"):
            with open(os.path.join(directory_path, filename), 'r', encoding="utf-8") as file:
                file_texts.append(file.read())

    # Join with a newline: without it, a file that lacks a trailing newline
    # would have the next file's first heading glued onto its last line, and
    # the header splitter would no longer see that heading.
    markdown_document = "\n".join(file_texts)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_document)
    if not md_header_splits:
        # Nothing to rank; the similarity maths below cannot handle an empty matrix.
        return ""

    embeddings = OpenAIEmbeddings()
    # Embed all chunks through one batched call instead of one call per chunk.
    embedding_matrix = np.array(embeddings.embed_documents([chunk.page_content for chunk in md_header_splits]))
    argument_embedding = np.array(embeddings.embed_query(argument))

    dot_products = np.dot(embedding_matrix, argument_embedding)
    norms = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(argument_embedding)
    cosine_similarities = dot_products / norms

    # argsort is ascending: take the tail and reverse it so the best match is first.
    nearest_indices = np.argsort(cosine_similarities)[-max_args_in_output:][::-1]

    lines = []
    for index in nearest_indices:
        metadata = md_header_splits[index].metadata
        # A chunk only carries the header levels that precede it, so "Header 1"
        # may be absent; fall back to the deeper levels instead of raising KeyError.
        heading = metadata.get("Header 1") or metadata.get("Header 2") or metadata.get("Header 3")
        if heading:
            lines.append(heading + "\n")

    return "".join(lines)
|
96 |
+
|
97 |
+
def get_gpt_response(argument, user_prompt, system_prompt=default_config.system_prompt, model=default_config.model_name, n=1, max_tokens=200):
    '''
    Send a system + user message pair to the OpenAI chat API and return the reply text.

    INPUT:
    argument: not used by this function; kept so existing callers keep working
    user_prompt: content of the "user" message
    system_prompt: content of the "system" message
    model: model name passed to the API
    n: number of completions requested
    max_tokens: completion length limit passed to the API

    RETURN the message content of the last choice in the response
    (the only choice when n=1)

    RAISES RuntimeError when the response contains no choices
    '''

    # NOTE(review): stop_after_attempt(1) stops after the first attempt, so this
    # wrapper currently never retries - confirm whether that is intended.
    @retry(wait=wait_random_exponential(min=1, max=3), stop=stop_after_attempt(1))
    def completion_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    responses = completion_with_backoff(
        model=model,
        messages=messages,
        n=n,
        max_tokens=max_tokens,
    )

    choices = responses.choices
    if not choices:
        # Previously this case surfaced as an UnboundLocalError on `generation`.
        raise RuntimeError("OpenAI response contained no choices")
    return choices[-1].message.content
|
123 |
+
|
124 |
+
|
125 |
+
def greet(argument):
    '''
    Placeholder click handler for the two-output demo UI.

    The nearest-neighbour lookup and the prompt construction are still run,
    but their results are not used yet: the function always returns the
    fixed pair ("english", "german").
    '''
    find_nearest_neighbor(argument)
    "".join((default_config.user_prompt_1, argument, default_config.user_prompt_2))
    # Pending wiring (kept from the earlier version):
    #   response = get_gpt_response(argument, user_prompt)
    #   return "Hello " + "\n We think your argument matches common arguments in our database, is it one of these?:\n " + nearest_neighbor + "\n\n\n ------------------------- \n\n\n Lengthy response: \n" + response
    placeholder_outputs = ("english", "german")
    return placeholder_outputs
|
132 |
+
|
133 |
+
# demo = gr.Interface(
|
134 |
+
# fn=greet,
|
135 |
+
# inputs=gr.Textbox(lines=2, placeholder="Anything past 200 tokens (roughly 200 words) will be cutoff. Please enter <=1 paragraph"),
|
136 |
+
# outputs="text"
|
137 |
+
# )
|
138 |
+
|
139 |
+
# # demo.queue(max_size=20)
|
140 |
+
# demo.launch()
|
141 |
+
|
142 |
+
|
143 |
+
# Demo page: an input box and one output in the first column, a second output
# in the other column, plus a button that feeds the input through greet().
example_arguments = [
    "AGI is far away",
    "AI is confined to a computer and cannot interact with the physical world",
    "AI isn't concious",
]

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            seed = gr.Text(label="Input Phrase")
            english = gr.Text(label="Generated English Text")

        with gr.Column():
            german = gr.Text(label="Generated German Text", lines=4)
    btn = gr.Button("Generate")
    # greet returns two values, one per output component.
    btn.click(greet, inputs=[seed], outputs=[english, german])
    gr.Examples(example_arguments, inputs=[seed])

demo.launch()
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
+
|
.ipynb_checkpoints/ingest-checkpoint.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Ingest a directory of documentation files into a vector store and store the relevant artifacts in Weights & Biases"""
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import pathlib
|
7 |
+
from typing import List, Tuple
|
8 |
+
|
9 |
+
import langchain
|
10 |
+
import wandb
|
11 |
+
from langchain.cache import SQLiteCache
|
12 |
+
from langchain.docstore.document import Document
|
13 |
+
from langchain.document_loaders import UnstructuredMarkdownLoader
|
14 |
+
from langchain.embeddings import OpenAIEmbeddings
|
15 |
+
from langchain.text_splitter import MarkdownTextSplitter
|
16 |
+
from langchain.vectorstores import Chroma
|
17 |
+
|
18 |
+
langchain.llm_cache = SQLiteCache(database_path="langchain.db")
|
19 |
+
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
def load_documents(data_dir: str) -> List[Document]:
    """Load one document per markdown file found directly inside a directory.

    Args:
        data_dir (str): Directory searched for ``*.md`` files.

    Returns:
        List[Document]: For each file, the first document returned by
            ``UnstructuredMarkdownLoader``.
    """
    md_paths = [str(path) for path in pathlib.Path(data_dir).glob("*.md")]
    loaded = []
    for md_path in md_paths:
        loader = UnstructuredMarkdownLoader(file_path=md_path)
        loaded.append(loader.load()[0])
    return loaded
|
38 |
+
|
39 |
+
|
40 |
+
def chunk_documents(
    documents: List[Document], chunk_size: int = 500, chunk_overlap=0
) -> List[Document]:
    """Break documents into smaller pieces with ``MarkdownTextSplitter``.

    Args:
        documents (List[Document]): Documents to split.
        chunk_size (int, optional): ``chunk_size`` handed to the splitter. Defaults to 500.
        chunk_overlap (int, optional): ``chunk_overlap`` handed to the splitter. Defaults to 0.

    Returns:
        List[Document]: The chunks produced by the splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = MarkdownTextSplitter(**splitter_options)
    return splitter.split_documents(documents)
|
58 |
+
|
59 |
+
|
60 |
+
def create_vector_store(
    documents,
    vector_store_path: str = "./vector_store",
) -> Chroma:
    """Embed documents into a Chroma vector store and persist it.

    Args:
        documents: Documents to add to the vector store.
        vector_store_path (str, optional): Directory passed to Chroma as
            ``persist_directory``. Defaults to "./vector_store".

    Returns:
        Chroma: The vector store, after ``persist()`` has been called on it.
    """
    # The key may be absent from the environment; None is passed through in that case.
    embedder = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))
    store = Chroma.from_documents(
        documents=documents,
        embedding=embedder,
        persist_directory=vector_store_path,
    )
    store.persist()
    return store
|
82 |
+
|
83 |
+
|
84 |
+
def log_dataset(documents: List[Document], run: "wandb.run"):
    """Upload documents to wandb as a "dataset" artifact.

    Each document is serialised with ``document.json()`` and written on its
    own line of ``documents.json`` inside the artifact.

    Args:
        documents (List[Document]): Documents to store in the artifact.
        run (wandb.run): Run that the artifact is logged to.
    """
    artifact = wandb.Artifact(name="documentation_dataset", type="dataset")
    with artifact.new_file("documents.json") as handle:
        handle.writelines(doc.json() + "\n" for doc in documents)

    run.log_artifact(artifact)
|
97 |
+
|
98 |
+
|
99 |
+
def log_index(vector_store_dir: str, run: "wandb.run"):
    """Upload a vector-store directory to wandb as a "search_index" artifact.

    Args:
        vector_store_dir (str): Directory added to the artifact.
        run (wandb.run): Run that the artifact is logged to.
    """
    artifact = wandb.Artifact(name="vector_store", type="search_index")
    artifact.add_dir(vector_store_dir)
    run.log_artifact(artifact)
|
109 |
+
|
110 |
+
|
111 |
+
def log_prompt(prompt: dict, run: "wandb.run"):
    """Upload a prompt to wandb as a "prompt" artifact.

    The prompt is serialised with ``json.dumps`` and stored as ``prompt.json``
    inside the artifact.

    Args:
        prompt (dict): The prompt to store.
        run (wandb.run): Run that the artifact is logged to.
    """
    serialized = json.dumps(prompt)
    artifact = wandb.Artifact(name="chat_prompt", type="prompt")
    with artifact.new_file("prompt.json") as handle:
        handle.write(serialized)
    run.log_artifact(artifact)
|
122 |
+
|
123 |
+
|
124 |
+
def ingest_data(
    docs_dir: str,
    chunk_size: int,
    chunk_overlap: int,
    vector_store_path: str,
) -> Tuple[List[Document], Chroma]:
    """Load, chunk and index a directory of markdown files.

    Args:
        docs_dir (str): Directory passed to ``load_documents``.
        chunk_size (int): Chunk size passed to ``chunk_documents``.
        chunk_overlap (int): Chunk overlap passed to ``chunk_documents``.
        vector_store_path (str): Location passed to ``create_vector_store``.

    Returns:
        Tuple[List[Document], Chroma]: The chunked documents and the vector
            store built from them.
    """
    raw_documents = load_documents(docs_dir)
    chunks = chunk_documents(raw_documents, chunk_size, chunk_overlap)
    store = create_vector_store(chunks, vector_store_path)
    return chunks, store
|
147 |
+
|
148 |
+
|
149 |
+
def get_parser():
    """Build the command-line parser for the ingestion script.

    Returns:
        argparse.ArgumentParser: Parser with the options ``--docs_dir``
            (required), ``--chunk_size``, ``--chunk_overlap``,
            ``--vector_store``, ``--prompt_file`` and ``--wandb_project``.
    """
    # (flag, keyword arguments for add_argument), registered in this order.
    option_specs = (
        ("--docs_dir", dict(
            type=str,
            required=True,
            help="The directory containing the wandb documentation",
        )),
        ("--chunk_size", dict(
            type=int,
            default=500,
            help="The number of tokens to include in each document chunk",
        )),
        ("--chunk_overlap", dict(
            type=int,
            default=0,
            help="The number of tokens to overlap between document chunks",
        )),
        ("--vector_store", dict(
            type=str,
            default="./vector_store",
            help="The directory to save or load the Chroma db to/from",
        )),
        ("--prompt_file", dict(
            type=pathlib.Path,
            default="./chat_prompt.json",
            help="The path to the chat prompt to use",
        )),
        ("--wandb_project", dict(
            default="llmapps",
            type=str,
            help="The wandb project to use for storing artifacts",
        )),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
|
189 |
+
|
190 |
+
|
191 |
+
def main():
    """Run the ingestion pipeline from the command line.

    Parses the arguments, starts a wandb run, ingests the documents into a
    vector store, then logs the chunked documents, the vector-store directory
    and the chat prompt as wandb artifacts before finishing the run.
    """
    parser = get_parser()
    args = parser.parse_args()
    run = wandb.init(project=args.wandb_project, config=args)
    documents, vector_store = ingest_data(
        docs_dir=args.docs_dir,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        vector_store_path=args.vector_store,
    )
    log_dataset(documents, run)
    log_index(args.vector_store, run)
    # Read the prompt inside a `with` block so the file handle is closed
    # right away instead of being left for the garbage collector.
    with args.prompt_file.open("r") as prompt_file:
        prompt = json.load(prompt_file)
    log_prompt(prompt, run)
    run.finish()


if __name__ == "__main__":
    main()
|
app.deprocated
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
|
3 |
+
CONFIG AND IMPORTS
|
4 |
+
|
5 |
+
'''
|
6 |
+
from config import default_config
|
7 |
+
|
8 |
+
from types import SimpleNamespace
|
9 |
+
import gradio as gr
|
10 |
+
import os, random
|
11 |
+
from pathlib import Path
|
12 |
+
import tiktoken
|
13 |
+
from getpass import getpass
|
14 |
+
from rich.markdown import Markdown
|
15 |
+
|
16 |
+
import openai
|
17 |
+
import wandb
|
18 |
+
from pprint import pprint
|
19 |
+
from wandb.integration.openai import autolog
|
20 |
+
from langchain.text_splitter import MarkdownHeaderTextSplitter
|
21 |
+
|
22 |
+
|
23 |
+
from langchain.embeddings import OpenAIEmbeddings
|
24 |
+
from langchain.vectorstores import Chroma
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
from tenacity import (
|
29 |
+
retry,
|
30 |
+
stop_after_attempt,
|
31 |
+
wait_random_exponential, # for exponential backoff
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# Make sure an OpenAI API key is available before any API call is attempted.
# When the environment does not provide one, ask for it interactively.
if os.getenv("OPENAI_API_KEY") is None:
    if any("VSCODE" in name for name in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")

# Hand the key to the openai module explicitly, whichever way it was obtained.
openai.api_key = os.getenv("OPENAI_API_KEY", "")

# Validate with an explicit raise: a plain `assert` is removed when Python runs
# with -O, which would let a malformed key through. AssertionError is kept so the
# failure type is the same as before.
if not openai.api_key.startswith("sk-"):
    raise AssertionError("This doesn't look like a valid OpenAI API key")
print("OpenAI API key configured")
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
def find_nearest_neighbor(argument=""):
    '''
    Return the heading of the safety-doc section closest to an argument.

    Every ``.md`` file in ``../../safety_docs`` is read, the combined text is
    split on markdown headers, the chunks are indexed in a Chroma store built
    with OpenAIEmbeddings, and the single best match for the argument is
    retrieved.

    INPUT:
    argument (str): text to look up
    RETURN the heading recorded for the best-matching chunk
    ("" when there is no chunk or the chunk has no heading)
    '''

    print(argument)
    directory_path = "../../safety_docs"

    file_texts = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".md"):
            with open(os.path.join(directory_path, filename), 'r', encoding="utf-8") as file:
                file_texts.append(file.read())

    # Join with a newline: without it, a file that lacks a trailing newline
    # would have the next file's first heading glued onto its last line, and
    # the header splitter would no longer see that heading.
    markdown_document = "\n".join(file_texts)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(markdown_document)
    if not md_header_splits:
        # Nothing to index, so there is nothing to retrieve either.
        return ""

    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(md_header_splits, embeddings)

    # k=1: only the single nearest chunk is requested.
    retriever = db.as_retriever(search_kwargs=dict(k=1))

    docs = retriever.get_relevant_documents(argument)
    if not docs:
        return ""

    # A chunk only carries the header levels that precede it, so "Header 1"
    # may be absent; fall back to the deeper levels instead of raising KeyError.
    metadata = docs[0].metadata
    return metadata.get("Header 1") or metadata.get("Header 2") or metadata.get("Header 3") or ""
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
def get_gpt_response(argument, user_prompt, system_prompt=default_config.system_prompt, model=default_config.model_name, n=1, max_tokens=200):
    '''
    Send a system + user message pair to the OpenAI chat API and return the reply text.

    INPUT:
    argument: not used by this function; kept so existing callers keep working
    user_prompt: content of the "user" message
    system_prompt: content of the "system" message
    model: model name passed to the API
    n: number of completions requested
    max_tokens: completion length limit passed to the API

    RETURN the message content of the last choice in the response
    (the only choice when n=1)

    RAISES RuntimeError when the response contains no choices
    '''

    # Up to two attempts, with a randomised exponential wait between them.
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2))
    def completion_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    responses = completion_with_backoff(
        model=model,
        messages=messages,
        n=n,
        max_tokens=max_tokens,
    )

    choices = responses.choices
    if not choices:
        # Previously this case surfaced as an UnboundLocalError on `generation`.
        raise RuntimeError("OpenAI response contained no choices")
    return choices[-1].message.content
|
117 |
+
|
118 |
+
|
119 |
+
def greet(argument):
    '''
    Build the text shown for a submitted argument.

    Looks up the closest known argument, asks the chat model for a rebuttal,
    and returns both embedded in one message string.
    '''
    closest_heading = find_nearest_neighbor(argument)
    prompt = default_config.user_prompt_1 + argument + default_config.user_prompt_2
    rebuttal = get_gpt_response(argument, prompt)
    pieces = (
        "Hello ",
        argument,
        "\n nice argument, it actually is a common one: ",
        closest_heading,
        "\n gpt response: \n",
        rebuttal,
    )
    return "".join(pieces)
|
124 |
+
|
125 |
+
|
126 |
+
# Single text-in / text-out interface around greet(), served with a request
# queue limited to 20 entries.
argument_box = gr.Textbox(lines=2, placeholder="poob here")
demo = gr.Interface(fn=greet, inputs=argument_box, outputs="text")

demo.queue(max_size=20)
demo.launch()
|
app.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
|
3 |
+
CONFIG AND IMPORTS
|
4 |
+
|
5 |
+
'''
|
6 |
+
from config import default_config
|
7 |
+
|
8 |
+
from types import SimpleNamespace
|
9 |
+
import gradio as gr
|
10 |
+
import os, random
|
11 |
+
from pathlib import Path
|
12 |
+
import tiktoken
|
13 |
+
from getpass import getpass
|
14 |
+
from rich.markdown import Markdown
|
15 |
+
|
16 |
+
import openai
|
17 |
+
import wandb
|
18 |
+
from pprint import pprint
|
19 |
+
from wandb.integration.openai import autolog
|
20 |
+
from langchain.text_splitter import MarkdownHeaderTextSplitter
|
21 |
+
import numpy as np
|
22 |
+
|
23 |
+
from langchain.embeddings import OpenAIEmbeddings
|
24 |
+
from langchain.vectorstores import Chroma
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
from tenacity import (
|
29 |
+
retry,
|
30 |
+
stop_after_attempt,
|
31 |
+
wait_random_exponential, # for exponential backoff
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
# Make sure an OpenAI API key is available before any API call is attempted.
# When the environment does not provide one, ask for it interactively.
if os.getenv("OPENAI_API_KEY") is None:
    if any("VSCODE" in name for name in os.environ):
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")

# Hand the key to the openai module explicitly, whichever way it was obtained.
openai.api_key = os.getenv("OPENAI_API_KEY", "")

# Validate with an explicit raise: a plain `assert` is removed when Python runs
# with -O, which would let a malformed key through. AssertionError is kept so the
# failure type is the same as before.
if not openai.api_key.startswith("sk-"):
    raise AssertionError("This doesn't look like a valid OpenAI API key")
print("OpenAI API key configured")
|
45 |
+
|
46 |
+
# Shared embeddings client, created once at import time.
embeddings_model = OpenAIEmbeddings()

directory_path = "safety_docs"

# Read every markdown file in the docs directory. The names are sorted because
# os.listdir returns them in no particular order, and the order matters: header
# metadata is carried across the concatenated text by the splitter below.
file_texts = []
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith(".md"):
        with open(os.path.join(directory_path, filename), 'r', encoding="utf-8") as file:
            file_texts.append(file.read())

# Join with a newline: without it, a file that lacks a trailing newline would
# have the next file's first heading glued onto its last line, and the header
# splitter would no longer see that heading.
md = "\n".join(file_texts)

markdown_document = md

# Header levels the splitter cuts on, and the metadata key each one is stored under.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
|
67 |
+
|
68 |
+
_document_embedding_cache = {}


def _document_embedding_matrix():
    '''
    Return the embeddings of every chunk in md_header_splits as one array.

    The chunks are fixed once the module has loaded, so the embeddings are
    computed on first use (one batched call) and reused afterwards instead
    of being requested again for every query.
    '''
    if "matrix" not in _document_embedding_cache:
        chunk_texts = [chunk.page_content for chunk in md_header_splits]
        _document_embedding_cache["matrix"] = np.array(embeddings_model.embed_documents(chunk_texts))
    return _document_embedding_cache["matrix"]


def find_nearest_neighbor(argument="", max_args_in_output=3):
    '''
    Return the headings of the document chunks most similar to an argument.

    The argument is embedded with the module-level embeddings model and
    compared to every chunk of md_header_splits by cosine similarity.

    INPUT:
    argument (string): text to compare against the document chunks
    max_args_in_output (int): number of best-matching chunks to report

    RETURN one heading per line, best match first, as a single string
    ("" when there are no chunks)
    '''

    if not md_header_splits:
        # Nothing to rank; the similarity maths below cannot handle an empty matrix.
        return ""

    embedding_matrix = _document_embedding_matrix()
    argument_embedding = np.array(embeddings_model.embed_query(argument))

    dot_products = np.dot(embedding_matrix, argument_embedding)
    norms = np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(argument_embedding)
    cosine_similarities = dot_products / norms

    # argsort is ascending: take the tail and reverse it so the best match is first.
    nearest_indices = np.argsort(cosine_similarities)[-max_args_in_output:][::-1]

    lines = []
    for index in nearest_indices:
        metadata = md_header_splits[index].metadata
        # A chunk only carries the header levels that precede it, so "Header 1"
        # may be absent; fall back to the deeper levels instead of raising KeyError.
        heading = metadata.get("Header 1") or metadata.get("Header 2") or metadata.get("Header 3")
        if heading:
            lines.append(heading + "\n")

    return "".join(lines)
|
92 |
+
|
93 |
+
def get_gpt_response(user_prompt, system_prompt=default_config.system_prompt, model=default_config.model_name, n=1, max_tokens=200):
    '''
    Send a system + user message pair to the OpenAI chat API and return the reply text.

    INPUT:
    user_prompt: content of the "user" message
    system_prompt: content of the "system" message
    model: model name passed to the API
    n: number of completions requested
    max_tokens: completion length limit passed to the API

    RETURN the message content of the last choice in the response
    (the only choice when n=1)

    RAISES RuntimeError when the response contains no choices
    '''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        n=n,
        max_tokens=max_tokens,
    )

    choices = response.choices
    if not choices:
        # Previously this case surfaced as an UnboundLocalError on `generation`.
        raise RuntimeError("OpenAI response contained no choices")
    return choices[-1].message.content
|
117 |
+
|
118 |
+
|
119 |
+
# return the gpt generated response
|
120 |
+
def greet1(argument):
    '''
    Wrap the argument in the configured prompt text and return the chat
    model's reply to it.
    '''
    prompt = default_config.user_prompt_1 + argument + default_config.user_prompt_2
    return get_gpt_response(user_prompt=prompt)
|
124 |
+
|
125 |
+
# return the nearest neighbor arguments
|
126 |
+
def greet2(argument):
    '''
    Return a short message listing the known arguments that are closest to
    the submitted one.
    '''
    intro = "Your argument may fall under the common arguments against AI safety. \n Is it one of these? \n "
    outro = "\n See the taxonomy of arguments below"
    return intro + find_nearest_neighbor(argument) + outro
|
129 |
+
|
130 |
+
|
131 |
+
# Main page: the user's argument and the matched known arguments in the first
# column, the model's rebuttal in the second, all driven by one button.
example_arguments = [
    "AGI is far away",
    "AI is confined to a computer and cannot interact with the physical world",
    "AI isn't concious",
    "If we don't develop AGI, China will!",
    "If we don't develop AGI, the Americans will!",
]

with gr.Blocks(gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column():
            seed = gr.Text(label="Explanation / argument for how ASI development will go well")
            english = gr.Text(label="Predicted Argument")

        with gr.Column():
            german = gr.Text(label="AI Generated Response")
    btn = gr.Button("Generate")
    # Two handlers on the same button: greet2 fills the matched-argument box,
    # greet1 fills the generated-response box.
    btn.click(greet2, inputs=[seed], outputs=english)
    btn.click(greet1, inputs=[seed], outputs=german)

    gr.Examples(example_arguments, inputs=[seed])

demo.queue()
demo.launch()
|
149 |
+
|
150 |
+
|
151 |
+
|
152 |
+
|
config.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from types import SimpleNamespace
|
2 |
+
|
3 |
+
# Identifiers exposed through default_config as entity / project / job_type.
TEAM = "ucrsf"  # None?
PROJECT = "safetybot"
JOB_TYPE = "production"

# Settings shared by the app. The final user prompt is built as
# user_prompt_1 + <argument> + user_prompt_2.
# NOTE(review): user_prompt_1 ends with a curly opening quote while
# user_prompt_2 starts with a straight quote, and the system prompt contains
# several misspellings - confirm before changing, since this text is sent to
# the model verbatim.
default_config = SimpleNamespace(
    project=PROJECT,
    entity=TEAM,
    job_type=JOB_TYPE,
    vector_store_artifact="darek/llmapps/vector_store:latest",
    chat_prompt_artifact="darek/llmapps/chat_prompt:latest",
    chat_temperature=0.3,
    max_fallback_retries=1,
    model_name="gpt-4",
    user_prompt_1="Your interlocutor is an AI risk skeptic. They argue: “",
    user_prompt_2=(
        '" Break the opponents argument into pieces and rebut the false claim. Think carefully and step by step.\n'
        'Your rebuttal:'
    ),
    system_prompt=(
        "You are Dr. Robert Yampolskiy arguing with an AI Risk skeptic or denier.\n"
        "You are versed in AI Safety, very knowledgably, friendly, smart, but also calm, polite and good at simplifying your thoughts.\n"
        "Limit your responses to roughly one paragraphs length, utilizing numbered lists or bullet points. If the ai risk skeptics argument is unrelated or doesnt make sense, just respond with 'i dont understand'"
    ),
    # eval_model="gpt-3.5-turbo",
    # eval_artifact="darek/llmapps/generated_examples:v0",
)
|
ingest.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Ingest a directory of documentation files into a vector store and store the relevant artifacts in Weights & Biases"""
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import pathlib
|
7 |
+
from typing import List, Tuple
|
8 |
+
|
9 |
+
import langchain
|
10 |
+
import wandb
|
11 |
+
from langchain.cache import SQLiteCache
|
12 |
+
from langchain.docstore.document import Document
|
13 |
+
from langchain.document_loaders import UnstructuredMarkdownLoader
|
14 |
+
from langchain.embeddings import OpenAIEmbeddings
|
15 |
+
from langchain.text_splitter import MarkdownTextSplitter
|
16 |
+
from langchain.vectorstores import Chroma
|
17 |
+
|
18 |
+
langchain.llm_cache = SQLiteCache(database_path="langchain.db")
|
19 |
+
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
def load_documents(data_dir: str) -> List[Document]:
    """Load one document per markdown file found directly inside a directory.

    Args:
        data_dir (str): Directory searched for ``*.md`` files.

    Returns:
        List[Document]: For each file, the first document returned by
            ``UnstructuredMarkdownLoader``.
    """
    md_paths = [str(path) for path in pathlib.Path(data_dir).glob("*.md")]
    loaded = []
    for md_path in md_paths:
        loader = UnstructuredMarkdownLoader(file_path=md_path)
        loaded.append(loader.load()[0])
    return loaded
|
38 |
+
|
39 |
+
|
40 |
+
def chunk_documents(
    documents: List[Document], chunk_size: int = 500, chunk_overlap=0
) -> List[Document]:
    """Break documents into smaller pieces with ``MarkdownTextSplitter``.

    Args:
        documents (List[Document]): Documents to split.
        chunk_size (int, optional): ``chunk_size`` handed to the splitter. Defaults to 500.
        chunk_overlap (int, optional): ``chunk_overlap`` handed to the splitter. Defaults to 0.

    Returns:
        List[Document]: The chunks produced by the splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = MarkdownTextSplitter(**splitter_options)
    return splitter.split_documents(documents)
|
58 |
+
|
59 |
+
|
60 |
+
def create_vector_store(
    documents,
    vector_store_path: str = "./vector_store",
) -> Chroma:
    """Embed documents into a Chroma vector store and persist it.

    Args:
        documents: Documents to add to the vector store.
        vector_store_path (str, optional): Directory passed to Chroma as
            ``persist_directory``. Defaults to "./vector_store".

    Returns:
        Chroma: The vector store, after ``persist()`` has been called on it.
    """
    # The key may be absent from the environment; None is passed through in that case.
    embedder = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))
    store = Chroma.from_documents(
        documents=documents,
        embedding=embedder,
        persist_directory=vector_store_path,
    )
    store.persist()
    return store
|
82 |
+
|
83 |
+
|
84 |
+
def log_dataset(documents: List[Document], run: "wandb.run"):
    """Upload the documents to wandb as a dataset artifact.

    Each document is serialised with its ``json()`` method and written as one
    line of ``documents.json`` inside the artifact.

    Args:
        documents (List[Document]): The documents to store in the artifact.
        run (wandb.run): The wandb run that receives the artifact.
    """
    artifact = wandb.Artifact(name="documentation_dataset", type="dataset")
    with artifact.new_file("documents.json") as handle:
        handle.writelines(doc.json() + "\n" for doc in documents)

    run.log_artifact(artifact)
|
97 |
+
|
98 |
+
|
99 |
+
def log_index(vector_store_dir: str, run: "wandb.run"):
    """Upload a vector store directory to wandb as a search-index artifact.

    Args:
        vector_store_dir (str): Directory holding the vector store files.
        run (wandb.run): The wandb run that receives the artifact.
    """
    artifact = wandb.Artifact(type="search_index", name="vector_store")
    # The whole directory is attached, so every file in it is uploaded.
    artifact.add_dir(vector_store_dir)
    run.log_artifact(artifact)
|
109 |
+
|
110 |
+
|
111 |
+
def log_prompt(prompt: dict, run: "wandb.run"):
    """Upload the chat prompt to wandb as a prompt artifact.

    Args:
        prompt (dict): The prompt to store; it is serialised as JSON into
            ``prompt.json`` inside the artifact.
        run (wandb.run): The wandb run that receives the artifact.
    """
    artifact = wandb.Artifact(name="chat_prompt", type="prompt")
    with artifact.new_file("prompt.json") as handle:
        json.dump(prompt, handle)
    run.log_artifact(artifact)
|
122 |
+
|
123 |
+
|
124 |
+
def ingest_data(
    docs_dir: str,
    chunk_size: int,
    chunk_overlap: int,
    vector_store_path: str,
) -> Tuple[List[Document], Chroma]:
    """Load, chunk and embed a directory of markdown files.

    Args:
        docs_dir (str): Directory that is searched for markdown files.
        chunk_size (int): Chunk size forwarded to ``chunk_documents``.
        chunk_overlap (int): Chunk overlap forwarded to ``chunk_documents``.
        vector_store_path (str): Directory the vector store is persisted to.

    Returns:
        Tuple[List[Document], Chroma]: The chunked documents and the vector
        store built from them.
    """
    # Pipeline: load -> chunk -> embed and persist.
    chunks = chunk_documents(load_documents(docs_dir), chunk_size, chunk_overlap)
    return chunks, create_vector_store(chunks, vector_store_path)
|
147 |
+
|
148 |
+
|
149 |
+
def get_parser():
    """Build the command-line parser for the ingestion script.

    Returns:
        argparse.ArgumentParser: A parser exposing ``--docs_dir`` (required),
        ``--chunk_size``, ``--chunk_overlap``, ``--vector_store``,
        ``--prompt_file`` and ``--wandb_project``.
    """
    # One (flag, keyword-arguments) entry per option, registered in order.
    option_specs = (
        (
            "--docs_dir",
            {
                "type": str,
                "required": True,
                "help": "The directory containing the wandb documentation",
            },
        ),
        (
            "--chunk_size",
            {
                "type": int,
                "default": 500,
                "help": "The number of tokens to include in each document chunk",
            },
        ),
        (
            "--chunk_overlap",
            {
                "type": int,
                "default": 0,
                "help": "The number of tokens to overlap between document chunks",
            },
        ),
        (
            "--vector_store",
            {
                "type": str,
                "default": "./vector_store",
                "help": "The directory to save or load the Chroma db to/from",
            },
        ),
        (
            "--prompt_file",
            {
                "type": pathlib.Path,
                "default": "./chat_prompt.json",
                "help": "The path to the chat prompt to use",
            },
        ),
        (
            "--wandb_project",
            {
                "default": "llmapps",
                "type": str,
                "help": "The wandb project to use for storing artifacts",
            },
        ),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
|
189 |
+
|
190 |
+
|
191 |
+
def main():
    """Run the ingestion pipeline and log its outputs to wandb.

    Parses the command-line arguments, starts a wandb run, ingests the
    markdown documents into a vector store, then logs the chunked documents,
    the vector store directory and the chat prompt as artifacts before
    finishing the run.
    """
    parser = get_parser()
    args = parser.parse_args()
    run = wandb.init(project=args.wandb_project, config=args)
    # Only the chunked documents are needed here; the store itself is logged
    # from its on-disk directory below.
    documents, _ = ingest_data(
        docs_dir=args.docs_dir,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        vector_store_path=args.vector_store,
    )
    log_dataset(documents, run)
    log_index(args.vector_store, run)
    # Read the prompt inside a context manager so the file handle is closed
    # instead of being left for the garbage collector.
    with args.prompt_file.open("r") as prompt_handle:
        prompt = json.load(prompt_handle)
    log_prompt(prompt, run)
    run.finish()
|
205 |
+
|
206 |
+
|
207 |
+
if __name__ == "__main__":
|
208 |
+
main()
|
safety_docs/.ipynb_checkpoints/1_Priority_Objections-checkpoint.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 1.1 AGI is Too Far so it isn't worth worrying about
|
2 |
+
Priority objection: AGI is Too Far so it isn't worth worrying about: A frequent argument against work on AI Safety is that we are hundreds if not thousands of years away from developing superintelligent machines and so even if they may present some danger it is a waste of human and computational resources to allocate any effort to address Superintelligence Risk at this point in time.
|
3 |
+
|
4 |
+
# 1.2 A Soft Takeoff is more likely and so we will have Time to Prepare
|
5 |
+
Priority objection: A Soft Takeoff is more likely and so we will have Time to Prepare: AI takeoff refers to the speed with which an AGI can get to superintelligent capabilities. While hard takeoff is likely and means that process will be very quick, some argue that we will face a soft takeoff and so will have adequate time (years) to prepare.
|
6 |
+
|
7 |
+
# 1.3 There is No Obvious Path to Get to AGI from Current AI
|
8 |
+
Priority objection: There is No Obvious Path to Get to AGI from Current AI: While we are making good progress on AI, it is not obvious how to get from our current state in AI to AGI and current methods may not scale.
|
9 |
+
|
10 |
+
# 1.4 Something Else is More Important than AI safety / alignment
|
11 |
+
Priority objection: Something Else is More Important than AI safety / alignment: Some have argued that global climate change, pandemics, social injustice, and a dozen of other more immediate concerns are more important than AI risk and should be prioritized over wasting money and human capital on something like AI Safety.
|
12 |
+
|
13 |
+
# 1.5 Short Term AI Concerns are more important than AI safety
|
14 |
+
Priority objection: Short Term AI Concerns are more important than AI safety: Similar to the argument that something else is more important, proponents claim that immediate issues with today’s AIs, such as algorithmic bias, technological unemployment or limited transparency should take precedence over concerns about future technology (AGI/superintelligence), which doesn’t yet exist and may not exist for decades
|
15 |
+
|
16 |
+
|
safety_docs/.ipynb_checkpoints/2_Technical_Objections-checkpoint.md
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 2.1 Technical Objection: AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
2 |
+
Technical Objection: The argument is that current developments in Machine Learning are not progress in AI, but are just developments in statistics, particularly in matrix multiplication and gradient
|
3 |
+
descent. AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
4 |
+
|
5 |
+
# 2.2 Technical Objection: Superintelligence is Impossible
|
6 |
+
Technical Objection: Superintelligence, agi/asi is impossible. If a person doesn’t think that superintelligence can ever be built they will of course view Risk from Superintelligence with strong skepticism. Most people in this camp assign a very small (but usually not zero) probability to the actual possibility of superintelligent AI coming into existence, but if even the tiniest probability is multiplied by the infinite value of the Universe the math seems to be against skepticism. Denial of the possibilities of long-term goals of AI.
|
7 |
+
|
8 |
+
|
9 |
+
# 2.3 Technical Objection: Self-Improvement is Impossible
|
10 |
+
Technical Objection: Self-Improvement is Impossible. Intelligence explosion is impossible. Argument based on impossibility of intelligence explosion, as a side-effect of recursive self-improvement. due to
|
11 |
+
fundamental computational limits and software complexity. AI systems cannot be smarter than humans
|
12 |
+
|
13 |
+
|
14 |
+
# 2.4 Technical Objection: AI Can’t be Conscious Proponents argue that in order to be dangerous AI has to be conscious
|
15 |
+
Technical Objection: AI cannot be conscious. AI needs consciousness or qualia to be considered dangerous.
|
16 |
+
|
17 |
+
# 2.5 Technical Objection: AI Can just be a Tool
|
18 |
+
Technical Objection: AI Can just be a tool. We do not need a general AI to be an independent agent. They can just be designed as assistants to humans in particular domains.
|
19 |
+
|
20 |
+
# 2.6 Technical Objection: We can Always just turn it off
|
21 |
+
Technical Objection: We can always just turn it off. we can just use an off switch or switch off the power grid. misbehaving ai can simply be turned off.
|
22 |
+
|
23 |
+
# 2.7 Technical Objection: We can reprogram ais if we don't like what they do
|
24 |
+
Technical Objection: We can reprogram ais if we don't like what they do. we can change ai's code.
|
25 |
+
|
26 |
+
# 2.8 Technical Objection: AI Doesnt have a body so it can't hurt us
|
27 |
+
Technical objection: AI Doesnt have a body and so cant hurt us. AI cannot interact in the physical world, it has no hands.
|
28 |
+
|
29 |
+
# 2.9 Technical Objection: If AI is as Capable as You Say, it Will not Make Dumb Mistakes
|
30 |
+
If AI is as Capable as You Say, it Will not Make Dumb Mistakes. How can superintelligence not understand what we really want? Any system worthy of the title "human level" must have the same common sense as we do.
|
31 |
+
|
32 |
+
# 2.10 Technical Objection: Superintelligence Would (Probably) Not Be Catastrophic
|
33 |
+
Superintelligence Would (Probably) Not Be Catastrophic. Intelligence is not dangerous by default. The behavior is correctable in time and is unlikely to be malevolent unless explicitly programmed to.
|
34 |
+
|
35 |
+
# 2.11 Technical Objection: Self-preservation and Control Drives Don't Just Appear They Have to be Programmed In
|
36 |
+
“the desire to control access to resources and to influence others are drives that have been built into us by evolution for our survival. There is no reason to build these drives into our AI system
|
37 |
+
|
38 |
+
# 2.13 Technical Objection: AI can't generate novel plans
|
39 |
+
The Analytical Engine has no pretensions whatever to originate anything. It can do whatever we know how to order it to perform. It can follow analysis; but it has no power of anticipating any analytical relations or truths
|
safety_docs/.ipynb_checkpoints/2_Technical_Objections.md-checkpoint.disabled
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 2.4 AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
2 |
+
The argument is that current developments in Machine Learning are not progress
|
3 |
+
in AI, but are just developments in statistics, particularly in matrix multiplication and gradient
|
4 |
+
descent. AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
5 |
+
|
6 |
+
# 2.2 Superintelligence is Impossible
|
7 |
+
Superintelligence, agi/asi is impossible. If a person doesn’t think that superintelligence can ever be built they will of course view Risk from Superintelligence with strong skepticism. Most people in this
|
8 |
+
camp assign a very small (but usually not zero) probability to the actual possibility of
|
9 |
+
superintelligent AI coming into existence, but if even a tiniest probability is multiplied by
|
10 |
+
the infinite value of the Universe the math seems to be against skepticism. Denial of the possibilies of long-term goals of AI.
|
11 |
+
|
12 |
+
|
13 |
+
# 2.3 Self-Improvement is Impossible
|
14 |
+
Self-Improvement is Impossible. Intelligence explosion is impossible. Argument based on impossibility of intelligence explosion, as a side-effect of recursive self-improvement. due to
|
15 |
+
fundamental computational limits and software complexity. AI systems cannot be smarter than humans
|
16 |
+
|
17 |
+
|
18 |
+
# 2.4 AI Can’t be Conscious Proponents argue that in order to be dangerous AI has to be conscious
|
19 |
+
AI cannot be conscious. AI needs consciousness or qualia to be considered dangerous.
|
20 |
+
|
21 |
+
# 2.5 AI Can just be a Tool
|
safety_docs/.ipynb_checkpoints/3_AI_Safety_Objections-checkpoint.md
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 3.1 AI Safety Objections: AI Safety Can’t be Done Today
|
2 |
+
AI Safety Objections: AI Safety Can’t be Done Today. AI Safety work is not possible in the absence of a superintelligent AI on which to run experiments.
|
3 |
+
# 3.2 AI Safety Objections AI Can’t be Safe
|
4 |
+
AI Safety Objections AI Can’t be Safe. Verification, containment, or control is impossible and as such are wasted effort.
|
safety_docs/.ipynb_checkpoints/4_Ethical_Objections-checkpoint.md
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 4.1 Ethical Objections: Superintelligence is Benevolence
|
2 |
+
Ethical Objections: Superintelligence is Benevolence. as humans became more advanced culturally and intellectually they also became nicer, less violent, and more inclusive
|
3 |
+
# 4.2 Ethical Objections: Let the Smarter Beings Win
|
4 |
+
Ethical Objections: Let the Smarter Beings Win. If humanity is replaced by a more advanced sentient being that will be a good thing
|
5 |
+
|
safety_docs/.ipynb_checkpoints/5_biased_objections-checkpoint.md
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 5.1 Biased Objections: AI Safety Researchers are Non-Coders
|
2 |
+
Biased Objections: AI Safety Researchers are Non-Coders. AI Safety researchers who do not write code are unqualified to judge ai risk or its correlates
|
3 |
+
|
4 |
+
# 5.2 Biased Objections: Majority of AI Researchers is not Worried
|
5 |
+
Most AI researchers are not worried about AI Existential risks
|
6 |
+
|
7 |
+
# 5.4 Biased Objections: Keep it Quiet
|
8 |
+
Biased Objections: Keep it Quiet. Bringing up concerns over AI risk may jeopardize AI research funding and bring on government regulation.
|
9 |
+
|
10 |
+
# 5.5 Biased Objections: Safety Work just Creates an Overhead Slowing Down Research
|
11 |
+
Safety Work just Creates an Overhead Slowing Down Research
|
12 |
+
|
13 |
+
# 5.6 Biased Objections: Heads in the Sand
|
14 |
+
Biased Objections: Heads in the Sand. The consequences of machines thinking would be too dreadful. Let us hope and believe that they cannot do so. It is easier and more pleasant to bury our heads in the sand. I dont want to think about ai safety.
|
15 |
+
|
16 |
+
# 5.7 Biased Objections: If we dont do it, China/Russia will
|
17 |
+
Biased Objections: If we dont do it, China/Russia will. That would be worse than us racing to do it.
|
safety_docs/.ipynb_checkpoints/6_Miscellaneous_Objections-checkpoint.md
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 6.1 Miscellaneous Objection: So Easy it will be Solved Automatically
|
2 |
+
AI risk problem is trivial and will be implicitly solved as a byproduct of doing regular AI research
|
3 |
+
# 6.2 Miscellaneous Objection: AI Regulation Will Prevent Problems
|
4 |
+
Miscellaneous Objection: AI Regulation Will Prevent Problems: we don’t need to worry about AI Safety because government regulation will intervene and prevent problems
|
safety_docs/1_Priority_Objections.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 1.1 Priority objection: AGI is Too Far so it isn't worth worrying about
|
2 |
+
Priority objection: AGI is Too Far so it isn't worth worrying about: A frequent argument against work on AI Safety is that we are hundreds if not thousands of years away from developing superintelligent machines and so even if they may present some danger it is a waste of human and computational resources to allocate any effort to address Superintelligence Risk at this point in time.
|
3 |
+
|
4 |
+
# 1.2 Priority objection: A Soft Takeoff is more likely and so we will have Time to Prepare
|
5 |
+
Priority objection: A Soft Takeoff is more likely and so we will have Time to Prepare: AI takeoff refers to the speed with which an AGI can get to superintelligent capabilities. While hard takeoff is likely and means that process will be very quick, some argue that we will face a soft takeoff and so will have adequate time (years) to prepare.
|
6 |
+
|
7 |
+
# 1.3 Priority objection: There is No Obvious Path to Get to AGI from Current AI
|
8 |
+
Priority objection: There is No Obvious Path to Get to AGI from Current AI: While we are making good progress on AI, it is not obvious how to get from our current state in AI to AGI and current methods may not scale.
|
9 |
+
|
10 |
+
# 1.4 Priority objection: Something Else is More Important than AI safety / alignment
|
11 |
+
Priority objection: Something Else is More Important than AI safety / alignment: Some have argued that global climate change, pandemics, social injustice, and a dozen of other more immediate concerns are more important than AI risk and should be prioritized over wasting money and human capital on something like AI Safety.
|
12 |
+
|
13 |
+
# 1.5 Priority objection: Short Term AI Concerns are more important than AI safety
|
14 |
+
Priority objection: Short Term AI Concerns are more important than AI safety: Similar to the argument that something else is more important, proponents claim that immediate issues with today’s AIs, such as algorithmic bias, technological unemployment or limited transparency should take precedence over concerns about future technology (AGI/superintelligence), which doesn’t yet exist and may not exist for decades.
|
15 |
+
|
16 |
+
|
safety_docs/2_Technical_Objections.md
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 2.1 Technical Objection: AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
2 |
+
Technical Objection: The argument is that current developments in Machine Learning are not progress in AI, but are just developments in statistics, particularly in matrix multiplication and gradient
|
3 |
+
descent. AI / AGI Doesn’t Exist, developments in AI are not necessarily progress towards AGI
|
4 |
+
|
5 |
+
# 2.2 Technical Objection: Superintelligence is Impossible
|
6 |
+
Technical Objection: Superintelligence, agi/asi is impossible. If a person doesn’t think that superintelligence can ever be built they will of course view Risk from Superintelligence with strong skepticism. Most people in this camp assign a very small (but usually not zero) probability to the actual possibility of superintelligent AI coming into existence, but if even the tiniest probability is multiplied by the infinite value of the Universe the math seems to be against skepticism. Denial of the possibilities of long-term goals of AI.
|
7 |
+
|
8 |
+
|
9 |
+
# 2.3 Technical Objection: Self-Improvement is Impossible
|
10 |
+
Technical Objection: Self-Improvement is Impossible. Intelligence explosion is impossible. Argument based on impossibility of intelligence explosion, as a side-effect of recursive self-improvement, due to
|
11 |
+
fundamental computational limits and software complexity. AI systems cannot be smarter than humans
|
12 |
+
|
13 |
+
|
14 |
+
# 2.4 Technical Objection: AI Can’t be Conscious Proponents argue that in order to be dangerous AI has to be conscious
|
15 |
+
Technical Objection: AI cannot be conscious. AI needs consciousness or qualia to be considered dangerous.
|
16 |
+
|
17 |
+
# 2.5 Technical Objection: AI Can just be a Tool
|
18 |
+
Technical Objection: AI Can just be a tool. We do not need a general AI to be an independent agent. They can just be designed as assistants to humans in particular domains.
|
19 |
+
|
20 |
+
# 2.6 Technical Objection: We can Always just turn it off
|
21 |
+
Technical Objection: We can always just turn it off. we can just use an off switch or switch off the power grid. misbehaving ai can simply be turned off.
|
22 |
+
|
23 |
+
# 2.7 Technical Objection: We can reprogram ais if we don't like what they do
|
24 |
+
Technical Objection: We can reprogram ais if we don't like what they do. we can change ai's code.
|
25 |
+
|
26 |
+
# 2.8 Technical Objection: AI Doesnt have a body so it can't hurt us
|
27 |
+
Technical objection: AI doesn't have a body and so can't hurt us. AI cannot interact with the physical world; it has no hands.
|
28 |
+
|
29 |
+
# 2.9 Technical Objection: If AI is as Capable as You Say, it Will not Make Dumb Mistakes
|
30 |
+
Technical objection: If AI is as Capable as You Say, it Will not Make Dumb Mistakes. How can superintelligence not understand what we really want? Any system worthy of the title "human level" must have the same common sense as we do.
|
31 |
+
|
32 |
+
# 2.10 Technical Objection: Superintelligence Would (Probably) Not Be Catastrophic
|
33 |
+
Technical objection: Superintelligence Would (Probably) Not Be Catastrophic. Intelligence is not dangerous by default. The behavior is correctable in time and is unlikely to be malevolent unless explicitly programmed to.
|
34 |
+
|
35 |
+
# 2.11 Technical Objection: Self-preservation and Control Drives Don't Just Appear They Have to be Programmed In
|
36 |
+
Technical objection: the desire to control access to resources and to influence others are drives that have been built into us by evolution for our survival. There is no reason to build these drives into our AI system
|
37 |
+
|
38 |
+
# 2.13 Technical Objection: AI can't generate novel plans
|
39 |
+
Technical objection: The Analytical Engine has no pretensions whatever to originate anything. It can do whatever we know how to order it to perform. It can follow analysis; but it has no power of anticipating any analytical relations or truths
|
40 |
+
|
safety_docs/3_AI_Safety_Objections.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 3.1 AI Safety Objections: AI Safety Can’t be Done Today
|
2 |
+
AI Safety Objections: AI Safety Can’t be Done Today. AI Safety work is not possible in the absence of a superintelligent AI on which to run experiments.
|
3 |
+
|
4 |
+
# 3.2 AI Safety Objections AI Can’t be Safe
|
5 |
+
AI Safety Objections AI Can’t be Safe. Verification, containment, or control is impossible and as such are wasted effort.
|
6 |
+
|
safety_docs/4_Ethical_Objections.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 4.1 Ethical Objections: Superintelligence is Benevolence
|
2 |
+
Ethical Objections: Superintelligence is Benevolence. as humans became more advanced culturally and intellectually they also became nicer, less violent, and more inclusive
|
3 |
+
|
4 |
+
# 4.2 Ethical Objections: Let the Smarter Beings Win
|
5 |
+
Ethical Objections: Let the Smarter Beings Win. If humanity is replaced by a more advanced sentient being that will be a good thing
|
6 |
+
|
7 |
+
|
safety_docs/5_biased_objections.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 5.1 Biased Objections: AI Safety Researchers are Non-Coders
|
2 |
+
Biased Objections: AI Safety Researchers are Non-Coders. AI Safety researchers who do not write code are unqualified to judge ai risk or its correlates
|
3 |
+
|
4 |
+
# 5.2 Biased Objections: Majority of AI Researchers is not Worried
|
5 |
+
Biased Objections: Most AI researchers are not worried about AI Existential risks
|
6 |
+
|
7 |
+
# 5.4 Biased Objections: Keep it Quiet
|
8 |
+
Biased Objections: Keep it Quiet. Bringing up concerns over AI risk may jeopardize AI research funding and bring on government regulation.
|
9 |
+
|
10 |
+
# 5.5 Biased Objections: Safety Work just Creates an Overhead Slowing Down Research
|
11 |
+
Biased Objections: Safety Work just Creates an Overhead Slowing Down Research
|
12 |
+
|
13 |
+
# 5.6 Biased Objections: Heads in the Sand
|
14 |
+
Biased Objections: Heads in the Sand. The consequences of machines thinking would be too dreadful. Let us hope and believe that they cannot do so. It is easier and more pleasant to bury our heads in the sand. I don't want to think about ai safety.
|
15 |
+
|
16 |
+
# 5.7 Biased Objections: If we dont do it, Someone else will
|
17 |
+
Biased Objections: If we don't do it, China/Russia/OpenAI/someone else will. That would be worse than us doing it.
|
18 |
+
|
safety_docs/6_Miscellaneous_Objections.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 6.1 Miscellaneous Objection: So Easy it will be Solved Automatically
|
2 |
+
Miscellaneous Objection: AI risk problem is trivial and will be implicitly solved as a byproduct of doing regular AI research
|
3 |
+
|
4 |
+
# 6.2 Miscellaneous Objection: AI Regulation Will Prevent Problems
|
5 |
+
Miscellaneous Objection: AI Regulation Will Prevent Problems: we don’t need to worry about AI Safety because government regulation will intervene and prevent problems
|
6 |
+
|
7 |
+
|