import json
import os

import openai
from openai.embeddings_utils import cosine_similarity

from commons.Configs import configs
from commons.File import file

class OpenAIClient:
    def __init__(self, debug=False):
        self.debug = debug
        openai.api_key = configs.OPENAI_KEY
        self.embeddingsModel = configs.embeddingsModel

    def buildPrompt(self, name, variables):
        """Load a prompt template and fill in its {placeholder} variables."""
        # used by prepareutils.Dataset
        promptFilePath = os.path.join(configs.promptsDir, f"{name}.prompt.txt")
        prompt = file.readFile(promptFilePath)
        for key, value in variables.items():
            prompt = prompt.replace(f"{{{key}}}", value)
        return prompt
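
    # Illustrative sketch (assumption, not part of the original source): with a
    # hypothetical template prompts/questions.prompt.txt containing
    #   "Generate questions about the following text: {sentence}"
    # the call
    #   buildPrompt("questions", {"sentence": "Bees pollinate flowers."})
    # would return
    #   "Generate questions about the following text: Bees pollinate flowers."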

    def generateSyntheticQuestions(self, prompt, debugSentence=""):
        """Use the OpenAI chat completion API to generate synthetic
        question/answer pairs for a sentence."""
        # used by prepareutils.Dataset
        # ----------------------------------------------
        # generate questions (responseText)
        # ----------------------------------------------
        response = openai.ChatCompletion.create(
            model=configs.chatCompletionModel,
            messages=[{"role": "user", "content": prompt}]
        )
        responseText = response['choices'][0]['message']['content']
        # ----------------------------------------------
        # split questions and answers
        # ----------------------------------------------
        # put every question/answer on a single line and drop everything
        # before the first "(Q)" marker (the response header)
        questionAnswers = responseText.replace("\n", "").split('(Q)', 1)[1]
        # one entry per question/answer pair
        questionAnswers = questionAnswers.split('(Q)')
        # split each entry into question and answer
        questionAnswers = [x.split('(A)', 1) for x in questionAnswers]
        # drop invalid rows and strip whitespace
        questionAnswers = [[x[0].strip(), x[1].strip()]
                           for x in questionAnswers if len(x) == 2]
        jsonData = [{"question": x[0], "answer": x[1]}
                    for x in questionAnswers]
        # ----------------------------------------------
        # debug
        if self.debug:
            print("Sentence: ", debugSentence)
            print("Response text: ", responseText)
            print("jsonData: ", json.dumps(jsonData, indent=4))
        # ----------------------------------------------
        return jsonData
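
    # Illustrative sketch (assumption, not part of the original source): the
    # parsing above expects the prompt to make the model mark questions with
    # "(Q)" and answers with "(A)", e.g.
    #   responseText = "Here you go: (Q) What is X? (A) X is a thing. (Q) Why Y? (A) Because Z."
    # which generateSyntheticQuestions turns into
    #   [{"question": "What is X?", "answer": "X is a thing."},
    #    {"question": "Why Y?", "answer": "Because Z."}]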

    def generateEmbeddings(self, sentences):
        """Embed a batch of sentences with the configured embeddings model."""
        # used by prepareutils.Embeddings
        response = openai.Embedding.create(
            input=sentences,
            model=self.embeddingsModel,
        )
        embeddings = [x['embedding'] for x in response['data']]
        # one embedding is expected per input sentence
        assert len(embeddings) == len(sentences)
        return embeddings
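
    # Note (assumption, not in the original source): the code relies on
    # response['data'] coming back in the same order as the input list, so
    # embeddings[i] is the vector for sentences[i]; each vector is a plain
    # Python list of floats whose length depends on configs.embeddingsModel.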

    def searchBestEmbeddingIndex(self, embeddedQuestion, embeddingsToSearch):
        """Return the index of the embedding most similar to the question."""
        # used by ask.py
        maxSimilarity = 0
        maxSimilarityIndex = 0
        for i, embedding in enumerate(embeddingsToSearch):
            similarity = cosine_similarity(embeddedQuestion, embedding)
            if similarity > maxSimilarity:
                maxSimilarity = similarity
                maxSimilarityIndex = i
        # return the index of the most similar sentence
        return maxSimilarityIndex

openaiClient = OpenAIClient()
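
# ----------------------------------------------
# Example usage: an illustrative sketch, not part of the original source.
# It assumes configs provides a valid OPENAI_KEY, chatCompletionModel and
# embeddingsModel, and that a hypothetical template
# prompts/questions.prompt.txt with a {sentence} placeholder exists.
# ----------------------------------------------
if __name__ == "__main__":
    sentence = "OpenAI embeddings map text to numeric vectors."
    prompt = openaiClient.buildPrompt("questions", {"sentence": sentence})
    qa = openaiClient.generateSyntheticQuestions(prompt, debugSentence=sentence)
    # embed the generated questions and retrieve the one closest to a query
    questionEmbeddings = openaiClient.generateEmbeddings([x["question"] for x in qa])
    queryEmbedding = openaiClient.generateEmbeddings(["What do embeddings do?"])[0]
    bestIndex = openaiClient.searchBestEmbeddingIndex(queryEmbedding, questionEmbeddings)
    print(qa[bestIndex])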