import re

import tiktoken
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)

# Token budget for the system prompt (instructions + article context).
_MAX_PROMPT_TOKENS = 3500
# Marker appended after every article in the prompt.
_ARTICLE_SEPARATOR = "<<<<>>>>"
_NOT_FOUND_MESSAGE = "I could not find the answer."
_ARTICLE_URL_TEMPLATE = "https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"


def num_tokens_from_string(string: str, encoder) -> int:
    """Return the number of tokens ``encoder.encode`` produces for ``string``."""
    return len(encoder.encode(string))


def _build_article_context(articles, encoder, used_tokens):
    """Concatenate numbered articles until the token budget would be exceeded.

    Args:
        articles: Article texts, in the order they should be numbered (1-based).
        encoder: Object with an ``encode`` method, used for token counting.
        used_tokens: Tokens already consumed by the instruction part of the prompt.

    Returns:
        A ``(context, included_count)`` tuple: the concatenated article text and
        the number of articles that fit into the budget.
    """
    parts = []
    for index, article in enumerate(articles):
        # Each article appears exactly once; the original code appended the
        # body twice, which wasted half of the token budget.
        addition = "Article " + str(index + 1) + ": " + article + _ARTICLE_SEPARATOR
        used_tokens += num_tokens_from_string(addition, encoder)
        if used_tokens > _MAX_PROMPT_TOKENS:
            print(index)
            break
        parts.append(addition)
    return "".join(parts), len(parts)


def _extract_cited_article_numbers(response_text):
    """Return the distinct 1-based article numbers cited in ``response_text``.

    A citation is the word "article" (case-insensitive, parentheses ignored)
    followed by a word containing at least one digit; non-digit characters of
    that word are discarded. Order of first appearance is preserved.
    """
    words = re.sub(r"[()]", "", response_text.lower()).split()
    cited = []
    # Pairing each word with its successor avoids indexing past the end when
    # "article" is the final word of the response.
    for word, following in zip(words, words[1:]):
        if word != "article":
            continue
        # isdecimal() (not isdigit()) so that every kept character is
        # something int() can parse.
        digits = "".join(ch for ch in following if ch.isdecimal())
        print("Article number: ", digits)
        if not digits:
            # e.g. "article about ..." -- not a citation.
            continue
        number = int(digits)
        if number not in cited:
            cited.append(number)
    return cited


def feed_articles_to_gpt_with_links(information, question):
    """Answer ``question`` from the given articles and return the cited sources.

    The articles are packed into a system prompt (up to a fixed token budget),
    the chat model is asked the question, and a second model call judges
    whether the response actually contains an answer. Article numbers cited in
    the response are then mapped back to links, titles and domains.

    Args:
        information: Iterable of 5-tuples
            ``(score, contents, uuid, title, domain)``, one per article.
        question: The user's question, sent as the human message.

    Returns:
        A tuple ``(answer, links, titles, domains)``. When the model reports
        that no answer was found, or the response cites no article that was
        part of the prompt, the result is
        ``("I could not find the answer.", [], [], [])``.
    """
    records = list(information)
    articles = [contents for _score, contents, _uuid, _title, _domain in records]
    uuids = [uuid for _score, _contents, uuid, _title, _domain in records]
    titles = [title for _score, _contents, _uuid, title, _domain in records]
    domains = [domain for _score, _contents, _uuid, _title, domain in records]

    prompt = "The following pieces of information includes relevant articles. \nUse the following sentences to answer question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer. "
    prompt += "Please state the number of the article used to answer the question after your response\n"
    prompt += "\n----------------\n"

    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    instruction_tokens = num_tokens_from_string(prompt, encoder)
    context, included_count = _build_article_context(
        articles, encoder, instruction_tokens
    )
    prompt += context

    llm = ChatOpenAI(temperature=0.0)
    response = llm([
        SystemMessage(content=prompt),
        HumanMessage(content=question),
    ])
    print(response.content)
    print("response length: ", len(response.content))

    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
    # Query the judge once and reuse the verdict; calling it separately for
    # printing and for the decision doubles the cost and may disagree.
    verdict = llm([
        SystemMessage(content=answer_found_prompt),
        HumanMessage(content=response.content),
    ]).content
    print(verdict)
    if verdict.strip().startswith("0"):
        return _NOT_FOUND_MESSAGE, [], [], []

    cited_numbers = _extract_cited_article_numbers(response.content)
    # Keep only numbers that refer to an article actually placed in the
    # prompt; 0 or out-of-range values would otherwise index the wrong
    # article or raise IndexError.
    used_indices = [
        number - 1 for number in cited_numbers if 1 <= number <= included_count
    ]
    print("Used article num: ", used_indices)
    if not used_indices:
        print("I could not find the answer. Reached")
        return _NOT_FOUND_MESSAGE, [], [], []

    links = [_ARTICLE_URL_TEMPLATE.format(uuid=uuids[i]) for i in used_indices]
    used_titles = [titles[i] for i in used_indices]
    used_domains = [domains[i] for i in used_indices]

    # Drop each "(Article ...)" citation individually; a greedy ".*" would
    # also delete any answer text between the first and last parenthesis.
    response_without_source = re.sub(r"\(Article[^)]*\)", "", response.content)
    return response_without_source, links, used_titles, used_domains