"""Build a question-answering vector index over a single PDF with llama-index.

Changelog:
1. Updated to a newer llama-index release; several function names and call
   signatures changed (e.g. ``GPTSimpleVectorIndex`` -> ``GPTVectorStoreIndex``).
"""
# import gradio as gr
import csv
import math
import os
import sys

import numpy as np
import openai
import pandas as pd
import PyPDF2
import requests
from rich import print

from langchain import OpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import ChatOpenAI

# NOTE: GPTSimpleVectorIndex was renamed to GPTVectorStoreIndex in recent
# llama-index releases.
from llama_index import (
    GPTListIndex,
    GPTVectorStoreIndex,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    download_loader,
    load_index_from_storage,
    readers,
)

# from fastapi import FastAPI                      #* for streaming responses
# from fastapi.responses import StreamingResponse  #* for streaming responses
## enironment settings. | |
os.environ["OPENAI_API_KEY"] = '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' | |
openai.api_key = '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' | |
# file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# file_path = "/Users/yunshi/Downloads/txt_dir/2023年百人会电动论坛 纪要 20230401.pdf"
## Index construction.
def construct_index(directory_path): | |
# file_path = f"{directory_path}/uploaded_file.pdf" | |
file_path = directory_path | |
# set maximum input si771006 | |
# max_input_size = 4096 #* working | |
max_input_size = 4096 | |
# set number of output tokens | |
# num_outputs = 3000 #* working | |
num_outputs = 1000 | |
# set maximum chunk overlap | |
max_chunk_overlap = -1000 #* working | |
# set chunk size limit | |
# chunk_size_limit = 600 | |
chunk_size_limit = 6000 #* working | |
# ## add chunk_overlap_ratio according to github. | |
# chunk_overlap_ratio= 0.1 | |
# define LLM | |
# llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="gpt-3.5-turbo", max_tokens=2000)) | |
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo-16k", max_tokens=512,streaming=True)) | |
## 好像work了,2023.09.22, 注意这里的写法有调整。 | |
# prompt_helper = PromptHelper(max_input_s≈ize, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit) | |
prompt_helper = PromptHelper(max_input_size, num_outputs, chunk_overlap_ratio= 0.1, chunk_size_limit=chunk_size_limit) | |
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper) | |
## 如果是txt文件,那么需要用如下命令。注意与PDF文件的区别。 | |
# documents = SimpleDirectoryReader(directory_path).load_data() | |
## 如果是PDF文件,那么需要用如下命令。注意与txt文件的区别。切需要from llama_index import download_loader。 | |
#NOTE: 这里可以问:give me an example of GPT-4 solving math problem. 会回答关于这个PDF中的内容,所以可以确认这个程序调用了in-context learning的功能。 | |
# CJKPDFReader = download_loader("CJKPDFReader") ## 最新的版本好像不行了,需要用下面的命令。 | |
# loader = CJKPDFReader() | |
PDFReader = download_loader("PDFReader") # working。 | |
loader = PDFReader() | |
# documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。 | |
print('directory_path now:', directory_path) | |
# print('111') | |
# documents = loader.load_data(file="/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf") #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。 | |
documents = loader.load_data(file=directory_path) #! 注意这里是指向文件本身,而不同于txt文件的指文件夹。 | |
print('222') | |
# index = GPTSimpleVectorIndex( | |
# documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper | |
# ) | |
# index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context) ## oringinal version, working. | |
# print('documents:', documents) | |
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) #* the funciton renamed. | |
print('333') | |
# index.save_to_disk('/Users/yunshi/Downloads/txt_dir/index.json') ## in the latest version, this function is not working. | |
return index, service_context | |
def process_file(file_path,username): | |
print('process_file starts') | |
# file_path = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf" | |
#! 第一次运行是需要开启这个function。如果测试通过index,因此不需要在运行了。记得上传PDF和JSON文件到云服务器上。 | |
index, service_context = construct_index(file_path) | |
# index.storage_context.persist(persist_dir="/Users/yunshi/Downloads/txt_dir/") #* 存储到本地,为以后调用。 | |
index.storage_context.persist(persist_dir=f"./{username}/") #* 存储到本地,为以后调用。 | |
print(index) | |
# process_file(file_path)