Spaces:

ravithejads
/

docsques

Runtime error

App Files Files Community

ravithejads commited on Feb 22, 2023

Commit

6844495

•

1 Parent(s): 4557c88

Create app.py

Browse files

Files changed (1) hide show

app.py +258 -0

app.py ADDED Viewed

	@@ -0,0 +1,258 @@

+from gpt_index import Document, GPTListIndex
+import gradio as gr
+import openai
+import os
+import PyPDF2
+import docx
+import pytesseract
+from PIL import Image
+def pdftotext(file_name):
+  """
+  Function to extract text from .pdf format files
+  """
+  text = []
+  # Open the PDF file in read-binary mode
+  with open(file_name, 'rb') as file:
+    # Create a PDF object
+    pdf = PyPDF2.PdfReader(file)
+    # Get the number of pages in the PDF document
+    num_pages = len(pdf.pages)
+    # Iterate over every page
+    for page in range(num_pages):
+      # Extract the text from the page
+      result = pdf.pages[page].extract_text()
+      text.append(result)
+  text = "\n".join(text)
+  return text
+def docxtotext(file_name):
+  """
+  Function to read .docx format files
+  """
+  # Open the Word document
+  document = docx.Document(file_name)
+  # Extract the text from the document
+  text = '\n'.join([paragraph.text for paragraph in document.paragraphs])
+  return text
+def readtextfile(file_name):
+  """
+  Function to read .txt format files
+  """
+  # Open the Text document
+  with open(file_name, 'r') as file:
+    text = file.read()
+  return text
+def imagetotext(file_name):
+  """
+  Function to extract text from images
+  """
+  # Open the image using PIL
+  image = Image.open(file_name)
+  # Extract the text from the image
+  text = pytesseract.image_to_string(image)
+  return text
+def preprocesstext(text):
+  """
+  Function to preprocess text
+  """
+  # Split the string into lines
+  lines = text.splitlines()
+  # Use a list comprehension to filter out empty lines
+  lines = [line for line in lines if line.strip()]
+  # Join the modified lines back into a single string
+  text = '\n'.join(lines)
+  return text
+def processfiles(files):
+  """
+  Function to extract text from documents
+  """
+  textlist = []
+  # Iterate over provided files
+  for file in files:
+    # Get file name
+    file_name = file.name
+    # Get extention of file name
+    ext = file_name.split(".")[-1].lower()
+    # Process document based on extention
+    if ext == "pdf":
+      text = pdftotext(file_name)
+    elif ext == "docx":
+      text = docxtotext(file_name)
+    elif ext == "txt":
+      text = readtextfile(file_name)
+    elif ext in ["png", "jpg", "jpeg"]:
+      text = imagetotext(file_name)
+    else:
+      text = ""
+    # Preprocess text
+    text = preprocesstext(text)
+    # Append the text to final result
+    textlist.append(text)
+  return textlist
+def createdocuments(textlist):
+  """
+  Function to create documents as needed for indexing.
+  """
+  documents = []
+  # Create Document for indexing
+  for text in textlist:
+    documents.append(Document(text))
+  return documents
+def fileformatvaliditycheck(files):
+  """
+  Function to check validity of file formats
+  """
+  for file1 in files:
+    file_name = file1.name
+    # Get extention of file name
+    ext = file_name.split(".")[-1].lower()
+    if ext not in ["pdf", "txt", "docx", "png", "jpg", "jpeg"]:
+      return False
+  return True
+def openaiapikeyvaliditycheck(openaikey):
+  """
+  Function to check validity of openai key
+  """
+  # Set the API key
+  openai.api_key = openaikey
+  # Test the API key by making a request to the OpenAI API
+  try:
+      response = openai.Model.list()
+      return "Valid OpenAI API key"
+  except openai.OpenAIError:
+    apikeylink = "https://beta.openai.com/account/api-keys"
+    return f"Incorrect OpenAI API key provided: {openaikey}. You can find your OpenAI API key here - {apikeylink}"
+def createindex(files, openaikey):
+  """
+  Function to create index
+  """
+  # Basic Checks
+  if not files:
+    return "Upload file before proceeding further."
+  fileformatvalidity = fileformatvaliditycheck(files)
+  if not fileformatvalidity:
+    return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."
+  if not openaikey:
+    return "Please enter your openai key."
+  openaiapikeyvality = openaiapikeyvaliditycheck(openaikey)
+  if openaiapikeyvality != "Valid OpenAI API key":
+    return openaiapikeyvality
+  # Store openai key in environment
+  os.environ['OPENAI_API_KEY'] = openaikey
+  # Process the Documents
+  doctextlist = processfiles(files)
+  documents = createdocuments(doctextlist)
+  # Create index
+  index = GPTListIndex(documents, chunk_size_limit = 3500)
+  # Save index
+  index.save_to_disk('index.json')
+  return "Uploading documents successfully. OpenAI API Key provided is Valid."
+def docques(query, openaikey):
+  """
+  Function to for quering on the index created
+  """
+  # Store openai key in environment
+  os.environ['OPENAI_API_KEY'] = openaikey
+  # Load index
+  index = GPTListIndex.load_from_disk('index.json')
+  # Query based on index
+  response = index.query(query, response_mode="tree_summarize")
+  return response
+def cleartext(query, output):
+  """
+  Function to clear text
+  """
+  return ["", ""]
+with gr.Blocks() as demo:
+    gr.Markdown(
+    """
+    <h1><center><b>DocQues</center></h1>
+    """)
+    gr.Markdown(
+    """
+    This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your
+    <a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n
+    """)
+    gr.Markdown(
+        """
+        <br>**Use this space effectively by following below 2 step process.**</br>
+        *Step-1*
+        <br>- Upload pdf/docx/txt/png/jpeg/jpg format documents.
+        <br>- Enter your openai key.
+        <br>- Click upload and wait to see if upload is successful or not. </br>
+        *Step-2*
+        <br>- Enter your query.
+        <br>- Click submit.
+        <br>- Check Answer </br>
+        Please refer to the GitHub repo this Space is based on, here - <a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .
+        """
+    )
+    with gr.Row():
+      with gr.Column():
+        files = gr.File(label = "Upload pdf/docx/txt format documents.", file_count="multiple")
+        openaikey = gr.Textbox(lines = 1, label = "Enter your OpenAI Key.")
+        upload_button = gr.Button("Upload")
+        query = gr.Textbox(lines = 2, label = "Enter Your Question.")
+        submit_button = gr.Button("Submit")
+      with gr.Column():
+        upload_output = gr.Textbox(label = "Upload/ Error.")
+        ans_output = gr.Textbox(label = "Answer.")
+        clear_button = gr.Button("Clear")
+    # Upload button for uploading files and openai key.
+    upload_button.click(createindex, inputs=[files, openaikey], outputs= [upload_output] )
+    # Submit button for submitting query.
+    submit_button.click(docques, inputs=[query, openaikey], outputs= [ans_output] )
+    # Clear button for clearing query and answer.
+    clear_button.click(cleartext, inputs=[query, ans_output], outputs= [query, ans_output] )
+demo.launch()