Spaces:

thrag
/

demo-1

Sleeping

File size: 7,442 Bytes

8d8cebe

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading settings from ../../env/ai.json\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# Settings come from this JSON file when it exists; otherwise the\n",
    "# placeholder values in the else-branch are exported instead.\n",
    "filePathToSettingsFile = '../../env/ai.json'\n",
    "\n",
    "if os.path.exists(filePathToSettingsFile):\n",
    "    print(f'Loading settings from {filePathToSettingsFile}')\n",
    "\n",
    "    # Use a context manager so the handle is closed deterministically,\n",
    "    # even when json.load raises on a malformed file.\n",
    "    with open(filePathToSettingsFile) as settingsFile:\n",
    "        settingsJson = json.load(settingsFile)\n",
    "    del settingsFile\n",
    "\n",
    "    # Export every top-level entry of the file as an environment variable.\n",
    "    for key in settingsJson:\n",
    "        os.environ[key] = settingsJson[key]\n",
    "\n",
    "    # Drop the parsed settings (which may include API keys) from the namespace.\n",
    "    del settingsJson\n",
    "else:\n",
    "    print('Setting variables manually as there is not ai.json settings file')\n",
    "\n",
    "    # Update the variables below with your own settings\n",
    "    os.environ['REQUESTS_CA_BUNDLE'] = '../../env/ZCert.pem'\n",
    "    os.environ['HUGGING_FACE_API_KEY'] = 'Get here: https://huggingface.co/settings/tokens'\n",
    "    os.environ['OPENAI_API_KEY'] = 'Get here: https://platform.openai.com/account/api-keys'\n",
    "    os.environ['SERPAPI_API_KEY'] = 'serpapi KEY, Get here: https://serpapi.com/manage-api-key'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Ask%20A%20Book%20Questions.ipynb\n",
    "from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "from pdfminer.high_level import extract_text\n",
    "\n",
    "# Data layout: one sub-folder per corpus under a shared root. Every folder\n",
    "# name keeps its trailing slash because paths are built by concatenation.\n",
    "rootFolder = '../rag-demo-1-data/'\n",
    "literatureFolder = 'literature/'\n",
    "historyOfRomeFolder = 'history-roman/'\n",
    "\n",
    "# Corpus folder used by the cells that follow.\n",
    "currentFolder = rootFolder + literatureFolder\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every PDF in the current folder to a plain-text file next to it.\n",
    "for filename in glob.glob(f\"{currentFolder}*.pdf\"):\n",
    "\n",
    "    print(f'About to extract {filename}')\n",
    "    try:\n",
    "        text = extract_text(filename)\n",
    "        # Drop every non-ASCII character from the extracted text.\n",
    "        text = text.encode('ascii', errors='ignore').decode('ascii')\n",
    "\n",
    "        textFilename = f'{filename}.txt'\n",
    "        print(textFilename)\n",
    "        # The text is pure ASCII at this point, so pin the encoding rather\n",
    "        # than depending on the platform default.\n",
    "        with open(textFilename, 'w', encoding='ascii') as f:\n",
    "            f.write(text)\n",
    "\n",
    "        # Mark the PDF as processed so the *.pdf glob skips it on the next run.\n",
    "        # os.replace overwrites a stale .done file on every platform, whereas\n",
    "        # os.rename raises FileExistsError on Windows when the target exists.\n",
    "        os.replace(filename, f\"{filename}.done\")\n",
    "    except Exception as err:\n",
    "        # Best effort: report the failure and carry on with the next PDF.\n",
    "        print(f\"Error with file {filename} {err}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import DirectoryLoader, TextLoader\n",
    "\n",
    "# Load every .txt file found under the current folder (searched recursively).\n",
    "loader = DirectoryLoader(\n",
    "    currentFolder,\n",
    "    glob=\"**/*.txt\",\n",
    "    loader_cls=TextLoader,\n",
    ")\n",
    "docs = loader.load()\n",
    "\n",
    "# Sanity check: number of documents, then character count of the first one.\n",
    "print(len(docs))\n",
    "print(len(docs[0].page_content))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16596\n"
     ]
    }
   ],
   "source": [
    "# Splitter settings, named so they are easy to find and tune.\n",
    "CHUNK_SIZE = 1000\n",
    "CHUNK_OVERLAP = 200\n",
    "\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=CHUNK_SIZE,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    ")\n",
    "texts = text_splitter.split_documents(docs)\n",
    "\n",
    "# Number of chunks produced from the loaded documents.\n",
    "print(len(texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'source': '..\\\\rag-demo-1-data\\\\literature\\\\moby-dick.pdf.txt'}\n"
     ]
    }
   ],
   "source": [
    "# Spot-check the metadata of one chunk. The index is clamped to the last\n",
    "# chunk so a non-empty corpus with fewer than 8001 chunks does not raise\n",
    "# IndexError.\n",
    "sampleIndex = min(8000, len(texts) - 1)\n",
    "#print(texts[sampleIndex].page_content)\n",
    "print(texts[sampleIndex].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import Pinecone\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "import pinecone\n",
    "\n",
    "# Embedding client, configured with the OpenAI key taken from the environment.\n",
    "embeddings = OpenAIEmbeddings(\n",
    "    openai_api_key=os.environ['OPENAI_API_KEY'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinecone settings are read from the environment, like the other keys used in\n",
    "# this notebook, so no secret has to be pasted into a cell. Unset variables\n",
    "# fall back to the previous hard-coded values.\n",
    "index_name = os.environ.get('PINECONE_INDEX_NAME', \"\") # name of your pinecone index\n",
    "pinecone.init(\n",
    "    api_key=os.environ.get('PINECONE_API_KEY', ''),\n",
    "    environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter'),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the Pinecone-backed vector store from the chunks and the embedding client.\n",
    "docsearch = Pinecone.from_documents(\n",
    "    texts,\n",
    "    embeddings,\n",
    "    index_name=index_name,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question to look up, and the number of matching chunks to request for it.\n",
    "query = \"What is moby dick?\"\n",
    "searchResult = docsearch.similarity_search(\n",
    "    query,\n",
    "    k=5,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "..\\rag-demo-1-data\\literature\\moby-dick.pdf.txt\n",
      "Moby Dick By Herman MelvilleDownload free eBooks of classic literature, books and \n",
      "novels at Planet eBook. Subscribe to our free eBooks blog \n",
      "and email newsletter.\fETYMOLOGY.(Supplied  by  a  Late  Consumptive  Usher  to  a  Grammar \n",
      "School)The  pale  Usherthreadbare  in  coat,  heart,  body,  and \n",
      "brain; I see him now. He was ever dusting his old lexicons \n",
      "and grammars, with a queer handkerchief, mockingly em-\n",
      "bellished with all the gay flags of all the known nations of \n",
      "the world. He loved to dust his old grammars; it somehow \n",
      "mildly reminded him of his mortality.While you take in hand to school others, and to teach them \n",
      "by what name a whale-fish is to be called in our tongue \n",
      "leaving out, through ignorance, the letter H, which almost \n",
      "alone maketh the signification of the word, you deliver that \n",
      "which is not true. HACKLUYTWHALE.  Sw. and Dan. HVAL. This animal is named from \n",
      "roundness or rolling; for in Dan. HVALT is arched or vaulted.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Show the source file of the first search hit, followed by its text.\n",
    "firstHit = searchResult[0]\n",
    "print(firstHit.metadata['source'])\n",
    "print(firstHit.page_content)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}