# !pip install langchain langchain-groq sentence-transformers langchainhub faiss-cpu gradio gradio_client yfinance duckduckgo-search

import pandas as pd
import io
import requests
import os
import json
import ast
import re
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pytz
from bs4 import BeautifulSoup
import yfinance as yf

# import langchain libraries
# !pip install langchain langchain-groq langchainhub duckduckgo-search
from langchain.agents import AgentExecutor, Tool, create_react_agent, create_structured_chat_agent, create_tool_calling_agent
from langchain import hub
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.tools import DuckDuckGoSearchResults, DuckDuckGoSearchRun
from langchain.schema.output_parser import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader

# import gradio libraries
# !pip install gradio gradio_client
import gradio as gr

# import vectorstore libraries
# !pip install faiss-cpu
from langchain_community.vectorstores import FAISS

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

############################################
############################################
#
# Code steps involved:
# 1. Define the LLM
# 2. Extract data from NSE
# 3. Process the dataframe and store it as CSV files
# 4. Use LangChain CSV loaders to load the CSV data
# 5. Create vector stores
# 6. Create company lists
# 7. Create the LLM functions required
# 8. Create the Python functions for stock data and charting
# 9. Create Gradio Blocks
# 10. Find any recent real-time addition to NSE data and add it to the vector stores
# 11. Create retrievers and LangChain QA retrieval chains
# 12. Define default charts
# 13. Create the Gradio app
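
# Optional sanity check (illustrative only, not part of the pipeline):
# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so a quick probe
# confirms the model loaded correctly.
# vec = embedding_function.embed_query("NSE corporate announcement")
# assert len(vec) == 384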
##########################################
##########################################

# Get the data from NSE as pandas dataframes
# Data from two pages - NSE announcements and NSE corporate actions - is fetched, hence two dataframes

groq_api_key = os.getenv('groq')
serp_api_key = os.getenv('serp')

def get_pd(d):
    # Get the current date
    current_date = datetime.now()
    # Get the date d days back
    previous_day = current_date - timedelta(days=d)
    # Format the dates in the required format (dd-mm-yyyy)
    current_date_str = current_date.strftime("%d-%m-%Y")
    previous_day_str = previous_day.strftime("%d-%m-%Y")

    base_url = 'https://www.nseindia.com'
    session = requests.Session()
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'accept-language': 'en,gu;q=0.9,hi;q=0.8',
        'accept-encoding': 'gzip, deflate, br'}
    # NSE expects the cookies set on the landing page before the API endpoints respond
    r = session.get(base_url, headers=headers, timeout=120)
    cookies = dict(r.cookies)

    # Use the dates in the URL
    url1 = f"https://www.nseindia.com/api/corporate-announcements?index=equities&from_date={previous_day_str}&to_date={current_date_str}&csv=true"
    url2 = "https://www.nseindia.com/api/corporates-corporateActions?index=equities&csv=true"
    response1 = session.get(url1, timeout=120, headers=headers, cookies=cookies)
    response2 = session.get(url2, timeout=120, headers=headers, cookies=cookies)
    content1 = response1.content
    content2 = response2.content
    df = pd.read_csv(io.StringIO(content1.decode('utf-8')))
    dfca = pd.read_csv(io.StringIO(content2.decode('utf-8')))
    return df, dfca

# Process the dataframes and store them as CSV files
# To increase the speed of processing in RAG, I decided to use three separate vector stores:
# the first with all data, the second with minimum data and the third with corporate-action data.
# Owing to the context-window limits of RAG, it is always good to ensure that we don't carry irrelevant data.
df_old, dfca = get_pd(1)
df_back = df_old.copy()
df_back.to_csv("df_backup.csv", index=False)
df_old.drop(['RECEIPT', 'DISSEMINATION', 'DIFFERENCE'], axis=1, inplace=True)
df_old2 = df_old.drop(['ATTACHMENT'], axis=1)

# Save them as CSV files
df_old.to_csv("nse_data_old.csv", index=False)
df_old2.to_csv("nse_data_old2.csv", index=False)
dfca.to_csv("nse_ca.csv", index=False)

# Use LangChain CSV loaders to load the CSV data
loader = CSVLoader("nse_data_old.csv")
data_old = loader.load()
loader2 = CSVLoader("nse_data_old2.csv")
data_old_2 = loader2.load()
loader3 = CSVLoader("nse_ca.csv")
data_ca = loader3.load()

global vectorstore, vectorstore2, vectorstore3, colist, colist_tracked

# Create vectorstores - I tried Chroma but FAISS turned out to be successful
vectorstore = FAISS.from_documents(data_old, embedding_function)
vectorstore2 = FAISS.from_documents(data_old_2, embedding_function)
vectorstore3 = FAISS.from_documents(data_ca, embedding_function)
vectorstore.save_local("vectorstore")
vectorstore2.save_local("vectorstore2")
vectorstore3.save_local("vectorstore3")
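
# Quick retrieval check (illustrative only; the query string is made up):
# FAISS stores expose similarity_search, which returns the k most similar
# Document rows loaded from the CSV.
# docs = vectorstore.similarity_search("dividend announcements", k=3)
# print([d.page_content[:80] for d in docs])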
###########################
# Create company list
# Upload the NIFTY company names - this is currently hardcoded as NIFTY does not change as often, but it can be made dynamic
co1 = pd.read_csv('ind_nifty50list.csv')

# Create the company lists required
# Get the column to convert to a list
column_name = "Company Name"
# Convert the column to a list
co_list1 = co1[column_name].tolist()

# These are the companies that are being tracked - this can be uploaded / hardcoded
co_list_tracked = ['Reliance Industries Limited', 'Infosys Limited', 'ICICI Bank Ltd',
                   'Indusind Bank Ltd', 'Ramco Systems',
                   'Zydus Lifesciences Limited', 'Bharti Airtel Limited',
                   'ICICI Bank Limited', 'TechMahindra Limited',
                   'Indiabulls Real Estate Limited', 'Tamilnad Mercantile Bank Limited',
                   'Bajaj Finance Limited', 'Apollo Tyres Limited', 'Zydus Lifesciences Limited',
                   'Indusind Bank Limited', 'Kirloskar Oil Engines Limited']

co_list = co_list1 + co_list_tracked

####################################
##################################
# Let us create the functions required
##################################

# LLM function to get announcement details
def give_announcement(llm, stock):
    if not stock:
        return "This company has not made any announcements today or yesterday"
    else:
        retriever1 = vectorstore.as_retriever()
        qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever1, return_source_documents=False)
        response = qa_chain({"query": f"What are the announcements made by the company {stock}? "
                                      f"If no announcement has been made by that company, "
                                      f"just say that no announcement has been made by that company."})
        return f"Announcements made by {stock}: {response['result']}"

# LLM function to get corporate action details
def get_ca(llm, stock):
    if not stock:
        return "This company has not made any announcements today or yesterday"
    else:
        retriever3 = vectorstore3.as_retriever()
        qa_chain2 = RetrievalQA.from_chain_type(llm, retriever=retriever3, return_source_documents=False)
        response = qa_chain2({"query": f"What are the corporate action announcements made by the company {stock}? "
                                       f"If no announcement has been made by that company, do not print any source documents and "
                                       f"just say that no announcement has been made by that company."})
        return response['result']

# A web search tool - note that the SerpAPI wrapper below overrides the DuckDuckGo tool
search = DuckDuckGoSearchRun()
os.environ['SERPAPI_API_KEY'] = serp_api_key
from langchain_community.utilities import SerpAPIWrapper
search = SerpAPIWrapper()

# Fetch stock data from Yahoo Finance
def get_stock_price(ticker, history=5):
    # time.sleep(4)  # To avoid rate-limit errors
    if "." in ticker:
        ticker = ticker.split(".")[0]
    ticker = ticker + ".NS"
    stock = yf.Ticker(ticker)
    df = stock.history(period="1y")
    df = df[["Close", "Volume"]]
    df.index = [str(x).split()[0] for x in list(df.index)]
    df.index.rename("Date", inplace=True)
    df = df[-history:]
    return df.to_string()
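
# Illustrative usage (the ticker is an example, not taken from the live data):
# get_stock_price("INFY") strips any exchange suffix, appends ".NS" and returns
# the last `history` rows of Close and Volume as a printable string, roughly:
#   Date        Close     Volume
#   2024-06-10  1450.10   5123456
# print(get_stock_price("INFY", history=5))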
# Get stock price movements
def get_movements(llm, stock):
    if not stock:
        return "This company has not made any announcements today or yesterday"
    else:
        stock = stock[0]
        dfc = pd.read_csv('nse_data_old.csv')
        stockdesc = dfc[dfc['COMPANY NAME'] == stock]['COMPANY NAME'].iloc[0]
        stock1 = dfc[dfc['COMPANY NAME'] == stock]['SYMBOL'].iloc[0]
        stock = get_ticker(stock1)
        print("stock is ", stock)
        tools = [
            Tool(
                name="get stock data",
                func=get_stock_price,
                description=f"Use this tool to get stock price data. This tool will return three values: date, volume and closing price of the stock "
                            f"for a period of 5 days. stock = {stock}"
            ),
            Tool(
                name="SerpAPI search",
                func=search.run,
                description=f"Use this tool for web search for details about the stock, like broker sentiment. You can also get recent stock-related news. "
                            f"stock symbol = {stock} and stockname = {stockdesc}"
            ),
        ]
        prompt = hub.pull("hwchase17/structured-chat-agent")
        # Construct the structured-chat agent
        agent = create_structured_chat_agent(llm, tools, prompt)
        try:
            # Create an agent executor by passing in the agent and tools
            agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
            response = agent_executor.invoke({"input": f"How much has the stock price of stock {stock} with name {stockdesc} moved in the last few days? "
                                                       f"Give the prices over the last few days and also the percentage change. For example, if the stock has not moved in a single direction, "
                                                       f"you can say the stock has been volatile. But if it has moved up over five days, you can say so with the percentage movement"})
            return f"Answer for {stock} - {response['output']}"
        except Exception as e:
            return f"An error occurred: {str(e)}"

#####################################
# Get stock sentiments
#####################################
prompt1 = """Hello, I need broker sentiment data for a specific stock. Please search and summarize current market analyses, broker reports, \
and overall sentiment regarding the given stock. Focus on information from credible sources like financial news, broker reports, and investment research firms. \
Provide key insights, including: \
recent broker recommendations (buy, hold, sell), \
notable broker analyses or reports, \
general trends in broker sentiment, \
any major news or events impacting the stock's sentiment. \
Please ensure the data is up-to-date and from reputable sources. Provide a concise summary with relevant details and any supporting context to understand the current sentiment. \
Please note that you are not a chat agent but meant for single usage, so do not conclude with any greetings or offers of further assistance!\
"""

def get_sentiments(llm, stock):
    if not stock:
        return "This company has not made any announcements today or yesterday"
    else:
        print("st1", stock)
        stock = stock[0]
        print("af ", stock)
        dfc = pd.read_csv('nse_data_old.csv')
        stockdesc = dfc[dfc['COMPANY NAME'] == stock]['COMPANY NAME'].iloc[0]
        stock1 = dfc[dfc['COMPANY NAME'] == stock]['SYMBOL'].iloc[0]
        stock = get_ticker(stock1)
        tools = [
            Tool(
                name="get stock data",
                func=get_stock_price,
                description=f"Use this tool to get stock price data. This tool will return three values: date, volume and closing price of the stock "
                            f"for a period of 5 days. stock = {stock}"
            ),
            Tool(
                name="SerpAPI Search",
                func=search.run,
                description=f"Use this tool for web search for details about the stock, like broker sentiment. You can also get recent stock-related news. "
                            f"stock name = {stockdesc}"
            ),
        ]
        prompt = hub.pull("hwchase17/structured-chat-agent")
        # Construct the structured-chat agent
        agent = create_structured_chat_agent(llm, tools, prompt)
        try:
            agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
            response = agent_executor.invoke({"input": f"Get broker sentiment for the stock {stock} and stock name {stockdesc}"})
            return f"Broker sentiment analysis for {stock}. - {response['output']}"
        except Exception as e:
            return f"An error occurred: {str(e)}"
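
# Note (assumption about the hub prompt, not verified here): the
# "hwchase17/structured-chat-agent" prompt expects {tools}, {tool_names},
# {input} and {agent_scratchpad}; create_structured_chat_agent and
# AgentExecutor fill those in automatically, which is why only
# {"input": ...} is passed to agent_executor.invoke above.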
#################
# Fetch financial statements from Yahoo Finance
def get_balancesheet(ticker):
    # time.sleep(4)  # To avoid rate-limit errors
    if "." in ticker:
        ticker = ticker.split(".")[0]
    ticker = ticker + ".NS"
    company = yf.Ticker(ticker)
    df = company.balance_sheet
    df.fillna(method='ffill', inplace=True)
    df.dropna(inplace=True)
    return df

def get_incomestatement(ticker):
    # time.sleep(4)  # To avoid rate-limit errors
    if "." in ticker:
        ticker = ticker.split(".")[0]
    ticker = ticker + ".NS"
    company = yf.Ticker(ticker)
    df = company.financials
    df.fillna(method='ffill', inplace=True)
    df.dropna(inplace=True)
    return df

def get_ticker(company_name):
    com = company_name + ".NS"
    ticker = yf.Ticker(com)
    return ticker.info['symbol']
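
# Shape note (based on yfinance behaviour; the numbers are made up):
# company.balance_sheet arrives with line items as rows and period-end dates
# as columns, e.g.
#                    2024-03-31    2023-03-31
#   Total Assets     1.2e12        1.1e12
# get_financialratio below transposes it (df1.T) so each row is a period,
# which makes it easier for the LLM to pick the columns a ratio needs.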
def get_financialratio(model, input, stock):
    stock_name = get_companynames(stock)
    llm = get_model(model)
    if not stock_name:
        return "This company has not made any announcements"
    else:
        stockname = stock_name[0]
        print("stock1 ", stockname)
        dfc = pd.read_csv('nse_data_old.csv')
        stock1 = dfc[dfc['COMPANY NAME'] == stockname]['SYMBOL'].iloc[0]
        print("stock1 symbol ", stock1)
        stock = get_ticker(stock1)
        print("stock is ", stock)
        if input == '':
            return "No query has been entered!"
        else:
            resp = llm.invoke(f"You have to answer either 'A' or 'B' without any leading sentences - check whether the input {input} pertains "
                              f"to a financial ratio query. If it pertains to a financial ratio query, "
                              f"respond with the letter 'A', else with the letter 'B' if it contains only something like a company name")
            print("nature of query ", resp)
            if resp.content == 'B':
                return "Enter a query pertaining to financial ratios!"
            else:
                resp2 = llm.invoke(f"Answer A, if balance sheet, or B, if income statement. To answer the query {input}, "
                                   f"is the balance sheet or the income statement required? If balance sheet, answer A, else B")
                if resp2.content == 'A':
                    df1 = get_balancesheet(f'{stock}')
                    print("balance sheet")
                else:
                    df1 = get_incomestatement(f'{stock}')
                    print("income statement")
                df = df1.T
                print("the df is ", df)
                cols = df.columns.tolist()
                resp3 = llm.invoke(f"List the column names, as a Python list, in {cols} needed for {input} calculation. Do not output any sentence other than the column names. "
                                   f"For example, do not output leading answer statements like: Here are the column names needed for ..")
                message = resp3.content

                def extract_df(df, message):
                    c = ast.literal_eval(message)
                    return df[c]

                df_new = extract_df(df, message)

                parser = JsonOutputParser()
                prompt = PromptTemplate(
                    template="Answer the user query.\n{format_instructions}\n{query}\n",
                    input_variables=["query"],
                    partial_variables={"format_instructions": parser.get_format_instructions()},
                )
                chain = prompt | llm | parser
                try:
                    response = chain.invoke(f"Using {df_new}, {input}?")
                    return f"For the company: {stockname}, here are the details: {response}"
                except Exception as e:
                    return f"An error occurred: {str(e)}"

##########################
# Functions to plot a chart over ratios - this has scope for major enhancements!
def plot_chart(data):
    try:
        # Get the first key in the dictionary
        key = list(data.keys())[0]
        # Create a plot
        plt.figure(figsize=(8, 6))
        plt.bar(data[key].keys(), data[key].values())
        plt.title(f"{key} Over Years")
        plt.xlabel("Year")
        plt.ylabel(key)
        plt.tight_layout()
        # Return the plot
        return plt
    except Exception as e:
        return None
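
# plot_chart expects the parsed JSON to look like (values are made up):
#   {"Current Ratio": {"2022": 1.1, "2023": 1.3, "2024": 1.4}}
# i.e. a single top-level key (the ratio name) mapping years to values;
# anything else falls into the except branch and returns None.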
def get_chart(model, input, stock):
    stock_name = get_companynames(stock)
    if stock_name:
        response = get_financialratio(model, input, stock)
        # Extract the dictionary part using a regex - search for content within curly braces
        dict_match = re.search(r"\{.*\}", response)
        # Convert the extracted string to a dictionary
        if dict_match:
            extracted_dict_str = dict_match.group(0)
            extracted_dict = ast.literal_eval(extracted_dict_str)
        else:
            extracted_dict = None  # No dictionary found
        print("extracted dict ", extracted_dict)
        plt = plot_chart(extracted_dict)
        return plt
    else:
        return None

def combined_ratio(model, input, stock):
    return get_financialratio(model, input, stock), get_chart(model, input, stock)

###############################
###############################
# Create the Gradio Blocks interface with a title and description
###############

def get_colist2():
    dfco = pd.read_csv('dfco.csv')
    dfco1 = dfco[['COMPANY NAME']]
    dfco2 = dfco1.drop_duplicates()
    # Save the result to a new CSV file
    dfco2.to_csv('companies.csv', index=False)
    dfco3 = dfco2.head(10)
    co_list3 = dfco3['COMPANY NAME'].unique().tolist()
    filtered_df = dfco2[dfco2['COMPANY NAME'].isin(co_list)]
    co_list2 = filtered_df['COMPANY NAME'].tolist()
    return co_list2, co_list3

######################################

def get_timestampmessage(flag):
    dfco = pd.read_csv('dfco.csv')
    timestamp = dfco[['BROADCAST DATE/TIME']].max().values.tolist()[0]
    if flag == 1:
        message = f"There is an NSE timeout error. The latest filing information is available upto {timestamp}"
    else:
        message = f"Latest filing information is available upto {timestamp}"
    return message

def update():
    global message_init
    flag, message_update = incremental_process()
    message_init = message_update
    return message_update

def give_time():
    return message_init

def give_timestamp():
    dfco = pd.read_csv("dfco.csv")
    timestamp = dfco[['BROADCAST DATE/TIME']].max().values.tolist()[0]
    return timestamp

##################################

# Define the IST timezone
ist_timezone = pytz.timezone("Asia/Kolkata")
# Define UTC for server-side time
utc_timezone = pytz.utc

def refresh():
    # Get the latest filing timestamp (assumed to be in IST)
    timestamp_str = give_timestamp()
    timest = give_time()
    given_time = datetime.strptime(timestamp_str, "%d-%b-%Y %H:%M:%S")
    given_time_ist = ist_timezone.localize(given_time)  # Localize to IST
    # Get the current server time in UTC
    current_time_utc = datetime.now(tz=utc_timezone)
    # Convert the filing time to UTC for consistent comparison
    given_time_utc = given_time_ist.astimezone(utc_timezone)
    # Calculate the time difference
    time_difference = current_time_utc - given_time_utc
    print("the time diff is ", time_difference)
    # Allow a refresh only if the data is stale by more than five minutes
    if time_difference > timedelta(minutes=5):
        message1 = update()
        print("Incremental update run")
    else:
        message1 = f"Refresh allowed only if data is stale for more than 5 minutes. Current timestamp: {timest}"
    return message1
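
# Format note (the example value is made up): give_timestamp() returns the
# 'BROADCAST DATE/TIME' string as published by NSE, e.g. "13-Jun-2024 18:05:00",
# which is what the "%d-%b-%Y %H:%M:%S" pattern in refresh() parses.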
##########################################################################

def plot1_top_20():
    df = pd.read_csv('nse_data_old.csv')
    subjects = ['Acquisition',
                'Alteration Of Capital and Fund Raising-XBRL',
                'Analysts/Institutional Investor Meet/Con. Call Updates',
                'Board Meeting Intimation',
                'Book Closure',
                'Change in Directors/ Key Managerial Personnel/ Auditor/ Compliance Officer/ Share Transfer Agent',
                'Change in Management',
                'Credit Rating',
                'Disclosure of material issue',
                'Dividend',
                'Financial Result Updates',
                'Investor Presentation',
                'Notice Of Shareholders Meetings-XBRL',
                'Related Party Transactions',
                'Resignation',
                'Rights Issue',
                'Shareholders meeting',
                'Spurt in Volume',
                'Update-Acquisition/Scheme/Sale/Disposal-XBRL',
                ]
    df = df[df['SUBJECT'].isin(subjects)]
    df['SUBJECT'] = df['SUBJECT'].replace('Change in Directors/ Key Managerial Personnel/ Auditor/ Compliance Officer/ Share Transfer Agent',
                                          'Change in Key Managerial Personnel')
    value_counts = df['SUBJECT'].value_counts()
    plt.figure(figsize=(10, 6))
    plt.barh(value_counts.index, value_counts.values)
    plt.xlabel('Count')
    plt.ylabel('Announcements')
    plt.title('NSE Corporate Announcements - A Glance')
    plt.tight_layout()
    return plt

# Function to create a tracked-company-specific chart
def plot2_top_20():
    co_list2, _ = get_colist2()
    df = pd.read_csv('nse_data_old.csv')
    df = df[df['COMPANY NAME'].isin(co_list2)]
    df['SUBJECT'] = df['SUBJECT'].replace('Change in Directors/ Key Managerial Personnel/ Auditor/ Compliance Officer/ Share Transfer Agent',
                                          'Change in Key Managerial Personnel')
    value_counts = df['SUBJECT'].value_counts()
    plt.figure(figsize=(10, 6))
    plt.barh(value_counts.index, value_counts.values)
    plt.xlabel('Count')
    plt.ylabel('Announcements')
    plt.title('NSE Corporate Announcements - Tracked Companies')
    plt.tight_layout()
    return plt

def get_companynames(stock):
    print("get companynames", stock)
    df = pd.read_csv('nse_data_old.csv')
    if stock:
        print('fi stock', stock)
        # Create a regular expression pattern
        pattern = f'.*{stock}.*'
        # Get rows where 'COMPANY NAME' contains the keyword (case-insensitive)
        matched_rows = df[df['COMPANY NAME'].str.contains(pattern, case=False, na=False)]
        # Get unique company names
        unique_companies = matched_rows['COMPANY NAME'].unique()
        return list(unique_companies)
    else:
        return "not found"

# A combined function to be used in the Gradio output box
def print_model(llm):
    co_list2, _ = get_colist2()
    if co_list2:
        return f"You are using {llm.model_name} model for this session. \n \n" \
               f"These are the companies you track: {co_list_tracked}. \n \n" \
               f"These are the companies, including those in NIFTY, that have filed any information with NSE either today or yesterday - {co_list2}"
    else:
        return f"You are using {llm.model_name} model for this session. \n \n" \
               f"You are tracking these companies: {co_list_tracked}. \n \n" \
               f"None of the tracked companies or NIFTY 50 have filed any information with NSE today or yesterday"

def print_model1(llm):
    return f"You are using {llm.model_name} model for this session. \n \n" \
           f"[Note: There is an NSE timeout error preventing fetching of the latest data. So, results may not be real-time / up-to-date]"

def combined_function1(model, stock):
    global flag
    llm = get_model(model)
    stock = get_companynames(stock)
    if flag == 0:
        return print_model(llm), give_announcement(llm, stock), get_ca(llm, stock), get_movements(llm, stock), get_sentiments(llm, stock)
    else:
        return print_model1(llm), give_announcement(llm, stock), get_ca(llm, stock), get_movements(llm, stock), get_sentiments(llm, stock)

def get_model(model_name):
    # e.g. 'gemma-7b-it' or 'mixtral-8x7B-32768'
    llm = ChatGroq(
        api_key=groq_api_key,
        model=model_name,
        max_tokens=8192,
        temperature=0
    )
    return llm
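
# Illustrative usage (the model name is only an example; pass whichever Groq
# model the UI selects):
# llm = get_model("llama3-70b-8192")
# print(llm.model_name)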
\n \n" \ f"These are the tracked companies that have made announcements: \n \n" \ f"{co_list2}. \n \n" \ f"These are latest 10 companies that have made announcements: \n \n " \ f"{co_list3}" ############################## retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat") ############################### # This function is for chat queries. Given here due to retriever defined here def chat_chain(model,query): llm = get_model(model) if query=='': return "Please enter a query!" else: combine_docs_chain = create_stuff_documents_chain( llm, retrieval_qa_chat_prompt) retriever2 = vectorstore2.as_retriever() retrieval_chain = create_retrieval_chain(retriever2, combine_docs_chain) response = retrieval_chain.invoke({"input": query}) return response['answer'] ################################# ## NEW ## Update the vectorstate with latest data global flag def init_process(): global vectorstore,vectorstore2,vectorstore3, flag try: df_new, _ = get_pd(1) flag = 0 except: df_new = pd.read_csv("df_backup.csv") flag = 1 df_new.to_csv("df_new.csv",index=False) print("length of df_new ",len(df_new)) print("length of df_old ", len(df_old)) #drop unnecessary common columns df_new.drop(['RECEIPT','DISSEMINATION','DIFFERENCE'],axis=1,inplace=True) # #find the difference and add incrementally for first store df_merged = df_new.merge(df_old, how='left', indicator=True) # Filter rows that are unique to 'n' (i.e., where '_merge' is 'left_only') df_add1= df_merged[df_merged['_merge'] == 'left_only'].drop(columns=['_merge']) # Save it as a CSV file df_add1.to_csv("nse_data_add1.csv", index=False) #drop unnecessary columns for second vector store df_new2 = df_new.drop(['ATTACHMENT'],axis=1) # add increment for second store df_merged = df_new2.merge(df_old2, how='left', indicator=True) df_add2 = df_merged[df_merged['_merge'] == 'left_only'].drop(columns=['_merge']) # Save it as a CSV file df_add2.to_csv("nse_data_add2.csv", index=False) ##################### # Load the first CSV file dfold = pd.read_csv('nse_data_old.csv') # Load the second CSV file dfadd = pd.read_csv('nse_data_add1.csv') # print("df old",dfold) # print("######") # print("df add ",dfadd) if dfadd.empty: dfco = dfold.copy() else: # Append df2 at the end of df1 dfco = pd.concat([dfold, dfadd], ignore_index=True) dfco.to_csv("dfco.csv",index=False) # Here incremental RAG is achieved by adding additional data dynamically to vectorstore loader = CSVLoader("nse_data_add1.csv") data_new1 = loader.load() loader = CSVLoader("nse_data_add2.csv") data_new2 = loader.load() print("original size ",vectorstore.index.ntotal) len1 = len(pd.read_csv('nse_data_old.csv')) + len(pd.read_csv('nse_data_add1.csv')) print("len1 old + new csv ",len1) len2 = vectorstore.index.ntotal if len1!=len2: print("old size ",vectorstore.index.ntotal) # for first store vectorstore_add1 = FAISS.from_documents(data_new1, embedding_function) print("incremental size ",vectorstore_add1.index.ntotal) vectorstore_new1 = FAISS.load_local("vectorstore",embedding_function,allow_dangerous_deserialization=True) vectorstore_new1.merge_from(vectorstore_add1) vectorstore_new1.save_local("vectorstore") print("new size ",vectorstore_new1.index.ntotal) print("new old size ",vectorstore.index.ntotal) # retrieverx = vectorstore_new.as_retriever() # for second store vectorstore_add2 = FAISS.from_documents(data_new2, embedding_function) print("incremental size ",vectorstore_add2.index.ntotal) vectorstore_new2 = 
FAISS.load_local("vectorstore2",embedding_function,allow_dangerous_deserialization=True) vectorstore_new2.merge_from(vectorstore_add2) vectorstore_new2.save_local("vectorstore2") print("new size ",vectorstore_new2.index.ntotal) print("new old size ",vectorstore2.index.ntotal) # retrieverx = vectorstore_new2.as_retriever() ########################## # Define updated vector stores, retrievers and QA chains ########################## vectorstore = FAISS.load_local("vectorstore",embedding_function,allow_dangerous_deserialization=True) print("final size store 1",vectorstore.index.ntotal) vectorstore2 = FAISS.load_local("vectorstore2",embedding_function,allow_dangerous_deserialization=True) print("final size store 2",vectorstore2.index.ntotal) vectorstore3 = FAISS.load_local("vectorstore3",embedding_function,allow_dangerous_deserialization=True) print("final size store 3",vectorstore3.index.ntotal) message_init = get_timestampmessage(flag) print("timeout flag",flag) print("message at init ", message_init) return flag, message_init def incremental_process(): global vectorstore,vectorstore2,vectorstore3, flag try: df_new, _ = get_pd(1) flag = 0 except: df_new = pd.read_csv("df_backup.csv") flag = 1 df_new.to_csv("df_new.csv",index=False) print("length of df_new ",len(df_new)) print("length of df_old ", len(df_old)) #drop unnecessary common columns df_new.drop(['RECEIPT','DISSEMINATION','DIFFERENCE'],axis=1,inplace=True) # #find the difference and add incrementally for first store df_merged = df_new.merge(df_old, how='left', indicator=True) # Filter rows that are unique to 'n' (i.e., where '_merge' is 'left_only') df_add1= df_merged[df_merged['_merge'] == 'left_only'].drop(columns=['_merge']) # Save it as a CSV file df_add1.to_csv("nse_data_add1.csv", index=False) #drop unnecessary columns for second vector store df_new2 = df_new.drop(['ATTACHMENT'],axis=1) # add increment for second store df_merged = df_new2.merge(df_old2, how='left', indicator=True) df_add2 = df_merged[df_merged['_merge'] == 'left_only'].drop(columns=['_merge']) # Save it as a CSV file df_add2.to_csv("nse_data_add2.csv", index=False) ##################### # Load the first CSV file dfold = pd.read_csv('nse_data_old.csv') # Load the second CSV file dfadd = pd.read_csv('nse_data_add1.csv') # print("df old",dfold) # print("######") # print("df add ",dfadd) if dfadd.empty: dfco = dfold.copy() else: # Append df2 at the end of df1 dfco = pd.concat([dfold, dfadd], ignore_index=True) dfco.to_csv("dfco.csv",index=False) # Here incremental RAG is achieved by adding additional data dynamically to vectorstore loader = CSVLoader("nse_data_add1.csv") data_new1 = loader.load() loader = CSVLoader("nse_data_add2.csv") data_new2 = loader.load() print("original size ",vectorstore.index.ntotal) len1 = len(pd.read_csv('nse_data_old.csv')) + len(pd.read_csv('nse_data_add1.csv')) print("len1 old + new csv ",len1) len2 = vectorstore.index.ntotal if len1!=len2: print("old size ",vectorstore.index.ntotal) # for first store vectorstore_add1 = FAISS.from_documents(data_new1, embedding_function) print("incremental size ",vectorstore_add1.index.ntotal) vectorstore_new1 = FAISS.load_local("vectorstore",embedding_function,allow_dangerous_deserialization=True) vectorstore_new1.merge_from(vectorstore_add1) vectorstore_new1.save_local("vectorstore") print("new size ",vectorstore_new1.index.ntotal) print("new old size ",vectorstore.index.ntotal) # retrieverx = vectorstore_new.as_retriever() # for second store vectorstore_add2 = FAISS.from_documents(data_new2, 
embedding_function) print("incremental size ",vectorstore_add2.index.ntotal) vectorstore_new2 = FAISS.load_local("vectorstore2",embedding_function,allow_dangerous_deserialization=True) vectorstore_new2.merge_from(vectorstore_add2) vectorstore_new2.save_local("vectorstore2") print("new size ",vectorstore_new2.index.ntotal) print("new old size ",vectorstore2.index.ntotal) # retrieverx = vectorstore_new2.as_retriever() ########################## # Define updated vector stores, retrievers and QA chains ########################## vectorstore = FAISS.load_local("vectorstore",embedding_function,allow_dangerous_deserialization=True) print("final size store 1",vectorstore.index.ntotal) vectorstore2 = FAISS.load_local("vectorstore2",embedding_function,allow_dangerous_deserialization=True) print("final size store 2",vectorstore2.index.ntotal) vectorstore3 = FAISS.load_local("vectorstore3",embedding_function,allow_dangerous_deserialization=True) print("final size store 3",vectorstore3.index.ntotal) message_update = get_timestampmessage(flag) return flag, message_update ################################# ## Update the vectorstate with latest data flag,message_init = init_process() ########################################################################### ########################################################################### with gr.Blocks() as demo: # Add a Markdown block for the description gr.Markdown("""