ai_seeker / youtube.py
srikanth-nm's picture
Upload 19 files
b30ed6a
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json
import ingest
import run_localGPT
import utils
def audio_to_transcript(video_id):
    """Fetch a YouTube transcript and write the files later steps read.

    Files written (relative to the current working directory):
      * ``transcript.json`` - the transcript entries exactly as returned by
        ``YouTubeTranscriptApi.get_transcript``.
      * ``SOURCE_DOCUMENTS/transcript.txt`` - the ``'text'`` field of every
        entry, joined with single spaces.
      * ``transcript_end.json`` and ``chunks.json`` - produced by
        ``utils.calculate_ends`` and ``utils.create_chunks`` respectively
        (their exact contents are defined in ``utils``).

    Args:
        video_id: Video identifier passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The full transcript as one space-joined string.
    """
    import os

    sub = YouTubeTranscriptApi.get_transcript(video_id)

    with open("transcript.json", "w") as outfile:
        json.dump(sub, outfile)

    # Build the plain-text transcript once and reuse it for both the file
    # written below and the return value.
    transcript = ' '.join(dct['text'] for dct in sub)

    # The target directory may not exist on a fresh checkout; create it so
    # the write below does not fail with FileNotFoundError.
    os.makedirs('SOURCE_DOCUMENTS', exist_ok=True)
    with open('SOURCE_DOCUMENTS/transcript.txt', 'w') as outfile:
        outfile.write(transcript)

    # These helpers read the JSON written above, so they must run after it.
    utils.calculate_ends('transcript.json', 'transcript_end.json')
    utils.create_chunks('transcript_end.json', 'chunks.json')
    return transcript
def start_training():
    """Run the ingestion entry point and return whatever it reports.

    Returns:
        The value returned by ``ingest.main()``.
    """
    return ingest.main()
def replace_substring_and_following(input_string, substring):
    """Cut ``input_string`` at the first occurrence of ``substring``.

    Args:
        input_string: Text to truncate.
        substring: Marker to search for.

    Returns:
        Everything before the first occurrence of ``substring`` (the marker
        and all text after it are dropped), or ``input_string`` unchanged
        when the marker does not occur.
    """
    cut_at = input_string.find(substring)
    # str.find signals "not found" with -1.
    if cut_at == -1:
        return input_string
    return input_string[:cut_at]
def ask_question(strQuestion):
    """Answer a question through ``run_localGPT`` and trim the reply.

    Args:
        strQuestion: The query, passed as ``strQuery`` to
            ``run_localGPT.main``.

    Returns:
        The reply with the first "Unhelpful Answer" marker and everything
        after it removed; the reply unchanged if the marker is absent.
    """
    raw_reply = run_localGPT.main(device_type='cpu', strQuery=strQuestion)
    return replace_substring_and_following(raw_reply, "Unhelpful Answer")
def summarize():
    """Summarize the saved transcript with a map-reduce summarization chain.

    Reads ``SOURCE_DOCUMENTS/transcript.txt``, splits it with
    ``CharacterTextSplitter`` (default settings), wraps each piece in a
    ``Document`` and runs a ``map_reduce`` summarize chain over them using
    the model loaded through ``run_localGPT.load_model`` on CPU.

    Returns:
        The value returned by ``chain.run`` for the transcript documents.

    Raises:
        FileNotFoundError: If the transcript file has not been written yet.
    """
    # langchain is imported lazily so that importing this module does not
    # require it; only the names actually used are imported.
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from langchain.text_splitter import CharacterTextSplitter

    model_id = "TheBloke/Llama-2-7B-Chat-GGML"
    model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin"

    # Read the transcript before loading the model so that a missing file
    # is reported without first doing the model load.
    with open("SOURCE_DOCUMENTS/transcript.txt") as f:
        file_content = f.read()

    texts = CharacterTextSplitter().split_text(file_content)
    docs = [Document(page_content=t) for t in texts]

    llm = run_localGPT.load_model(
        device_type='cpu',
        model_id=model_id,
        model_basename=model_basename,
    )
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    return chain.run(docs)