# -*- coding: utf-8 -*-
__company__ = ''
__project__ = 'Observatory News - Final Project'
__author__ = 'Strauss'
__initial__ = '2023-08-07'

import glob
import os
import sys

import numpy as np
import pandas as pd
import streamlit as st
from bertopic import BERTopic
from datasets import load_dataset
from huggingface_hub import login, interpreter_login
from sentence_transformers import SentenceTransformer

from database.data import Data
from utils.graphs import Altair

sys.path.append(os.path.dirname(os.getcwd()))

# Label Metrics Size: placeholder for custom CSS injection (currently empty).
st.markdown("""""", unsafe_allow_html=True)

data = Data()


def load_ds(media):
    """Load and concatenate every local CSV for the given media outlet."""
    data_dir = '{0}/{1}'.format(data.pth_data, media)
    st.info(data_dir)
    files = glob.glob(data_dir + '/*.csv')
    st.info(files)
    df_list = (pd.read_csv(file) for file in files)
    df = pd.concat(df_list, ignore_index=True)
    st.info('Loaded {0} rows and {1} columns'.format(df.shape[0], df.shape[1]))
    return df


def load_hugging_face_ds(media):
    """Load one monthly CSV for the given media outlet as a Hugging Face dataset."""
    # If the dataset is gated/private, make sure you have run huggingface-cli login.
    # Remote variant kept for reference; the local CSV below is what is returned:
    # dataset = load_dataset("strauss-oak/observatory-brazilian-news")
    data_file = '{0}{1}/202101.csv'.format(data.pth_data, media)
    st.info(data_file)
    dataset = load_dataset('csv', data_files=[data_file], delimiter=',')
    return dataset


@st.cache_resource
def load_embeddings():
    """Load pre-computed sentence embeddings for the Globo corpus from disk."""
    media = 'globo'
    mdl_st_nm = 'paraphrase-multilingual-mpnet-base-v2'
    pth_gd_embedding = '/home/user/app/embeddings'
    st.info(os.getcwd())
    embedding_nm = 'embedding_{0}_{1}.npy'.format(media, mdl_st_nm)
    embeddings = np.load('{0}/{1}'.format(pth_gd_embedding, embedding_nm), allow_pickle=True)
    st.info('Embedding da mídia {0} carregada com sucesso com tamanho {1}'.format(media, embeddings.shape))
    return embeddings


@st.cache_resource
def load_topic_model():
    """Download the fitted BERTopic model from the Hugging Face Hub."""
    access_token = os.getenv('EMB_TKN')
    login(token=access_token)
    mdl_bertopic_globo = BERTopic.load("strauss-oak/observatory-news-topic-model")
    st.info(mdl_bertopic_globo)
    return mdl_bertopic_globo


@st.cache_resource
def load_sbert():
    """Download the SentenceTransformer model used to embed new documents."""
    access_token = os.getenv('EMB_TKN')
    login(token=access_token)
    mdl_sbert = SentenceTransformer('strauss-oak/mdl_bertopic_globo')
    st.info(mdl_sbert)
    return mdl_sbert


# embeddings = load_embeddings()
mdl_bertopic_globo = load_topic_model()
mdl_sbert = load_sbert()

c1 = st.container()
with c1:
    # Human-readable names for the topics found by the BERTopic model (-1 = outliers).
    topics_map = {-1: 'não agrupado', 0: 'Pandemia Covid-19', 1: 'Política',
                  2: 'Segurança Pública', 3: 'Música', 4: 'Guerra - Rússia',
                  5: 'Clima Tempo', 6: 'Combustíveis', 7: 'Medicina',
                  8: 'Economia', 9: 'Loterias', 10: 'Aeronáutica',
                  11: 'Tecnologia', 12: 'Tributações', 13: 'Cotidiano',
                  14: 'Ecologia', 15: 'Guerra Palestina', 16: 'Mercado de trabalho',
                  17: 'Diversidade de gênero', 18: 'Nascimento'}
    st.info(topics_map)

    # Free-text input for a g1 news article (UI labels are in Portuguese, matching the audience).
    news_text = st.text_input('Texto da notícia', 'Digite o texto da notícia do g1')
    news = [news_text]

    if st.button('Identificar Tópico da notícia:'):
        # Embed the article with the same sentence model used for the topic model,
        # then assign it to the closest topic.
        embeddings_ = mdl_sbert.encode(news)
        topics, probs = mdl_bertopic_globo.transform(news, embeddings_)
        st.info(topics)
        st.info(topics_map.get(int(topics[0])))
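# ---------------------------------------------------------------------------
# Usage sketch (assumption: this script is the Streamlit entry point and is
# named app.py; adjust the path to match the actual repo layout):
#
#   EMB_TKN=<hugging-face-token> streamlit run app.py
#
# EMB_TKN must hold a Hugging Face access token with read permission on the
# strauss-oak repositories, since load_topic_model() and load_sbert() pass it
# to huggingface_hub.login() before downloading the models.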