"""Streamlit app: flag anomalies in an uploaded table with Prophet and chat about them.

Flow, top to bottom:

1. The user uploads a CSV/XLSX file.
2. Each row of the table is treated as one monthly series; Prophet is fitted
   per row and observations outside the 95% interval are kept as anomalies.
3. Questions typed in Portuguese are translated to English, answered by TAPEX
   over the anomaly table, and the answer is translated back to Portuguese.
"""
import datetime

import streamlit as st
import pandas as pd
from transformers import (
    BartForConditionalGeneration,
    TapexTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)
from prophet import Prophet
import sentencepiece as spm

# Portuguese month abbreviations used in the spreadsheet headers -> month number.
month_dict = {
    'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04', 'Mai': '05', 'Jun': '06',
    'Jul': '07', 'Ago': '08', 'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12',
}


def convert_column_name(column_name):
    """Turn a ``'Mon/YYYY'`` header into ``'MM/YYYY'``.

    The label header ``'Rótulos de Linha'`` is returned unchanged. Headers that
    are not strings or contain no ``'/'`` are also returned unchanged (instead
    of raising), so the later ``pd.to_datetime(..., errors='coerce')`` decides
    what to do with them. Unknown month abbreviations map to ``'00'``.
    """
    if column_name == 'Rótulos de Linha':
        return column_name
    if not isinstance(column_name, str) or '/' not in column_name:
        return column_name
    parts = column_name.split('/')
    month = parts[0].strip()
    # Keep only the digits of the year part (drops stray suffixes/spaces).
    year = ''.join(filter(str.isdigit, parts[1].strip()))
    return f"{month_dict.get(month, '00')}/{year}"


def _read_upload(uploaded_file):
    """Load the uploaded file into a DataFrame (CSV by extension, otherwise Excel)."""
    # Case-insensitive check: a name such as 'DATA.CSV' previously matched
    # neither branch and left ``df`` unbound.
    if uploaded_file.name.lower().endswith('.csv'):
        return pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    return pd.read_excel(uploaded_file)


def _prepare_table(df):
    """Slice the raw sheet into one label column plus date-named value columns."""
    # Data starts at the third row; the second row holds the column headers.
    new_df = df.iloc[2:, 9:-1].fillna(0)
    new_df.columns = df.iloc[1, 9:-1]
    # Strip suffixes of the form " (123)" from the headers.
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)

    converted = [convert_column_name(col) for col in new_df.columns]
    columns = list(pd.to_datetime(converted, errors='coerce'))
    # Rename by position: renaming by value would also rename every other
    # header that failed to parse (they are all NaT).
    columns[0] = 'Rotulo'
    new_df.columns = columns
    return new_df


def _find_anomalies(df_clean):
    """Fit Prophet on every row and return the out-of-interval observations.

    Returns a DataFrame of strings with the columns ``datetime``,
    ``monetary value`` and ``group``.
    """
    date_mask = [isinstance(col, pd.Timestamp) for col in df_clean.columns]
    dates = [col for col, is_date in zip(df_clean.columns, date_mask) if is_date]

    frames = []
    for _, row in df_clean.iterrows():
        group = row.iloc[0]  # the 'Rotulo' column
        data = pd.DataFrame({
            'ds': dates,
            # Cells read from CSV are strings; coerce so the ``> 0`` filter
            # works. Unparseable cells become NaN and are dropped by it.
            'y': pd.to_numeric(row[date_mask], errors='coerce').to_numpy(),
        })
        data = data[data['y'] > 0].reset_index(drop=True)

        if len(data) < 2:
            print(f"Skipping group {group} because there are less than 2 non-zero observations.")
            continue

        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Skipping group {group} due to error: \n{e}")
            continue

        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Attach observed values by date rather than by position, so the
        # pairing stays correct when columns are unsorted or repeated.
        forecast = forecast.merge(
            data.rename(columns={'y': 'real'}), on='ds', how='left'
        )
        is_anomaly = (
            (forecast['real'] < forecast['yhat_lower'])
            | (forecast['real'] > forecast['yhat_upper'])
        )
        # .copy() so adding a column does not write into a slice of forecast.
        anomalies = forecast.loc[is_anomaly, ['ds', 'real']].copy()
        anomalies['Group'] = group
        if not anomalies.empty:
            frames.append(anomalies)

    if frames:
        all_anomalies = pd.concat(frames, ignore_index=True)
    else:
        # Keep the expected columns even when nothing was found.
        all_anomalies = pd.DataFrame(columns=['ds', 'real', 'Group'])

    all_anomalies = all_anomalies.rename(
        columns={"ds": "datetime", "real": "monetary value", "Group": "group"}
    )
    all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(
        lambda x: f"{x:.2f}"
    )
    # The table is stored as strings before being handed to the TAPEX tokenizer.
    return all_anomalies.fillna('').astype(str)


@st.cache_resource
def _load_models():
    """Load the translation and TAPEX models once per server process.

    Without caching, every Streamlit rerun (each keystroke/submit/click)
    would call ``from_pretrained`` five times.
    """
    pt_en = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
    en_pt = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
    t5_tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
    tapex = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
    tapex_tok = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
    return pt_en, en_pt, t5_tokenizer, tapex, tapex_tok


def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
    """Translate ``text`` with ``model`` and return the decoded string.

    ``source_lang`` and ``target_lang`` are accepted for call-site readability
    only; the direction is determined by which ``model`` is passed.
    """
    # TODO(review): confirm whether these checkpoints expect a task prefix
    # in the input text; none is added here.
    input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def response(user_question, table_data):
    """Answer a Portuguese question about ``table_data`` and return Portuguese text.

    Uses the module-level models assigned after a file is uploaded. The same
    T5 tokenizer is used for both translation directions.
    """
    question_en = translate(
        user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en"
    )
    encoding = tapex_tokenizer(
        table=table_data,
        query=[question_en],
        padding=True,
        return_tensors="pt",
        truncation=True,
    )
    outputs = tapex_model.generate(**encoding)
    response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translate(
        response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt"
    )


st.markdown("""
Chatbot do Tesouro RS
""", unsafe_allow_html=True)

# File upload interface
uploaded_file = st.file_uploader("carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])

if uploaded_file:
    # Recompute when a different file is uploaded, not only on the first one.
    upload_key = (uploaded_file.name, getattr(uploaded_file, 'size', None))
    if ('all_anomalies' not in st.session_state
            or st.session_state.get('all_anomalies_source') != upload_key):
        with st.spinner('Aplicando modelo de série temporal...'):
            df = _read_upload(uploaded_file)
            df_clean = _prepare_table(df)
            st.session_state['all_anomalies'] = _find_anomalies(df_clean)
            st.session_state['all_anomalies_source'] = upload_key
            # Let a question still in the input box be asked against the new table.
            st.session_state.pop('last_question', None)

    # Load translation and TAPEX models (cached across reruns).
    (pt_en_translator, en_pt_translator, tokenizer,
     tapex_model, tapex_tokenizer) = _load_models()

    # Streamlit interface
    st.dataframe(st.session_state['all_anomalies'].head())

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")

    # The text input keeps its value across reruns; only answer when it
    # changes, so clicking "Limpar" does not re-run the models or re-append
    # the same exchange.
    if user_question != st.session_state.get('last_question'):
        st.session_state['last_question'] = user_question
        if user_question:
            bot_response = response(user_question, st.session_state['all_anomalies'])
            st.session_state['history'].append(('👤', user_question))
            st.session_state['history'].append(('🤖', bot_response))

    if st.button("Limpar"):
        st.session_state['history'] = []

    # Render each exchange exactly once, from the history.
    # NOTE(review): bot messages are rendered with unsafe_allow_html=True and
    # derive from the uploaded table; confirm that raw HTML is acceptable here.
    for sender, message in st.session_state['history']:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            st.markdown(f"\n**🤖 {message}**\n", unsafe_allow_html=True)
else:
    st.warning("Por favor, carregue um arquivo CSV ou XLSX para começar.")