"""Streamlit chatbot over anomalies detected in an uploaded monthly table.

Flow: the user uploads a CSV/XLSX file, the monthly columns are parsed into
dates, a Prophet model is fitted per row to flag values falling outside the
95% forecast interval, and the resulting anomaly table is queried through a
TAPEX model. Questions and answers are translated PT <-> EN with T5 models.
"""

import datetime

import pandas as pd
import sentencepiece as spm
import streamlit as st
from prophet import Prophet
from transformers import (
    BartForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    TapexTokenizer,
)

# Portuguese month abbreviations as they appear in the column headers.
MONTH_NUMBERS = {
    'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
    'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
    'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12',
}

ANOMALY_COLUMNS = ['ds', 'real', 'Group']

# Streamlit reruns the whole script on every interaction. Cache the expensive
# steps when the installed Streamlit offers the decorators; otherwise fall
# back to a no-op so the script still runs (uncached, as it originally did).
_cache_resource = getattr(st, "cache_resource", lambda func: func)
_cache_data = getattr(st, "cache_data", lambda func: func)


def convert_column_name(column_name):
    """Convert a ``'Mon/YYYY'`` header (Portuguese month) to ``'MM/YYYY'``.

    The label header ``'Rótulos de Linha'`` and any value that is not a
    string containing ``'/'`` are returned unchanged instead of raising.
    Unknown month abbreviations map to ``'00'``, which later fails date
    parsing on purpose.
    """
    if column_name == 'Rótulos de Linha':
        return column_name
    if not isinstance(column_name, str) or '/' not in column_name:
        return column_name
    parts = column_name.split('/')
    month = parts[0].strip()
    year = ''.join(filter(str.isdigit, parts[1]))
    month_number = MONTH_NUMBERS.get(month, '00')
    return f"{month_number}/{year}"


def load_table(uploaded_file):
    """Read the uploaded CSV or XLSX file into a DataFrame.

    Raises:
        ValueError: if the file extension is neither ``.csv`` nor ``.xlsx``.
    """
    name = uploaded_file.name.lower()
    if name.endswith('.csv'):
        return pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    if name.endswith('.xlsx'):
        return pd.read_excel(uploaded_file)
    raise ValueError(f"unsupported file type: {uploaded_file.name}")


def prepare_monthly_frame(df):
    """Slice the raw sheet into a label column plus one column per month.

    Data rows start at position 2, the header names live in row 1, and the
    relevant columns are positions 9 up to (excluding) the last one. The
    first selected column is named ``'Rotulo'``; the remaining headers are
    parsed into ``pd.Timestamp`` labels and columns whose header cannot be
    parsed are dropped.

    Raises:
        ValueError: if the sheet is too small for that layout.
    """
    selected = df.iloc[2:, 9:-1]
    if df.shape[0] < 3 or selected.shape[1] < 2:
        raise ValueError(
            "the sheet needs at least 3 rows and a label column followed "
            "by at least one month column starting at column 10"
        )

    values = selected.fillna(0)
    header = df.iloc[1, 9:-1].astype(str)
    # Strip suffixes such as " (2)" from the header names.
    header = header.str.replace(r" \(\d+\)", "", regex=True)
    converted = [convert_column_name(name) for name in header]

    # Parse only the month headers; the first column is the row label and is
    # assigned by position so it can never collide with failed parses.
    parsed = pd.to_datetime(
        pd.Series(converted[1:], dtype="object"),
        format="%m/%Y",
        errors='coerce',
    )
    labels = ['Rotulo', *parsed]
    keep = [True] + [not pd.isna(label) for label in labels[1:]]

    values.columns = labels
    return values.loc[:, keep].copy()


@_cache_data
def detect_anomalies(df_clean):
    """Fit Prophet per row and collect observations outside the 95% interval.

    Returns a DataFrame with columns ``ds``, ``real`` and ``Group`` (the
    row's ``'Rotulo'`` value). Rows with fewer than two positive numeric
    observations, or for which Prophet raises ``ValueError``, are skipped.
    """
    date_positions = [
        pos for pos, col in enumerate(df_clean.columns)
        if isinstance(col, pd.Timestamp)
    ]
    date_labels = [df_clean.columns[pos] for pos in date_positions]

    frames = []
    for _, row in df_clean.iterrows():
        group = row['Rotulo']
        data = pd.DataFrame({
            'ds': date_labels,
            # Values may arrive as strings/objects because the header rows
            # share the columns; coerce so the comparison below is numeric.
            'y': pd.to_numeric(
                pd.Series(row.iloc[date_positions].to_numpy()),
                errors='coerce',
            ),
        })
        data = data[data['y'] > 0].sort_values('ds').reset_index(drop=True)

        if len(data) < 2:
            print(f"Skipping group {group} because there are less than 2 non-zero observations.")
            continue

        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Skipping group {group} due to error: {e}")
            continue

        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Attach the observed values by date; future rows get NaN and can
        # therefore never satisfy the comparisons below.
        forecast = forecast.merge(
            data.rename(columns={'y': 'real'}), on='ds', how='left'
        )
        outside = (
            (forecast['real'] < forecast['yhat_lower'])
            | (forecast['real'] > forecast['yhat_upper'])
        )
        frames.append(forecast.loc[outside, ['ds', 'real']].assign(Group=group))

    if not frames:
        return pd.DataFrame(columns=ANOMALY_COLUMNS)
    return pd.concat(frames, ignore_index=True)[ANOMALY_COLUMNS]


def format_anomalies_for_tapex(all_anomalies):
    """Rename the anomaly columns and stringify every cell for TAPEX."""
    table = all_anomalies.rename(
        columns={"ds": "datetime", "real": "monetary value", "Group": "explanation"}
    )
    table['monetary value'] = table['monetary value'].apply(lambda x: f"{x:.2f}")
    return table.fillna('').astype(str)


@_cache_resource
def load_models():
    """Load the translation and TAPEX models once per server process."""
    return {
        "pt_en_translator": T5ForConditionalGeneration.from_pretrained(
            "unicamp-dl/translation-pt-en-t5"
        ),
        "en_pt_translator": T5ForConditionalGeneration.from_pretrained(
            "unicamp-dl/translation-en-pt-t5"
        ),
        "tokenizer": T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5"),
        "tapex_model": BartForConditionalGeneration.from_pretrained(
            "microsoft/tapex-large-finetuned-wtq"
        ),
        "tapex_tokenizer": TapexTokenizer.from_pretrained(
            "microsoft/tapex-large-finetuned-wtq"
        ),
    }


def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
    """Translate ``text`` with the given seq2seq ``model`` and ``tokenizer``.

    NOTE(review): ``source_lang`` and ``target_lang`` are accepted but not
    used; the direction is decided solely by which model is passed in.
    Confirm against the model cards whether a task prefix is expected.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def response(user_question, table_data):
    """Answer a Portuguese question about ``table_data`` in Portuguese.

    The question is translated to English, answered by TAPEX over the
    table, and the answer is translated back to Portuguese.
    """
    models = load_models()
    tokenizer = models["tokenizer"]
    tapex_tokenizer = models["tapex_tokenizer"]

    question_en = translate(
        user_question, models["pt_en_translator"], tokenizer,
        source_lang="pt", target_lang="en",
    )
    encoding = tapex_tokenizer(
        table=table_data, query=[question_en],
        padding=True, return_tensors="pt", truncation=True,
    )
    outputs = models["tapex_model"].generate(**encoding)
    response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translate(
        response_en, models["en_pt_translator"], tokenizer,
        source_lang="en", target_lang="pt",
    )


def render_history(history):
    """Draw every stored chat message in order."""
    for sender, message in history:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            # NOTE(review): unsafe_allow_html is kept from the original call;
            # the message itself is model output, not trusted markup.
            st.markdown(f"\n**🤖 {message}**\n", unsafe_allow_html=True)


def main():
    """Run the upload -> anomaly detection -> chat interface."""
    st.markdown("""
Chatbot do Tesouro RS
""", unsafe_allow_html=True)

    # File upload interface
    uploaded_file = st.file_uploader("Upload a CSV or XLSX file", type=['csv', 'xlsx'])
    if not uploaded_file:
        st.warning("Please upload a CSV or XLSX file to start.")
        return

    try:
        df_clean = prepare_monthly_frame(load_table(uploaded_file))
    except ValueError as exc:
        st.error(f"Could not read the uploaded file: {exc}")
        return

    # Preparing anomalies DataFrame for TAPEX model
    all_anomalies = format_anomalies_for_tapex(detect_anomalies(df_clean))

    # Streamlit interface
    st.dataframe(all_anomalies.head())
    if all_anomalies.empty:
        st.info("No anomalies were detected in the uploaded data.")
        return

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")

    # Check the button before answering: clicking it reruns the script with
    # the previous question still in the text box, and that question must
    # not be answered again just to be wiped immediately afterwards.
    if st.button("Limpar"):
        st.session_state['history'] = []
    elif user_question:
        st.session_state['history'].append(('👤', user_question))
        bot_response = response(user_question, all_anomalies)
        st.session_state['history'].append(('🤖', bot_response))

    # The newest exchange is already part of the history, so it is drawn
    # exactly once here.
    render_history(st.session_state['history'])


if __name__ == "__main__":
    main()