import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
import sentencepiece  # imported so a missing T5Tokenizer dependency fails fast
st.markdown("""
<div style='display: flex; align-items: center;'>
<div style='width: 20px; height: 20px; background-color: green; border-radius: 50%; margin-right: 2px;'></div>
<div style='width: 20px; height: 20px; background-color: red; border-radius: 50%; margin-right: 2px;'></div>
<div style='width: 20px; height: 20px; background-color: yellow; border-radius: 50%; margin-right: 10px;'></div>
<span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
</div>
""", unsafe_allow_html=True)
# File upload interface
uploaded_file = st.file_uploader("Upload a CSV or XLSX file", type=['csv', 'xlsx'])
if uploaded_file:
    # Load the file into a DataFrame
    if uploaded_file.name.endswith('.csv'):
        df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    elif uploaded_file.name.endswith('.xlsx'):
        df = pd.read_excel(uploaded_file)
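    # NOTE: the slicing below assumes the Tesouro RS spreadsheet layout implied
    # by the hard-coded indices (an assumption, not documented in the file):
    # two header rows, nine leading metadata columns, one trailing column, and
    # monthly value columns labelled like "Jan/2023 (1)".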
    # Data preprocessing for Prophet
    new_df = df.iloc[2:, 9:-1].fillna(0)
    new_df.columns = df.iloc[1, 9:-1]
    # Strip footnote markers such as " (1)" from the column names
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }
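    # Normalize headers like "Jan/2023" to "01/2023" so they can be parsed as
    # dates below; the label column 'Rótulos de Linha' is passed through as-is.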
    def convert_column_name(column_name):
        if column_name == 'Rótulos de Linha':
            return column_name
        parts = column_name.split('/')
        month = parts[0].strip()
        year = ''.join(filter(str.isdigit, parts[1]))
        month_number = month_dict.get(month, '00')
        return f"{month_number}/{year}"

    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    # Parse the "MM/YYYY" headers into Timestamps; the label column becomes NaT
    # and is renamed to 'Rotulo'
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    df_clean = new_df.copy()
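    # Anomaly detection: each remaining row of df_clean is one monthly time
    # series. Prophet is fit per series with a 95% prediction interval, and
    # observed points falling outside that interval are flagged as anomalies.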
    # Create an empty DataFrame to store all anomalies
    all_anomalies = pd.DataFrame()

    # Process each row in the DataFrame
    for index, row in df_clean.iterrows():
        data = pd.DataFrame({
            'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
            'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
        })
        data = data[data['y'] > 0].reset_index(drop=True)
        if data.empty or len(data) < 2:
            print(f"Skipping group {row['Rotulo']} because there are fewer than 2 non-zero observations.")
            continue

        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Skipping group {row['Rotulo']} due to error: {e}")
            continue

        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        # Align the observed values with the forecast rows; future months get NaN
        num_real = len(data)
        num_forecast = len(forecast)
        forecast['real'] = list(data['y']) + [None] * (num_forecast - num_real)

        # Flag observations outside the 95% prediction interval; .copy() avoids
        # a SettingWithCopyWarning when the 'Group' column is added
        anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) |
                             (forecast['real'] > forecast['yhat_upper'])].copy()
        anomalies['Group'] = row['Rotulo']
        all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)
    # Prepare the anomalies DataFrame for the TAPEX model, which expects every table cell as a string
    all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "explanation"}, inplace=True)
    all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
    all_anomalies = all_anomalies.fillna('').astype(str)
    # Load translation and TAPEX models once and cache them across Streamlit
    # reruns (st.cache_resource keeps the heavy downloads out of the rerun path)
    @st.cache_resource
    def load_models():
        pt_en = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
        en_pt = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
        t5_tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
        tapex = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        tapex_tok = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        return pt_en, en_pt, t5_tokenizer, tapex, tapex_tok

    pt_en_translator, en_pt_translator, tokenizer, tapex_model, tapex_tokenizer = load_models()
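    # TAPEX (microsoft/tapex-large-finetuned-wtq) is finetuned on the English
    # WikiTableQuestions benchmark, so questions are translated pt -> en before
    # querying the table, and the answer is translated en -> pt afterwards.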
    def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
        # The unicamp-dl T5 models are trained with a task prefix such as
        # "translate Portuguese to English: ", so build it from the language codes
        lang_names = {"pt": "Portuguese", "en": "English"}
        prefix = f"translate {lang_names[source_lang]} to {lang_names[target_lang]}: "
        input_ids = tokenizer.encode(prefix + text, return_tensors="pt", add_special_tokens=True)
        outputs = model.generate(input_ids)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    def response(user_question, table_data):
        question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
        encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
        outputs = tapex_model.generate(**encoding)
        response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
        return response_pt
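    # Hypothetical usage (illustrative only, not part of the app flow):
    #   response("Qual é o maior valor?", all_anomalies)
    # queries TAPEX with the English translation of the question and returns
    # the answer translated back to Portuguese.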
    # Streamlit interface
    st.dataframe(all_anomalies.head())

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")

    if user_question:
        st.session_state['history'].append(('👤', user_question))
        bot_response = response(user_question, all_anomalies)
        st.session_state['history'].append(('🤖', bot_response))

    if st.button("Limpar"):
        st.session_state['history'] = []

    # Render the conversation once, from the stored history, so the current
    # exchange is not drawn twice
    for sender, message in st.session_state['history']:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
else:
    st.warning("Please upload a CSV or XLSX file to start.")