File size: 6,314 Bytes
e2af017
 
 
96bff79
e2af017
 
 
96bff79
 
 
 
 
 
 
 
 
e2af017
96bff79
e2af017
 
 
 
 
 
 
 
96bff79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2af017
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96bff79
e2af017
 
 
 
 
 
 
 
 
 
 
96bff79
e2af017
 
 
 
 
 
 
 
 
 
 
 
 
96bff79
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
import datetime
import sentencepiece as spm

st.markdown("""
<div style='display: flex; align-items: center;'>
    <div style='width: 20px; height: 20px; background-color: green; border-radius: 50%; margin-right: 2px;'></div>
    <div style='width: 20px; height: 20px; background-color: red; border-radius: 50%; margin-right: 2px;'></div>
    <div style='width: 20px; height: 20px; background-color: yellow; border-radius: 50%; margin-right: 10px;'></div>
    <span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
</div>
""", unsafe_allow_html=True)

# File upload interface
uploaded_file = st.file_uploader("Upload a CSV or XLSX file", type=['csv', 'xlsx'])

if uploaded_file:
    # Load the file into a DataFrame
    if uploaded_file.name.endswith('.csv'):
        df = pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    elif uploaded_file.name.endswith('.xlsx'):
        df = pd.read_excel(uploaded_file)

    # Data preprocessing for Prophet
    new_df = df.iloc[2:, 9:-1].fillna(0)
    new_df.columns = df.iloc[1, 9:-1]
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)

    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }

    def convert_column_name(column_name):
        if column_name == 'Rótulos de Linha':
            return column_name
        parts = column_name.split('/')
        month = parts[0].strip()
        year = parts[1].strip()
        year = ''.join(filter(str.isdigit, year))
        month_number = month_dict.get(month, '00')
        return f"{month_number}/{year}"

    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    df_clean = new_df.copy()

    # Create an empty DataFrame to store all anomalies
    all_anomalies = pd.DataFrame()

    # Process each row in the DataFrame
    for index, row in df_clean.iterrows():
        data = pd.DataFrame({
            'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
            'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
        })

        data = data[data['y'] > 0].reset_index(drop=True)
        if data.empty or len(data) < 2:
            print(f"Skipping group {row['Rotulo']} because there are less than 2 non-zero observations.")
            continue

        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Skipping group {row['Rotulo']} due to error: {e}")
            continue

        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)

        num_real = len(data)
        num_forecast = len(forecast)
        real_values = list(data['y']) + [None] * (num_forecast - num_real)
        forecast['real'] = real_values
        anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) | (forecast['real'] > forecast['yhat_upper'])]

        anomalies['Group'] = row['Rotulo']
        all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)

    # Preparing anomalies DataFrame for TAPEX model
    all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "explanation"}, inplace=True)
    all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
    all_anomalies = all_anomalies.fillna('').astype(str)

    # Load translation models
    pt_en_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5")
    en_pt_translator = T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5")
    tokenizer = T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")

    # Load TAPEX model
    tapex_model = BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq")
    tapex_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")

    def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
        input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
        outputs = model.generate(input_ids)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated_text

    def response(user_question, table_data):
        question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
        encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
        outputs = tapex_model.generate(**encoding)
        response_en = tapex_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response_pt = translate(response_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
        return response_pt

    # Streamlit interface

    st.dataframe(all_anomalies.head())

    # Chat history
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    user_question = st.text_input("Escreva sua questão aqui:", "")

    if user_question:
        st.session_state['history'].append(('👤', user_question))
        st.markdown(f"**👤 {user_question}**")
        
        bot_response = response(user_question, all_anomalies)
        
        st.session_state['history'].append(('🤖', bot_response))
        st.markdown(f"<div style='text-align: right'>**🤖 {bot_response}**</div>", unsafe_allow_html=True)

    if st.button("Limpar"):
        st.session_state['history'] = []

    for sender, message in st.session_state['history']:
        if sender == '👤':
            st.markdown(f"**👤 {message}**")
        elif sender == '🤖':
            st.markdown(f"<div style='text-align: right'>**🤖 {message}**</div>", unsafe_allow_html=True)
else:
    st.warning("Please upload a CSV or XLSX file to start.")