import streamlit as st
import pandas as pd
import time
from datetime import datetime

import numpy as np
import pmdarima as pm
import matplotlib.pyplot as plt
from pmdarima import auto_arima
# import plotly.graph_objects as go

import torch
from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering

# Page-level settings: browser tab title and icon, wide layout, and a sidebar
# that starts expanded.
# NOTE(review): Streamlit documents that this call should come before any
# other st.* command - keep it at the top of the script.
st.set_page_config(
      page_title="Sales Forecasting System",
      page_icon="📈",
      layout="wide",
      initial_sidebar_state="expanded",
)

# Preprocessing
@st.cache_data
def merge(B, C, A):
    """Merge two date-sorted runs ``B`` and ``C`` into ``A``, in place.

    The 'Date' values of all three frames are interpreted with
    ``pd.to_datetime`` and compared as ``datetime.date`` objects. Rows are
    taken from ``B`` first on ties, so the merge is stable.

    Args:
        B: Left run; needs 'Date' and 'Sales' columns, sorted by date.
        C: Right run; same requirements as ``B``.
        A: Destination frame. Its first ``len(B) + len(C)`` rows of 'Date'
            and 'Sales' are overwritten; the whole 'Date' column ends up
            holding ``datetime.date`` objects.

    Returns:
        ``A`` itself, after the in-place update.

    Raises:
        IndexError: If ``A`` has fewer rows than ``B`` and ``C`` combined.
    """
    # Snapshot both runs as plain lists *before* touching A: B and C are
    # normally slices of A, so writing into A first could clobber rows that
    # have not been read yet.
    left_dates = list(pd.to_datetime(B['Date']).dt.date)
    left_sales = list(B['Sales'])
    right_dates = list(pd.to_datetime(C['Date']).dt.date)
    right_sales = list(C['Sales'])

    out_dates = list(pd.to_datetime(A['Date']).dt.date)
    out_sales = list(A['Sales'])

    total = len(left_dates) + len(right_dates)
    if total > len(out_dates):
        raise IndexError("destination frame is shorter than the merged runs")

    i = j = 0
    for k in range(total):
        left_available = i < len(left_dates)
        right_available = j < len(right_dates)
        if left_available and (not right_available or left_dates[i] <= right_dates[j]):
            out_dates[k] = left_dates[i]
            out_sales[k] = left_sales[i]
            i += 1
        else:
            out_dates[k] = right_dates[j]
            out_sales[k] = right_sales[j]
            j += 1

    # Whole-column assignment instead of chained ``A[col].iloc[k] = ...``:
    # the chained form writes to a temporary under pandas Copy-on-Write and
    # would leave A unchanged.
    A['Date'] = out_dates
    A['Sales'] = out_sales
    return A

@st.cache_data
def merge_sort(dataframe):
    """Sort ``dataframe`` by 'Date' with a top-down merge sort.

    Each half is sorted on an explicit copy and the *returned* halves are
    merged back into ``dataframe`` by ``merge``. Using copies keeps the
    halves from aliasing the destination while it is being overwritten, and
    using the return values means the recursion does not depend on the
    recursive calls mutating their argument.

    Args:
        dataframe: Frame with 'Date' and 'Sales' columns.

    Returns:
        The frame returned by ``merge`` (``dataframe`` itself when ``merge``
        works in place), or ``dataframe`` unchanged when it has at most one
        row.
    """
    if len(dataframe) <= 1:
        return dataframe

    center = len(dataframe) // 2
    left = merge_sort(dataframe.iloc[:center].copy())
    right = merge_sort(dataframe.iloc[center:].copy())
    return merge(left, right, dataframe)

@st.cache_data
def drop(dataframe):
    """Keep only date/sale columns and discard rows with missing values.

    A column is kept when its name contains "date" or "sale", compared
    case-insensitively. Every other column is dropped, then any row that
    still contains a missing value is removed. The input frame is not
    modified; a new frame is returned.
    """
    keywords = ("date", "sale")

    def is_wanted(column_name):
        lowered = column_name.lower()
        for keyword in keywords:
            if keyword.lower() in lowered:
                return True
        return False

    unwanted = [name for name in dataframe.columns if not is_wanted(name)]
    return dataframe.drop(columns=unwanted).dropna()

@st.cache_data
def date_format(dataframe):
    """Parse the 'Date' column from ``MM/DD/YYYY`` text into date objects.

    Surrounding whitespace is stripped from every value before parsing. The
    column is replaced on ``dataframe`` itself, and the same frame is
    returned.

    Args:
        dataframe: Frame with a 'Date' column of strings; any other columns
            are left untouched.

    Returns:
        ``dataframe``, with 'Date' holding ``datetime.date`` objects.

    Raises:
        ValueError: If a value does not match ``%m/%d/%Y``.
        AttributeError: If a value is not a string.
    """
    # One pass and one whole-column assignment. The previous per-row
    # ``dataframe['Date'][i] = ...`` was chained assignment (unreliable under
    # pandas Copy-on-Write) and its ``for i, d, s in itertuples()`` unpacking
    # only worked for frames with exactly two columns.
    dataframe['Date'] = [
        datetime.strptime(raw.strip(), "%m/%d/%Y").date()
        for raw in dataframe['Date']
    ]
    return dataframe

@st.cache_data
def group_to_three(dataframe):
    """Average 'Sales' over consecutive 3-day bins.

    'Date' is converted with ``pd.to_datetime`` on a copy, so the caller's
    frame is left untouched (the previous version overwrote its 'Date'
    column as a side effect).

    Args:
        dataframe: Frame with 'Date' and 'Sales' columns.

    Returns:
        A Series named 'Sales', indexed by the bin dates, holding the mean
        rounded to two decimals. Bins whose mean is missing or exactly 0 are
        removed.
    """
    frame = dataframe.assign(Date=pd.to_datetime(dataframe['Date']))
    binned = frame.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
    # Zero means are treated like empty bins and dropped together with them.
    return binned.replace(0, np.nan).dropna()

@st.cache_data
def series_to_df_exogenous(series):
    """Turn a 'Date'-indexed sales series into a frame with difference columns.

    The series becomes a frame indexed by 'Date'. Two columns are added:
    'Sales First Difference' (value minus the previous row) and 'Seasonal
    First Difference' (value minus the row 12 positions earlier). Rows with
    any missing value are removed both before and after the columns are
    added.
    """
    frame = series.to_frame().reset_index().set_index('Date').dropna()

    sales = frame['Sales']
    # Exogenous regressors: lag-1 and lag-12 differences.
    frame['Sales First Difference'] = sales - sales.shift(1)
    frame['Seasonal First Difference'] = sales - sales.shift(12)

    return frame.dropna()

@st.cache_data
def dates_df(dataframe):
    """Return a display copy of ``dataframe`` with every value as text.

    The index is moved into a regular column, 'Date' is formatted like
    ``January 05, 2023`` and every column is cast to ``str``.
    """
    table = dataframe.reset_index()
    table['Date'] = table['Date'].dt.strftime('%B %d, %Y')
    return table.astype(str)

@st.cache_data
def get_forecast_period(period):
    """Convert a number of days into a number of steps of three days each.

    The quotient ``period / 3`` is rounded with the built-in ``round``.
    """
    days_per_step = 3
    steps = period / days_per_step
    return round(steps)

# SARIMAX Model
@st.cache_data
def train_test(dataframe):
    """Split a frame chronologically into training and test parts.

    The last ``round(len(dataframe) * 0.2)`` rows form the test part. The
    first column is the target and the remaining columns are the exogenous
    regressors.

    Args:
        dataframe: Frame whose first column is the target.

    Returns:
        Tuple ``(training_y, test_y, test_y_series, training_X, test_X,
        future_X)`` where ``test_y_series`` is ``test_y`` rebuilt as a Series
        on the same index and ``future_X`` is the regressor columns for every
        row.
    """
    holdout = round(len(dataframe) * 0.2)
    # Slice from an explicit split position. With a negative offset, a
    # holdout of 0 turned ``[:-0]`` into "nothing" and ``[-0:]`` into
    # "everything", swapping the two parts on very short frames.
    split = len(dataframe) - holdout

    training_y = dataframe.iloc[:split, 0]
    test_y = dataframe.iloc[split:, 0]
    test_y_series = pd.Series(test_y, index=test_y.index)
    training_X = dataframe.iloc[:split, 1:]
    test_X = dataframe.iloc[split:, 1:]
    future_X = dataframe.iloc[0:, 1:]
    return (training_y, test_y, test_y_series, training_X, test_X, future_X)

@st.cache_data
def test_fitting(dataframe, Exo, trainY):
    """Run ``auto_arima`` on the training data and return the fitted model.

    ``Exo`` is passed as the exogenous matrix ``X`` and ``trainY`` as the
    target ``y``. ``dataframe`` is accepted but not used by this function.
    """
    search_options = dict(
        start_p=1, start_q=1,
        test='adf', min_p=1, min_q=1,
        max_p=3, max_q=3, m=12,
        start_P=0, seasonal=True,
        d=None, D=1, trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
    return auto_arima(X=Exo, y=trainY, **search_options)

@st.cache_data
def forecast_accuracy(forecast, actual):
    """Compute accuracy metrics for a forecast against the actual values.

    Both arguments are indexed as ``x[:, None]``, so they are expected to be
    array-like with NumPy-style indexing. Returns a dict with keys 'mape'
    (rounded to 4 decimals), 'rmse' (rounded to 2 decimals), 'corr' and
    'min-max'.
    """
    error = forecast - actual

    # Mean absolute percentage error, expressed as a fraction.
    mape = np.mean(np.abs(error) / np.abs(actual)).round(4)
    # Root mean squared error.
    rmse = (np.mean(error ** 2) ** .5).round(2)
    # Pearson correlation between forecast and actual.
    corr = np.corrcoef(forecast, actual)[0, 1]

    # Pair each forecast with its actual value, then compare the smaller of
    # the two against the larger, row by row.
    paired = np.hstack([forecast[:, None], actual[:, None]])
    lows = np.amin(paired, axis=1)
    highs = np.amax(paired, axis=1)
    minmax = 1 - np.mean(lows / highs)

    return {'mape': mape, 'rmse': rmse, 'corr': corr, 'min-max': minmax}

@st.cache_data
def sales_growth(dataframe, fittedValues):
    """Build a table of forecast sales with step-to-step change and growth.

    Args:
        dataframe: Historical frame with a 'Sales' column; its last two
            values seed the first row of the change and growth columns.
        fittedValues: Series of forecast values; its index becomes 'Date'.

    Returns:
        Frame indexed by 'Date' with columns 'Sales' (rounded to 2 decimals),
        'Forecasted Sales First Difference' and 'Forecasted Sales Growth'
        (percent change relative to the previous value), both rounded to 2
        decimals.

    Raises:
        IndexError: If ``dataframe`` has fewer than two rows while
            ``fittedValues`` is non-empty.
    """
    difference_col = 'Forecasted Sales First Difference'
    growth_col = 'Forecasted Sales Growth'

    growth = fittedValues.to_frame().reset_index()
    growth.columns = ("Date", "Sales")
    growth = growth.set_index('Date')
    growth['Sales'] = growth['Sales'].round(2)

    previous = growth['Sales'].shift(1)
    step = growth['Sales'] - previous
    growth[difference_col] = step.round(2)
    growth[growth_col] = ((step / previous) * 100).round(2)

    if len(growth) > 0:
        # The first forecast row has no predecessor, so it is seeded from the
        # last two historical values instead.
        latest = dataframe['Sales'].iloc[-1]
        earlier = dataframe['Sales'].iloc[-2]
        seed_step = latest - earlier
        # Positional single-cell writes; the chained ``frame[col].iloc[0] = x``
        # form does not update the frame under pandas Copy-on-Write.
        growth.iloc[0, growth.columns.get_loc(difference_col)] = round(float(seed_step), 2)
        # Growth is measured against the earlier value, matching the formula
        # used for every other row (it used to divide by the later value).
        growth.iloc[0, growth.columns.get_loc(growth_col)] = round(float(seed_step / earlier * 100), 2)

    return growth

@st.cache_data
def merge_forecast_data(actual, predicted, future):
    """Combine actual, predicted and future series into one frame.

    Each series becomes a single column ("Actual Sales", "Predicted Sales",
    "Future Forecasted Sales"); the index of ``future`` is named 'Date'
    first. The columns are aligned side by side on their indexes, gaps are
    filled with -1, and the index is moved into a regular column.
    """
    def as_column(series, label):
        # Single-column frame whose only column is renamed to ``label``.
        frame = series.to_frame()
        return frame.rename(columns={frame.columns[0]: label})

    actual_frame = as_column(actual, "Actual Sales")
    predicted_frame = as_column(predicted, "Predicted Sales")
    future_frame = as_column(future.to_frame().rename_axis('Date').iloc[:, 0], "Future Forecasted Sales")

    combined = pd.concat([actual_frame, predicted_frame, future_frame], axis=1)
    return combined.fillna(-1).reset_index()

@st.cache_data
def get_combined_date(actual, predicted, future):
    """Stack the 'Date' values of the three series into one column.

    Each series is turned into a frame and its index moved into a column;
    the index of ``future`` is named 'Date' first, while ``actual`` and
    ``predicted`` must already carry an index named 'Date'. The three 'Date'
    columns are concatenated in the order actual, predicted, future and
    renumbered from 0.
    """
    def date_column(frame):
        # Move the index into a column and keep only 'Date'.
        return frame.reset_index()['Date'].to_frame()

    pieces = [
        date_column(actual.to_frame()),
        date_column(predicted.to_frame()),
        date_column(future.to_frame().rename_axis('Date')),
    ]

    stacked = pd.concat(pieces).reset_index()
    return stacked.drop(columns='index')

# TAPAS Model

@st.cache_resource
def load_tapas_model():
    """Build the table question-answering pipeline.

    Loads the tokenizer and then the model for the
    ``google/tapas-large-finetuned-wtq`` checkpoint and wraps both in a
    ``table-question-answering`` pipeline, which is returned.
    """
    checkpoint = "google/tapas-large-finetuned-wtq"
    tapas_tokenizer = TapasTokenizer.from_pretrained(checkpoint)
    tapas_model = TapasForQuestionAnswering.from_pretrained(checkpoint, local_files_only=False)
    return pipeline("table-question-answering", model=tapas_model, tokenizer=tapas_tokenizer)

# Module-level question-answering pipeline used by get_answer(). It is built
# on every script run; load_tapas_model is wrapped in st.cache_resource.
pipe = load_tapas_model()

def get_answer(table, query):
    """Run the module-level ``pipe`` on ``table`` and ``query``; return its raw result."""
    return pipe(table=table, query=query)

def convert_answer(answer):
    """Apply the predicted aggregation to the cells selected by the model.

    Args:
        answer: Mapping with an 'aggregator' entry and, for aggregated
            answers, a 'cells' list of cell strings. Thousands separators
            (",") in the cells are ignored.

    Returns:
        * 'SUM': the sum of the cells as numbers.
        * 'AVERAGE': the mean of the cells, or ``None`` when no cell was
          selected.
        * 'COUNT': the number of selected cells.
        * anything else: ``answer`` itself, unchanged.

    Raises:
        ValueError: If a cell cannot be parsed as a number for 'SUM' or
            'AVERAGE'.
    """
    aggregator = answer.get('aggregator')
    if aggregator not in ('SUM', 'AVERAGE', 'COUNT'):
        return answer

    cells = answer['cells']

    if aggregator == 'COUNT':
        # COUNT means "how many cells were selected". The previous code summed
        # int(cell), which both gave the wrong number and raised ValueError on
        # decimal cells such as "12.5".
        return len(cells)

    values = [float(cell.replace(',', '')) for cell in cells]

    if aggregator == 'SUM':
        return sum(values)

    # AVERAGE: avoid dividing by zero when the model selected no cells.
    if not values:
        return None
    return sum(values) / len(values)

def get_converted_answer(table, query):
    """Ask the model about ``table`` and post-process its answer.

    The raw result of ``get_answer`` is passed through ``convert_answer``
    and the converted value is returned.
    """
    raw_answer = get_answer(table, query)
    return convert_answer(raw_answer)

# Session States
# 'uploaded' gates the main dashboard section below and 'forecasted' keeps the
# forecast visible on later reruns; both default to False when first created.
if 'uploaded' not in st.session_state:
    st.session_state.uploaded = False

if 'forecasted' not in st.session_state:
    st.session_state.forecasted = False

# Web Application
st.title("Forecasting Dashboard 📈")
# Show the welcome prompt only while no file has been processed yet.
if not st.session_state.uploaded:
  st.subheader("Welcome User, get started forecasting by uploading your file in the sidebar!")

# Sidebar Menu
with st.sidebar:
    # TODO Name for product
    st.title("MLCast v1.0")
    st.subheader("An intelligent sales forecasting system")
    uploaded_file = st.file_uploader("Upload your store data here to proceed (must atleast contain Date and Sales)", type=["csv"])
    if uploaded_file is None:
        # No file in this run, so `series` is not rebuilt below. Clear the
        # flags so the dashboard section does not run against an undefined
        # `series` after the user removes a previously uploaded file.
        st.session_state.uploaded = False
        st.session_state.forecasted = False
    else:
        df = pd.read_csv(uploaded_file, parse_dates=True)

        # Both a Date-like and a Sales-like column must be present.
        date_found = any('Date' in str(column) for column in df.columns)
        sales_found = any('Sales' in str(column) for column in df.columns)
        if not (date_found and sales_found):
            st.error('Please upload a csv containing both Date and Sales...')
            st.stop()

        st.success("File uploaded successfully!")
        st.write("Your uploaded data:")
        st.write(df)

        # Preprocess: keep date/sales columns, parse dates, sort, 3-day means.
        df = drop(df)
        df = date_format(df)
        # Use the returned frame: merge_sort is wrapped in st.cache_data, so
        # relying on it to mutate `df` in place is not safe on a cache hit.
        df = merge_sort(df)
        series = group_to_three(df)

        st.session_state.uploaded = True

    with open('sample.csv', 'rb') as f:
        st.download_button("Download our sample CSV", f, file_name='sample.csv')

if st.session_state.uploaded:
    st.subheader("Sales History")
    st.line_chart(series)

    MIN_DAYS = 30
    MAX_DAYS = 90
    period = st.slider('How many days would you like to forecast?', min_value=MIN_DAYS, max_value=MAX_DAYS)
    forecast_period = get_forecast_period(period)

    forecast_button = st.button(
        'Start Forecasting',
        key='forecast_button',
        type="primary",
    )

    if forecast_button or st.session_state.forecasted:
        # History frame with the exogenous difference columns. It keeps the
        # name `df` for the whole section and is never reassigned, because the
        # chart below needs the real sales history.
        df = series_to_df_exogenous(series)

        training_y, test_y, test_y_series, training_X, test_X, future_X = train_test(df)
        train_test_model = test_fitting(df, training_X, training_y)

        n_periods = round(len(df) * 0.2)
        future_n_periods = forecast_period + n_periods

        # Predictions over the held-out test window, aligned to its dates.
        fitted, confint = train_test_model.predict(X=test_X, n_periods=n_periods, return_conf_int=True)
        fitted_series = pd.Series(fitted)
        fitted_series.index = test_y_series.index

        # Future predictions
        # NOTE(review): the exogenous values passed here are the last
        # `future_n_periods` historical rows; if the history is shorter than
        # that, fewer rows than `n_periods` are supplied - confirm intent.
        frequency = '3D'
        future_fitted, confint = train_test_model.predict(X=df.iloc[-future_n_periods:,1:], n_periods=future_n_periods, return_conf_int=True, freq=frequency)
        future_index_of_fc = pd.date_range(df['Sales'].index[-1], periods=future_n_periods, freq=frequency)
        future_fitted_series = pd.Series(future_fitted)
        future_fitted_series.index = future_index_of_fc

        # Table of forecast values shown next to the chart and queried by the
        # question form. It has its own name so it no longer overwrites `df`,
        # which previously made the chart plot the stringified forecast table
        # as "Actual Sales".
        future_sales_growth = sales_growth(df, future_fitted_series)
        future_sales_growth = future_sales_growth.iloc[n_periods:]
        forecast_table = dates_df(future_sales_growth)

        test_y, predictions = np.array(test_y), np.array(fitted)
        acc = forecast_accuracy(predictions, test_y)

        col = st.columns(2)
        with col[0]:
            col[0].header("Sales Forecast")
            merged_data = merge_forecast_data(df['Sales'], fitted_series, future_fitted_series)
            # The first column is the former index (the dates); take its name
            # from the frame instead of hard-coding it.
            col[0].line_chart(merged_data, x=merged_data.columns[0], y=["Actual Sales", "Predicted Sales", "Future Forecasted Sales"])
            col[0].write(f"MAPE score: {acc['mape']} (lower is better)")
        with col[1]:
            col[1].subheader(f"Forecasted sales in the next {period} days")
            col[1].write(forecast_table)
        st.session_state.forecasted = True

        with st.form("question_form"):
            question = st.text_input('Ask a Question about the Forecasted Data', placeholder="What is the total sales in the month of December?")
            query_button = st.form_submit_button(label='Generate Answer')

        if question.strip():
            answer = get_converted_answer(forecast_table, question)
            # Non-aggregated answers come back as the raw result mapping;
            # show its text rather than the whole mapping.
            if isinstance(answer, dict):
                answer = answer.get('answer')
            if answer is not None and answer != '':
                # The value has to be part of the text: a second positional
                # argument to st.subheader is the anchor, not content.
                st.subheader(f"The answer is: {answer}")
            else:
                st.subheader("Answer is not found in table")
        elif query_button:
            # Submitted with an empty box: do not send an empty query to the model.
            st.warning("Please type a question first.")

# Hide Streamlit default style
# The CSS below hides the page's <footer> element. It is injected through
# st.markdown with unsafe_allow_html=True so the <style> tag is rendered as
# HTML rather than shown as text.
hide_st_style = """
            <style>
            footer {visibility: hidden;}
            </style>
            """
st.markdown(hide_st_style, unsafe_allow_html=True)