File size: 2,164 Bytes
24aafee
 
 
 
 
 
c1601ad
24aafee
cc14591
24aafee
 
 
 
a95e12c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24aafee
 
 
 
 
 
 
 
a95e12c
24aafee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import streamlit as st 
# Intro text rendered at the top of the app page.
st.markdown(" This is a Streamlit App ")

import streamlit as st
import pandas as pd
import numpy as np
import simpletransformers
import pickle
import torch
import chardet
from pathlib import Path
from detect_delimiter import detect

# Load the labelled keywords and build a class-balanced training frame.
data = pd.read_csv("training_data.csv")

# Fold both "information" variants into a single 'Informational' class.
data['Category'] = data['Category'].replace(
    {
        'Information - Sammenligning': 'Informational',
        'Information': 'Informational',
    }
)

# Draw 1500 rows per category, with replacement so that small classes are
# oversampled. GroupBy.sample keeps the 'Category' column in the result;
# the previous groupby(...).apply(lambda x: x.sample(...)) form operates on
# the grouping column, which recent pandas releases deprecate.
data = (
    data.groupby('Category')
    .sample(n=1500, replace=True)
    .reset_index(drop=True)
)

# simpletransformers is fed a frame with 'text' and integer 'labels' columns.
# Labels are the integer category codes of the 'Category' column.
train_df = pd.DataFrame(
    {
        'text': data['keywords'],
        'labels': data['Category'].astype('category').cat.codes,
    }
)
n_labels = train_df['labels'].nunique()
from simpletransformers.ner import NERModel
from simpletransformers.classification import ClassificationModel

def _train_classifier(_train_df, _n_labels):
    """Fine-tune the Danish BERT classifier and return the trained model.

    The leading underscores on the parameters tell ``st.cache_resource`` not
    to hash them: the training frame is re-sampled randomly on every script
    run, so hashing it would defeat the cache.
    """
    classifier = ClassificationModel(
        'bert',
        'Maltehb/danish-bert-botxo',
        num_labels=_n_labels,
        # Only request the GPU when one is present; a hard-coded
        # use_cuda=True makes simpletransformers raise on CPU-only hosts.
        use_cuda=torch.cuda.is_available(),
        args={'reprocess_input_data': True, 'overwrite_output_dir': True},
    )
    classifier.train_model(_train_df)
    return classifier


# Streamlit re-executes this script on every widget interaction, which would
# retrain the model on each upload/download click. Cache the trained model
# for the lifetime of the server process when the installed Streamlit
# provides st.cache_resource; otherwise fall back to training on every run.
if hasattr(st, "cache_resource"):
    _train_classifier = st.cache_resource(_train_classifier)

model = _train_classifier(train_df, n_labels)

# Display names for the classifier's integer outputs, listed in code order
# (position in the tuple == predicted label id).
# NOTE(review): this order has to line up with the integer codes assigned to
# the training categories (`cat.codes` above) -- confirm whenever the set of
# categories in the training data changes.
label_dict = dict(
    enumerate(
        (
            "Brandsøgning",
            "Informational",
            "Inspiration",
            "Navigational",
            "Transactional",
        )
    )
)
# Let the user upload a keyword CSV, classify up to the first 5000 keywords
# and offer the labelled result as a download.
upload_file = st.file_uploader("Choose a file",type="csv" )

if upload_file is not None:
    # Sniff the encoding from the raw bytes, since uploads are not always
    # UTF-8. chardet may report None, in which case pandas uses its default.
    detected = chardet.detect(upload_file.getvalue())
    encoding_value = detected["encoding"]

    # UTF-16 uploads are parsed as whitespace-separated, everything else as
    # comma-separated. sep=r"\s+" is the documented equivalent of the
    # deprecated delim_whitespace=True.
    # NOTE(review): whitespace splitting also splits multi-word keywords;
    # if UTF-16 exports are tab-separated, sep="\t" may be what is wanted.
    separator = r"\s+" if encoding_value == "UTF-16" else ","
    df = pd.read_csv(
        upload_file,
        on_bad_lines='skip',
        encoding=encoding_value,
        sep=separator,
    )

    missing_columns = [
        column for column in ("Keyword", "Volume") if column not in df.columns
    ]
    if missing_columns:
        # Report a readable message instead of crashing with a KeyError.
        st.error(
            "The uploaded file must contain 'Keyword' and 'Volume' columns; "
            "missing: " + ", ".join(missing_columns)
        )
    else:
        # Cap the workload at the first 5000 rows.
        subset = df.head(5000)

        # predict() takes a list of texts and returns (predictions,
        # raw_outputs); one batched call replaces a call per keyword.
        # astype(str) guards against NaN / numeric cells in the column.
        keywords = subset['Keyword'].astype(str).tolist()
        if keywords:
            predictions, _ = model.predict(keywords)
            classes = [label_dict[int(code)] for code in predictions]
        else:
            classes = []

        df = pd.DataFrame(
            {
                'Keyword': subset['Keyword'],
                'volume': subset['Volume'],
                'Classes': classes,
            }
        )
        st.download_button(
            label="Download CSV file",
            data=df.to_csv().encode('utf-8'),
            file_name='labbeled_data.csv',
            mime='text/csv'
        )