Spaces:

Pclanglais
/

Editorialization

Running

App Files Files Community

Pclanglais committed on Jul 3

Commit

9fcaecd

•

1 Parent(s): bd727cb

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -12

app.py CHANGED Viewed

@@ -19,6 +19,16 @@ token_classifier = pipeline(
 tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
 def split_text(text, max_tokens=500):
     # Split the text by newline characters
@@ -64,6 +74,32 @@ def split_text(text, max_tokens=500):
     return chunks
 # Class to encapsulate the Falcon chatbot
@@ -85,6 +121,7 @@ class MistralChatBot:
             batch_prompts = [editorial_text]
         out = token_classifier(batch_prompts)
         print(out)
         generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
         return generated_text
@@ -102,18 +139,6 @@ examples = [
     ]
 ]
-additional_inputs=[
-    gr.Slider(
-        label="Température",
-        value=0.2,  # Default value
-        minimum=0.05,
-        maximum=1.0,
-        step=0.05,
-        interactive=True,
-        info="Des valeurs plus élevées donne plus de créativité, mais aussi d'étrangeté",
-    ),
-]
 demo = gr.Blocks()
 with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:

 tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
+# Normalize one text fragment (applied to each 'word' value by transform_chunks): drop tags, flatten whitespace, trim.
+def preprocess_text(text):  # text: str to clean; returns the cleaned str
+    # Remove anything shaped like an HTML tag: '<', one or more non-'>' characters, then '>'
+    text = re.sub(r'<[^>]+>', '', text)
+    # Replace newlines with spaces (the \s+ pass below would also catch them)
+    text = re.sub(r'\n', ' ', text)
+    # Collapse every run of whitespace (spaces, tabs, ...) into a single space
+    text = re.sub(r'\s+', ' ', text)
+    # Strip leading and trailing whitespace
+    return text.strip()
 def split_text(text, max_tokens=500):
     # Split the text by newline characters
     return chunks
+def transform_chunks(marianne_segmentation):  # Builds one text of '### <entity_group> ###' headers + cleaned words; presumably expects a pandas DataFrame with 'entity_group' and 'word' columns — NOTE(review): the visible caller passes raw token_classifier output, confirm its type
+    # Drop the rows whose entity_group is 'separator'
+    marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
+    # Cast 'word' to str and replace '¶' with '\n' (preprocess_text below then turns that newline into a space)
+    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)  # NOTE(review): assigns into a filtered frame; in pandas this can trigger SettingWithCopyWarning — consider taking a .copy() first
+    # A bit of cleaning: strip HTML-like tags and normalize whitespace in every word.
+    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
+    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != 'nan']  # presumably the str form of missing values after astype(str) — verify
+    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != '']  # drop words that are empty after cleaning
+    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != ' ']  # preprocess_text strips its result, so this looks like a defensive check only
+    # Prefix each word with its entity_group as a '### ... ###' header line
+    marianne_segmentation['word'] = '### ' + marianne_segmentation['entity_group'] + ' ###\n' + marianne_segmentation['word']
+    # Concatenate all remaining words, separated by blank lines (no group-by is performed; the frame is aggregated as a whole)
+    marianne_segmentation = marianne_segmentation.agg({
+        'word': lambda x: '\n\n'.join(x.dropna())
+    }).reset_index()  # NOTE(review): in pandas a dict of scalar-returning functions yields a Series, so after reset_index() there may be no 'word' column and the lookup below may raise KeyError — verify
+    final_text = marianne_segmentation['word'].tolist()[0]  # take the first aggregated value as the final text
+    return final_text
 # Class to encapsulate the Falcon chatbot
             batch_prompts = [editorial_text]
         out = token_classifier(batch_prompts)
+        out = transform_chunks(out)
         print(out)
         generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
         return generated_text
     ]
 ]
 demo = gr.Blocks()
 with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo: