DevBM committed on
Commit
97d9e12
1 Parent(s): 9f8802c

Upload 15 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ s2v_old/freqs.json filter=lfs diff=lfs merge=lfs -text
+ s2v_old/key2row filter=lfs diff=lfs merge=lfs -text
+ s2v_old/strings.json filter=lfs diff=lfs merge=lfs -text
+ s2v_old/vectors filter=lfs diff=lfs merge=lfs -text
Questgen/__pycache__/main.cpython-312.pyc ADDED
Binary file (26.8 kB).
 
Questgen/main.py ADDED
@@ -0,0 +1,608 @@
+ import json
+ import os
+ import random
+ import string
+ import time
+ import zipfile
+ from collections import OrderedDict
+
+ import numpy as np  # linear algebra
+ import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+ import requests
+ import torch
+ import spacy
+ import pke
+ import nltk
+ from flashtext import KeywordProcessor
+ from nltk import FreqDist
+ from nltk.corpus import brown, stopwords
+ from nltk.tokenize import sent_tokenize
+ from sense2vec import Sense2Vec
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ nltk.download('brown', quiet=True, force=True)
+ nltk.download('stopwords', quiet=True, force=True)
+ nltk.download('popular', quiet=True, force=True)
+
+
+ def beam_search_decoding(inp_ids, attn_mask, model, tokenizer):
+     # Generate candidates with beam search and return the top three,
+     # decoded and capitalised.
+     beam_output = model.generate(input_ids=inp_ids,
+                                  attention_mask=attn_mask,
+                                  max_length=256,
+                                  num_beams=10,
+                                  num_return_sequences=3,
+                                  no_repeat_ngram_size=2,
+                                  early_stopping=True)
+     Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+                  for out in beam_output]
+     return [Question.strip().capitalize() for Question in Questions]
+
+
+ def MCQs_available(word, s2v):
+     # A word can be turned into an MCQ only if sense2vec knows a sense for it.
+     word = word.replace(" ", "_")
+     return s2v.get_best_sense(word) is not None
+
+
+ def edits(word):
+     "All edits that are one edit away from `word`."
+     letters = 'abcdefghijklmnopqrstuvwxyz ' + string.punctuation
+     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+     deletes = [L + R[1:] for L, R in splits if R]
+     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+     inserts = [L + c + R for L, R in splits for c in letters]
+     return set(deletes + transposes + replaces + inserts)
+
+
+ def sense2vec_get_words(word, s2v):
+     # Collect distractors: sense2vec neighbours of `word` that are not the
+     # word itself, not substrings/superstrings of it, and not one edit away.
+     output = []
+
+     word_preprocessed = word.translate(word.maketrans("", "", string.punctuation)).lower()
+     word_edits = edits(word_preprocessed)
+
+     word = word.replace(" ", "_")
+     sense = s2v.get_best_sense(word)
+     most_similar = s2v.most_similar(sense, n=15)
+
+     compare_list = [word_preprocessed]
+     for each_word in most_similar:
+         append_word = each_word[0].split("|")[0].replace("_", " ").strip()
+         append_word_processed = append_word.lower()
+         append_word_processed = append_word_processed.translate(append_word_processed.maketrans("", "", string.punctuation))
+         if (append_word_processed not in compare_list
+                 and word_preprocessed not in append_word_processed
+                 and append_word_processed not in word_edits):
+             output.append(append_word.title())
+             compare_list.append(append_word_processed)
+
+     # Preserve order while dropping duplicates.
+     return list(OrderedDict.fromkeys(output))
+
+
+ def get_options(answer, s2v):
+     distractors = []
+     try:
+         distractors = sense2vec_get_words(answer, s2v)
+         if len(distractors) > 0:
+             print("Sense2vec distractors successful for word:", answer)
+             return distractors, "sense2vec"
+     except Exception:
+         print("Sense2vec distractors failed for word:", answer)
+
+     return distractors, "None"
+
+
+ def tokenize_sentences(text):
+     sentences = sent_tokenize(text)
+     # Drop short sentences (fewer than 20 characters).
+     return [sentence.strip() for sentence in sentences if len(sentence) > 20]
+
+
+ def get_sentences_for_keyword(keywords, sentences):
+     # Map each keyword to the sentences that mention it, longest first.
+     keyword_processor = KeywordProcessor()
+     keyword_sentences = {}
+     for word in keywords:
+         word = word.strip()
+         keyword_sentences[word] = []
+         keyword_processor.add_keyword(word)
+     for sentence in sentences:
+         keywords_found = keyword_processor.extract_keywords(sentence)
+         for key in keywords_found:
+             keyword_sentences[key].append(sentence)
+
+     for key in keyword_sentences.keys():
+         values = keyword_sentences[key]
+         values = sorted(values, key=len, reverse=True)
+         keyword_sentences[key] = values
+
+     # Drop keywords that matched no sentence.
+     delete_keys = [k for k in keyword_sentences.keys() if len(keyword_sentences[k]) == 0]
+     for del_key in delete_keys:
+         del keyword_sentences[del_key]
+
+     return keyword_sentences
+
+
+ def is_far(words_list, currentword, thresh, normalized_levenshtein):
+     # True if `currentword` is at least `thresh` away (normalised Levenshtein
+     # distance) from every word already selected.
+     score_list = [normalized_levenshtein.distance(word.lower(), currentword.lower())
+                   for word in words_list]
+     return min(score_list) >= thresh
+
+
+ def filter_phrases(phrase_keys, max_phrases, normalized_levenshtein):
+     # Greedily keep phrases that are sufficiently dissimilar from those kept so far.
+     filtered_phrases = []
+     if len(phrase_keys) > 0:
+         filtered_phrases.append(phrase_keys[0])
+         for ph in phrase_keys[1:]:
+             if is_far(filtered_phrases, ph, 0.7, normalized_levenshtein):
+                 filtered_phrases.append(ph)
+             if len(filtered_phrases) >= max_phrases:
+                 break
+     return filtered_phrases
+
+
+ def get_nouns_multipartite(text):
+     out = []
+
+     extractor = pke.unsupervised.MultipartiteRank()
+     extractor.load_document(input=text, language='en')
+     pos = {'PROPN', 'NOUN'}
+     stoplist = list(string.punctuation)
+     stoplist += stopwords.words('english')
+     extractor.candidate_selection(pos=pos)
+     # Build the Multipartite graph and rank candidates using random walk;
+     # alpha controls the weight adjustment mechanism, see TopicRank for
+     # threshold/method parameters.
+     try:
+         extractor.candidate_weighting(alpha=1.1,
+                                       threshold=0.75,
+                                       method='average')
+     except Exception:
+         return out
+
+     keyphrases = extractor.get_n_best(n=10)
+     for key in keyphrases:
+         out.append(key[0])
+
+     return out
+
+
+ def get_phrases(doc):
+     # Count multi-word noun chunks and return the 50 longest distinct phrases.
+     phrases = {}
+     for np_chunk in doc.noun_chunks:
+         phrase = np_chunk.text
+         if len(phrase.split()) > 1:
+             phrases[phrase] = phrases.get(phrase, 0) + 1
+
+     phrase_keys = list(phrases.keys())
+     phrase_keys = sorted(phrase_keys, key=len, reverse=True)
+     return phrase_keys[:50]
+
+
+ def get_keywords(nlp, text, max_keywords, s2v, fdist, normalized_levenshtein, no_of_sentences):
+     doc = nlp(text)
+     max_keywords = int(max_keywords)
+
+     keywords = get_nouns_multipartite(text)
+     # Rarer words (lower Brown-corpus frequency) come first.
+     keywords = sorted(keywords, key=lambda x: fdist[x])
+     keywords = filter_phrases(keywords, max_keywords, normalized_levenshtein)
+
+     phrase_keys = get_phrases(doc)
+     filtered_phrases = filter_phrases(phrase_keys, max_keywords, normalized_levenshtein)
+
+     total_phrases = keywords + filtered_phrases
+     total_phrases_filtered = filter_phrases(total_phrases, min(max_keywords, 2 * no_of_sentences),
+                                             normalized_levenshtein)
+
+     answers = []
+     for answer in total_phrases_filtered:
+         if answer not in answers and MCQs_available(answer, s2v):
+             answers.append(answer)
+
+     return answers[:max_keywords]
+
+
+ def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
+     batch_text = []
+     answers = keyword_sent_mapping.keys()
+     for answer in answers:
+         txt = keyword_sent_mapping[answer]
+         context = "context: " + txt
+         text = context + " " + "answer: " + answer + " </s>"
+         batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(batch_text, padding=True, return_tensors="pt")
+
+     print("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array = {}
+     output_array["questions"] = []
+     for index, val in enumerate(answers):
+         individual_question = {}
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         Question = dec.replace("question:", "").strip()
+         individual_question["question_statement"] = Question
+         individual_question["question_type"] = "MCQ"
+         individual_question["answer"] = val
+         individual_question["id"] = index + 1
+         individual_question["options"], individual_question["options_algorithm"] = get_options(val, sense2vec)
+
+         individual_question["options"] = filter_phrases(individual_question["options"], 10, normalized_levenshtein)
+         # Show the first three options; keep the rest as extras.
+         num_options = 3
+         individual_question["extra_options"] = individual_question["options"][num_options:]
+         individual_question["options"] = individual_question["options"][:num_options]
+         individual_question["context"] = keyword_sent_mapping[val]
+
+         if len(individual_question["options"]) > 0:
+             output_array["questions"].append(individual_question)
+
+     return output_array
+
+
+ def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):  # for normal one-word questions
+     batch_text = []
+     answers = keyword_sent_mapping.keys()
+     for answer in answers:
+         txt = keyword_sent_mapping[answer]
+         context = "context: " + txt
+         text = context + " " + "answer: " + answer + " </s>"
+         batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(batch_text, padding=True, return_tensors="pt")
+
+     print("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array = {}
+     output_array["questions"] = []
+
+     for index, val in enumerate(answers):
+         individual_quest = {}
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         Question = dec.replace('question:', '').strip()
+
+         individual_quest['Question'] = Question
+         individual_quest['Answer'] = val
+         individual_quest["id"] = index + 1
+         individual_quest["context"] = keyword_sent_mapping[val]
+
+         output_array["questions"].append(individual_quest)
+
+     return output_array
+
+
+ def random_choice():
+     return bool(random.choice([0, 1]))
+
+
+ class QGen:
+
+     def __init__(self):
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
+         model = T5ForConditionalGeneration.from_pretrained('Parth/result')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.nlp = spacy.load('en_core_web_sm')
+
+         self.s2v = Sense2Vec().from_disk('s2v_old')
+
+         self.fdist = FreqDist(brown.words())
+         self.normalized_levenshtein = NormalizedLevenshtein()
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def predict_mcq(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+
+         keywords = get_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v,
+                                 self.fdist, self.normalized_levenshtein, len(sentences))
+
+         keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
+
+         # Keep at most the three longest sentences per keyword as context.
+         for k in keyword_sentence_mapping.keys():
+             text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+             keyword_sentence_mapping[k] = text_snippet
+
+         final_output = {}
+
+         if len(keyword_sentence_mapping.keys()) == 0:
+             return final_output
+         else:
+             try:
+                 generated_questions = generate_questions_mcq(keyword_sentence_mapping, self.device, self.tokenizer,
+                                                              self.model, self.s2v, self.normalized_levenshtein)
+             except Exception:
+                 return final_output
+         end = time.time()
+
+         final_output["statement"] = modified_text
+         final_output["questions"] = generated_questions["questions"]
+         final_output["time_taken"] = end - start
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return final_output
+
+     def predict_shortq(self, payload):
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+
+         keywords = get_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v,
+                                 self.fdist, self.normalized_levenshtein, len(sentences))
+
+         keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
+
+         for k in keyword_sentence_mapping.keys():
+             text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+             keyword_sentence_mapping[k] = text_snippet
+
+         final_output = {}
+
+         if len(keyword_sentence_mapping.keys()) == 0:
+             print('ZERO')
+             return final_output
+         else:
+             generated_questions = generate_normal_questions(keyword_sentence_mapping, self.device,
+                                                             self.tokenizer, self.model)
+             print(generated_questions)
+
+         final_output["statement"] = modified_text
+         final_output["questions"] = generated_questions["questions"]
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return final_output
+
+     def paraphrase(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 3)
+         }
+
+         text = inp['input_text']
+         num = inp['max_questions']
+
+         self.sentence = text
+         self.text = "paraphrase: " + self.sentence + " </s>"
+
+         encoding = self.tokenizer.encode_plus(self.text, padding=True, return_tensors="pt")
+         input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
+
+         beam_outputs = self.model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_masks,
+             max_length=50,
+             num_beams=50,
+             num_return_sequences=num,
+             no_repeat_ngram_size=2,
+             early_stopping=True
+         )
+
+         # Keep paraphrases that differ from the input and from each other.
+         final_outputs = []
+         for beam_output in beam_outputs:
+             sent = self.tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+             if sent.lower() != self.sentence.lower() and sent not in final_outputs:
+                 final_outputs.append(sent)
+
+         output = {}
+         output['Question'] = text
+         output['Count'] = num
+         output['Paraphrased Questions'] = final_outputs
+
+         for i, final_output in enumerate(final_outputs):
+             print("{}: {}".format(i, final_output))
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return output
+
+
+ class BoolQGen:
+
+     def __init__(self):
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
+         model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def random_choice(self):
+         return bool(random.choice([0, 1]))
+
+     def predict_boolq(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         num = inp['max_questions']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+         answer = self.random_choice()
+         form = "truefalse: %s passage: %s </s>" % (modified_text, answer)
+
+         encoding = self.tokenizer.encode_plus(form, return_tensors="pt")
+         input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
+
+         output = beam_search_decoding(input_ids, attention_masks, self.model, self.tokenizer)
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         final = {}
+         final['Text'] = text
+         final['Count'] = num
+         final['Boolean Questions'] = output
+
+         return final
+
+
+ class AnswerPredictor:
+
+     def __init__(self):
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=512)
+         model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     @staticmethod
+     def greedy_decoding(inp_ids, attn_mask, model, tokenizer):
+         greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
+         decoded = tokenizer.decode(greedy_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         return decoded.strip().capitalize()
+
+     def predict_answer(self, payload):
+         answers = []
+         inp = {
+             "input_text": payload.get("input_text"),
+             "input_question": payload.get("input_question")
+         }
+         for ques in inp["input_question"]:
+             context = inp["input_text"]
+             question = ques
+             prompt = "question: %s <s> context: %s </s>" % (question, context)
+
+             encoding = self.tokenizer.encode_plus(prompt, return_tensors="pt")
+             input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
+             greedy_output = self.model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=256)
+             answer = self.tokenizer.decode(greedy_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+             answers.append(answer.strip().capitalize())
+
+         return answers
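
For reference, a minimal sketch of how these classes might be driven outside the Streamlit app below (a hypothetical snippet, assuming the pretrained checkpoints download successfully and the s2v_old directory from this commit sits in the working directory):

    from pprint import pprint
    from Questgen import main

    qgen = main.QGen()  # loads T5, spaCy and sense2vec; construction is slow

    payload = {
        "input_text": "Sachin Tendulkar is the best batsman in the history of cricket. Sachin is from Mumbai.",
        "max_questions": 3,
    }
    pprint(qgen.predict_mcq(payload))     # MCQs with sense2vec distractors
    pprint(qgen.predict_shortq(payload))  # short one-word-answer questions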
b/b.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ # Fetch the spaCy English model required by Questgen/main.py.
+ cmd = ["python", "-m", "spacy", "download", "en_core_web_sm"]
+ subprocess.run(cmd, check=True)
main.py ADDED
@@ -0,0 +1,49 @@
+ import subprocess
+
+ import streamlit as st
+
+ # Make sure the spaCy English model is present before Questgen is imported.
+ cmd = ["python", "-m", "spacy", "download", "en_core_web_sm"]
+ subprocess.run(cmd, check=True)
+
+ from Questgen import main
+
+ st.set_page_config(
+     page_title='Questgen',
+     page_icon=':fire:',
+ )
+
+ st.title(body='Question Generator')
+
+ input_text = st.text_area(
+     label='Enter text from which questions are to be generated',
+     value='Sachin Tendulkar is the best batsman in the history of cricket. Sachin is from Mumbai. Sachin has two children.'
+ )
+
+ qg = main.QGen()
+
+ payload = {
+     'input_text': input_text
+ }
+
+ output = qg.predict_mcq(payload=payload)
+
+ st.header(body='*Generated Questions are:*', divider='orange')
+ # predict_mcq returns an empty dict when no usable keywords are found.
+ for question in output.get('questions', []):
+     st.subheader(body=f":orange[Q{question['id']}:] {question['question_statement']}", divider='blue')
+     st.markdown(f"A: {question['answer']}")
+     # Label up to three distractors B, C and D.
+     for label, option in zip(['B', 'C', 'D'], question['options']):
+         st.markdown(f"{label}: {option}")
+
+ if st.toggle(label='Show Total Output'):
+     st.write(output)
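
Assuming the dependencies from requirements.txt below are installed, this demo should start locally with `streamlit run main.py`; the subprocess call at the top fetches the spaCy model on first launch.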
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ streamlit
+ torch
+ sense2vec
+ strsim
+ six
+ networkx
+ numpy
+ scipy
+ scikit-learn
+ unicode
+ future
+ joblib
+ pytz
+ python-dateutil
+ flashtext
+ pandas
+ sentencepiece
+ transformers
+ spacy
+ git+https://github.com/boudinfl/pke.git
+ # git+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ # git+https://github.com/devbm7/Questgen.ai.git
+ nltk
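
Note that the en_core_web_sm model is left commented out above; b.py and main.py instead install it at runtime via `python -m spacy download en_core_web_sm`, so a plain `pip install -r requirements.txt` should be enough before launching the app.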
s2v_old/._cfg ADDED
Binary file (174 Bytes).
 
s2v_old/._freqs.json ADDED
Binary file (174 Bytes).
 
s2v_old/._key2row ADDED
Binary file (174 Bytes).
 
s2v_old/._strings.json ADDED
Binary file (174 Bytes).
 
s2v_old/._vectors ADDED
Binary file (174 Bytes).
 
s2v_old/cfg ADDED
@@ -0,0 +1,36 @@
+ {
+   "senses": [
+     "PUNCT",
+     "SYM",
+     "MONEY",
+     "PERCENT",
+     "PRODUCT",
+     "X",
+     "LANGUAGE",
+     "DET",
+     "LOC",
+     "CARDINAL",
+     "CONJ",
+     "LAW",
+     "ORG",
+     "PART",
+     "VERB",
+     "NUM",
+     "EVENT",
+     "ADP",
+     "PERSON",
+     "QUANTITY",
+     "INTJ",
+     "TIME",
+     "SPACE",
+     "DATE",
+     "ADJ",
+     "NOUN",
+     "NORP",
+     "ORDINAL",
+     "WORK OF ART",
+     "ADV",
+     "FAC",
+     "GPE"
+   ]
+ }
s2v_old/freqs.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb75f4bbf927c536d808426c6e9f55ef1f69ab44e473c460b8e13274eab97241
+ size 49969681
s2v_old/key2row ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29690c5ab1c96b6f9061b25bf737fee04540187328a3857cea0f9a1b4da46614
+ size 16492891
s2v_old/strings.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1ecd6b643475b42d153c74515cba54c12e28e1edac8abbd51794a6ca4a105e0
+ size 26188439
s2v_old/vectors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:290724e713d3e8da2ed0f82ab2ad1a1aeaa9d5fe1330baccd26b62a7399f6d71
+ size 611973760