import streamlit as st
import numpy as np
import cv2
import tensorflow as tf
from PIL import Image
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
import pickle
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
# from google.colab.patches import cv2_imshow

def label_smoothing(y_true, y_pred):
    """Binary cross-entropy with label smoothing of 0.1 (custom loss used by the classifier)."""
    return tf.keras.losses.binary_crossentropy(y_true, y_pred, label_smoothing=0.1)

def sparse_cross_entropy(y_true, y_pred):
    """Sparse softmax cross-entropy averaged over the batch (custom loss used by the caption decoder)."""
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)
    loss_mean = tf.reduce_mean(loss)
    return loss_mean

model1 = load_model('densenet.h5', custom_objects={'label_smoothing': label_smoothing})  # video-category classifier
image_model_transfer = load_model("image_model_transfer.h5")  # CNN encoder that produces transfer-values
decoder_model = load_model("Final_ISRO_DenseNet201_Epoch50.h5", custom_objects={'sparse_cross_entropy': sparse_cross_entropy})  # caption decoder
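
# Note: Keras can only reload a model saved with a custom loss if that loss is supplied
# via `custom_objects`, which is why the two functions above are passed to load_model().
# The .h5 files are expected to sit alongside this script.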

class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality."""
    
    def __init__(self, texts, num_words=None):
        """
        :param texts: List of strings with the data-set.
        :param num_words: Max number of words to use.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        # word_index is a dictionary. its values are tokens and the keys are words
        # opposite to index_to_word
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))

    def token_to_word(self, token):
        """Lookup a single word from an integer-token."""
        word = " " if token == 0 else self.index_to_word[token]
        return word 

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a string."""
        # Create a list of the individual words.
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        
        # Concatenate the words to a single string
        # with space between all the words.
        text = " ".join(words)

        return text
    
    def captions_to_tokens(self, captions_listlist):
        """
        Convert a list-of-list with text-captions to
        a list-of-list of integer-tokens.
        """
        
        # Note that text_to_sequences() takes a list of texts.
        tokens = [self.texts_to_sequences(captions_list)
                  for captions_list in captions_listlist]
        
        return tokens

# Load the training labels and the tokenizer that were saved during training.
with open('Train_Label.pickle', 'rb') as efile:
    labels = pickle.load(efile)
with open('tokenizer.pkl', 'rb') as efile:
    tokenizer = pickle.load(efile)

le = LabelEncoder()
labels = le.fit_transform(labels)  # encode string labels as integers; inverse_transform recovers the names later

def framing(video):
    """Read a video file and return (edge-detected frames, original frames)."""
    fr = []      # original frames
    fr_pre = []  # preprocessed (edge-detected) frames
    cap = cv2.VideoCapture(video)  # open the video file
    while cap.isOpened():  # keep looping while frames can still be read
        ret, frame = cap.read()  # ret reports whether a frame was read; frame holds the data
        if ret:
            # cv2_imshow(frame)  # optionally display the frame
            grayed = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # convert the frame from BGR to grayscale
            canned = cv2.Canny(grayed, 320, 320)  # extract edges with the Canny edge detector
            fr.append(frame)       # keep the original frame
            fr_pre.append(canned)  # keep the edge-extracted frame
            # cv2_imshow(grayed)   # optionally display the grayscale frame
            # cv2_imshow(canned)   # optionally display the edge-detected frame
            k = cv2.waitKey(10) & 0xFF  # wait 10 ms per frame (only matters when frames are displayed)
            if k == ord('q'):  # pressing 'q' stops reading early
                break
        else:
            break
    cap.release()  # release the video resource
    cv2.destroyAllWindows()  # close any windows created while displaying frames
    return fr_pre, fr
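
# Illustrative usage (with one of the sample videos referenced in main() below):
#   edge_frames, original_frames = framing("Video001-Scene-001.mp4")
# Each entry in edge_frames is the Canny edge map of the corresponding original frame.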

def difference_of_frames(frames):
    diff = []  # differences between consecutive frames
    for i in range(0, len(frames) - 1):
        diff.append(cv2.absdiff(frames[i], frames[i + 1]))  # absolute difference between frame i and frame i+1
    return diff

def cal_threshold(diff):
  mn = np.mean(diff)  # mean over all frame differences
  st_d = np.std(diff)  # standard deviation over all frame differences
  a = 4  # scaling factor; can be tuned
  ts = mn + (a * st_d)  # global threshold used to select key frames
  return ts
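
# Worked example (illustrative numbers): if the frame differences have mean 12.0 and
# standard deviation 3.0, the global threshold is ts = 12.0 + 4 * 3.0 = 24.0; only
# frames whose own mean + 4*std exceeds this value are kept as key-frame candidates.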

def imp_frames(diff, ts, ogframes):
  a_fr = []  # (frame index, per-frame threshold) pairs
  for i in range(len(diff)):  # loop over all difference frames
      mn = np.mean(diff[i])  # mean of this difference frame
      st_d = np.std(diff[i])  # standard deviation of this difference frame
      fr_ts = mn + (4 * st_d)  # per-frame threshold value
      a_fr.append([i, fr_ts])  # store the frame index with its threshold
  imp_fr = []  # frames whose threshold exceeds the global threshold
  for i, ac_tr in a_fr:  # compare each per-frame threshold with the global one
      if ac_tr >= ts:
          imp_fr.append([i, ac_tr])
  key_fr = []  # the selected key frames
  for i, _ in imp_fr:
      key_fr.append(ogframes[i])  # pick the original frame at each selected index
  return key_fr

def final_image(video):
  frames, ogframes = framing(video)  # edge-detected frames and the original frames
  diff = difference_of_frames(frames)
  ts = cal_threshold(diff)
  key_fr = imp_frames(diff, ts, ogframes)
  frame_no = key_fr[int(len(key_fr) / 2)]  # take the middle key frame
  cv2.imwrite("Testing1.jpg", frame_no)
  cv2.destroyAllWindows()
  return "Testing1.jpg"

def image_test(image_path):
  image = Image.open(image_path)
  image = image.resize((224, 224))  # resize to the classifier's expected input size
  image = np.array(image)
  image = np.expand_dims(image, axis=0)  # add a batch dimension
  return image

def largest_indices(ary, n):
    """Return the flat indices of the n largest values in ary, ordered from highest to lowest."""
    flat = ary.flatten()
    indices = np.argpartition(flat, -n)[-n:]
    indices = indices[np.argsort(-flat[indices])]
    return indices
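
# Illustrative example:
#   largest_indices(np.array([[0.1, 0.7, 0.2]]), 2)  ->  array([1, 2])
# i.e. the flat indices of the two largest scores, highest first.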

mark_start = 'ssss'
mark_end = ' eeee'

token_start = tokenizer.word_index[mark_start.strip()]
token_end = tokenizer.word_index[mark_end.strip()]

def load_image(path, size=None):
  """
  Load the image from the given file-path and resize it
  to the given size if not None.
  """

  # Load the image using PIL.
  img = Image.open(path)

  # Resize image if desired.
  if size is not None:
      img = img.resize(size=size, resample=Image.LANCZOS)
      
  img = np.array(img)
  img = img / 255.0

  # Convert 2-dim gray-scale array to 3-dim RGB array.
  if (len(img.shape) == 2):
      img = np.repeat(img[:, :, np.newaxis], 3, axis=2)
  return img

def greedy_search(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).
    """
    # ---------------------------ENCODE IMAGE--------------------------------
    # Load and resize the image.
    image = load_image(image_path, size=(224,224))
    
    # Expand the 3-dim numpy array to 4-dim
    # because the image-model expects a whole batch as input,
    # so we give it a batch with just one image.
    image_batch = np.expand_dims(image, axis=0)

    # Process the image with the pre-trained image-model
    # to get the transfer-values.
    transfer_values = image_model_transfer.predict(image_batch)
    
    # -------------------------------------------------------------------
    
    
    # Pre-allocate the 2-dim array used as input to the decoder.
    # This holds just a single sequence of integer-tokens,
    # but the decoder-model expects a batch of sequences.
    shape = (1, max_tokens)
    decoder_input_data = np.zeros(shape=shape, dtype=int)

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start #1

    # Initialize an empty output-text.
    output_text = ''

    # Initialize the number of tokens we have processed.
    count_tokens = 0

    # While we haven't sampled the special end-token for ' eeee'
    # and we haven't processed the max number of tokens.
    while token_int != token_end and count_tokens < max_tokens:
        # Update the input-sequence to the decoder
        # with the last token that was sampled.
        # In the first iteration this will set the
        # first element to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Wrap the input-data in a dict for clarity and safety,
        # so we are sure we input the data in the right order.
        x_data = \
        {
            'transfer_values_input': transfer_values,
            'decoder_input': decoder_input_data
        }

        # Note that we input the entire sequence of tokens
        # to the decoder. This wastes a lot of computation
        # because we are only interested in the last input
        # and output. We could modify the code to return
        # the GRU-states when calling predict() and then
        # feeding these GRU-states as well the next time
        # we call predict(), but it would make the code
        # much more complicated.
        
        # Input this data to the decoder and get the predicted output.
        decoder_output = decoder_model.predict(x_data)
#         print(decoder_output.shape) (1,30,15000) for every iteration
        
        # Get the last predicted token as a one-hot encoded array.
        # Note that this is not limited by softmax, but we just
        # need the index of the largest element so it doesn't matter.
        token_onehot = decoder_output[0, count_tokens, :]
#         print(token_onehot.shape) (15000, ) for every iteration
        # Convert to an integer-token.
        token_int = np.argmax(token_onehot)
#         print(token_int) #the token of a word with the highest score
        
        # Lookup the word corresponding to this integer-token.
        sampled_word = tokenizer.token_to_word(token_int)
#         print(sampled_word)

        # Append the word to the output-text.
        output_text += " " + sampled_word

        # Increment the token-counter.
        count_tokens += 1

    # This is the sequence of tokens output by the decoder.
    output_tokens = decoder_input_data[0]
#     print(output_tokens)
    # Plot the image.
    # plt.imshow(image)
    # plt.show()

    # Drop the trailing end-marker ('eeee') from the generated words.
    predicted_caption = output_text.split()
    del predicted_caption[-1]
    output_text = " ".join(predicted_caption)

    # Print the predicted caption.
    # print("Predicted caption:")
    # print(output_text)
    # print()
    return predicted_caption
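
# Illustrative usage (assuming the key frame has been written to "Testing1.jpg"):
#   words = greedy_search("Testing1.jpg")   # list of predicted words
#   caption = " ".join(words)
# Greedy decoding keeps only the single highest-scoring token at each step.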

def beam_search(beam_index, image_path, max_tokens=30):
    image = load_image(image_path, size=(224,224))
    
    # Expand the 3-dim numpy array to 4-dim
    # because the image-model expects a whole batch as input,
    # so we give it a batch with just one image.
    image_batch = np.expand_dims(image, axis=0)

    # Process the image with the pre-trained image-model
    # to get the transfer-values.
    transfer_values = image_model_transfer.predict(image_batch)
    
    token_int = [token_start]
    start_word = [[token_int, 0.0]]
    count_tokens = 0 
    while len(start_word[0][0])<max_tokens:
        temp = []
        
        for s in start_word:
            par_caps = pad_sequences([s[0]], maxlen=max_tokens, padding='post')
            preds = decoder_model.predict([transfer_values,par_caps], verbose=0)
            token_onehot = preds[0, count_tokens, :]
#             print(token_onehot.shape)
            word_preds = np.argsort(token_onehot)[-beam_index:]
#             print(word_preds.shape)
            
            for w in word_preds:
                next_cap, prob = s[0][:], s[1]
                next_cap.append(w)
                prob += token_onehot[w]
                temp.append([next_cap, prob])
                  
        start_word = temp
        count_tokens+=1
        # Sorting according to the probabilities
        start_word = sorted(start_word, reverse=False, key=lambda l: l[1])
        # Getting the top words
        start_word = start_word[-beam_index:]
                  
    start_word = start_word[-1][0]
    intermediate_caption = [tokenizer.token_to_word(i) for i in start_word]
    final_caption = []
    
    for i in intermediate_caption:
        if i != 'eeee':
            final_caption.append(i)
        else:
            break

    # final_caption = ' '.join(final_caption[1:])
    return final_caption[1:]
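
# Unlike greedy_search(), beam_search() keeps the `beam_index` best partial captions at
# each step: every kept sequence is extended with its `beam_index` highest-scoring next
# tokens, the accumulated scores are sorted, and only the top `beam_index` candidates
# survive. Note that raw decoder scores are summed here; summing log-probabilities
# would be the more common formulation.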

def generate_caption_any(image_path):
  # Generate captions with greedy search and with beam search (widths 3 and 5).
  # All three are computed, but only the beam-width-3 caption is returned to the app.
  predicted_caption1 = ' '.join(greedy_search(image_path=image_path))
  predicted_caption2 = ' '.join(beam_search(beam_index=3, image_path=image_path))
  predicted_caption3 = ' '.join(beam_search(beam_index=5, image_path=image_path))
  return predicted_caption2
  # show_image_using_path(image_path)



def main():
    st.title("ISRO Video Classification & Captioning")
    st.write('In this project, we introduce a technique for video classification and captioning that uses keyframe extraction to streamline the process. Built on DenseNet-201, our model classifies a video by focusing on its most informative frame, improving efficiency and performance. You can try the approach on any of the example videos provided below.')

    video_options = {
            "Video 1": "Video001-Scene-001.mp4",
            "Video 2": "Video015-Scene-074.mp4",
            "Video 3": "Video005-Scene-043.mp4",
            "Video 4": "Video002-Scene-023.mp4",
        }

    selected_video = st.selectbox("Select a video to submit", list(video_options.keys()))
    video_path = video_options[selected_video]

    if st.button("Submit"):
        st.video(video_path)
        path=final_image(video_path)
        image=image_test(path)
        output_class=model1.predict(image)
        caption=generate_caption_any(path)
        indices=largest_indices(output_class, 3)
        st.title('The predicted category is:')
        st.write(le.inverse_transform(indices)[0])
        st.title('Caption:')
        caption = caption.capitalize()
        st.write(caption)
        # st.video(uploaded_file)

if __name__ == "__main__":
    main()
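
# To run the app locally (assuming this script is saved as app.py and the .h5 models,
# Train_Label.pickle and tokenizer.pkl sit in the same directory):
#   streamlit run app.py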