File size: 6,609 Bytes
e4b4ce0
 
 
 
 
 
 
 
 
 
 
5763a51
e4b4ce0
 
9f2d6e6
 
e4b4ce0
 
 
86ca445
e4b4ce0
205e4ca
 
 
e4b4ce0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b129969
e4b4ce0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import re
from PIL import Image 
import os
import numpy as np

import spaces
import subprocess
# Install flash-attn when the app starts.
# `env=` REPLACES the child's environment rather than extending it, so the
# flag is merged into a copy of the current environment; otherwise PATH,
# HOME and any virtualenv variables would be missing when pip runs.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the
# flash-attn build to skip compiling CUDA kernels — confirm against flash-attn.
# The return code is deliberately not checked: the install is best-effort.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Repository id used for both the model weights and the matching processor.
MODEL_ID = 'thwri/CogFlorence-2.1-Large'

# Load the captioning model, move it to the GPU and switch it to eval mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to("cuda").eval()

# The processor prepares text/image inputs and decodes generated ids.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Markdown shown at the top of the UI.
TITLE = "# [thwri/CogFlorence-2.1-Large](https://huggingface.co/thwri/CogFlorence-2.1-Large/)"
DESCRIPTION = (
    "[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)"
    " tuned on [Ejafa/ye-pop](https://huggingface.co/datasets/Ejafa/ye-pop)"
    " captioned with [CogVLM2](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B)"
)

# Boilerplate openers such as "The image shows " that are stripped from
# captions. The leading \b keeps the pattern from firing inside a longer
# word that merely ends in "the".
_CAPTION_BOILERPLATE_RE = re.compile(
    r'\bthe image (?:is |captures |showcases |shows )?',
    re.IGNORECASE,
)

# Zero-width position right after . , ? or ! where a space is missing.
# Not matched when the next character is itself punctuation (keeps "..."
# and "?!" intact) or when a . or , sits between two digits (keeps "3.5"
# and "1,000" intact).
_MISSING_SPACE_RE = re.compile(r'(?<=[.,?!])(?=[^\s.,?!])(?!(?<=\d[.,])\d)')


def modify_caption(caption: str) -> str:
    """Clean up a generated caption.

    Steps, in order:
      1. Remove "the image ..." boilerplate phrases (case-insensitive),
         wherever they occur in the text.
      2. Insert a missing space after sentence punctuation, leaving
         numbers and punctuation runs untouched.
      3. Collapse every run of whitespace, including line breaks, into a
         single space and trim both ends. Line breaks become spaces so
         that words on adjacent lines are not glued together.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned, single-line caption.
    """
    caption = _CAPTION_BOILERPLATE_RE.sub('', caption)
    caption = _MISSING_SPACE_RE.sub(' ', caption)
    return ' '.join(caption.split())

@spaces.GPU
def process_image(image):
    """Generate a detailed caption for one image.

    Args:
        image: A NumPy array, a filesystem path given as ``str``, or an
            already-loaded PIL image.

    Returns:
        The caption produced for the ``<MORE_DETAILED_CAPTION>`` task,
        cleaned up by ``modify_caption``.

    Raises:
        gr.Error: If ``image`` is ``None``, e.g. when Submit is pressed
            before a picture has been provided.
    """
    if image is None:
        # Without this guard the failure is an opaque AttributeError on
        # `image.mode`; gr.Error is shown to the user as a readable message.
        raise gr.Error("Please provide an image before submitting.")

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif isinstance(image, str):
        # Convert inside the context manager so the pixel data is read and
        # the file handle is released deterministically.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    if image.mode != "RGB":
        image = image.convert("RGB")

    prompt = "<MORE_DETAILED_CAPTION>"

    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    # Beam search over 3 beams with sampling enabled, so the caption can
    # differ between calls for the same image.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        do_sample=True
    )
    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height),
    )
    return modify_caption(parsed_answer["<MORE_DETAILED_CAPTION>"])

def extract_frames(image_path, output_folder):
    """Save every frame of an image file as a separate PNG.

    Frames are written to ``output_folder`` as
    ``<source name without extension>_frame_<index, 3 digits>.png``.

    Args:
        image_path: Path of the (possibly multi-frame) source image.
        output_folder: Directory that receives the frame files.

    Returns:
        The paths of the frame files that were written, in frame order.
    """
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    frame_paths = []

    with Image.open(image_path) as img:
        # `n_frames` is only defined by image plugins that support
        # multi-frame files; treat everything else as a single frame
        # instead of failing with AttributeError.
        frame_count = getattr(img, "n_frames", 1)

        try:
            for index in range(frame_count):
                img.seek(index)
                frame_path = os.path.join(output_folder, f"{base_name}_frame_{index:03d}.png")
                img.save(frame_path)
                frame_paths.append(frame_path)
        except EOFError:
            # Seeking past the last available frame raises EOFError; keep
            # the frames that were written up to that point.
            pass

    return frame_paths

# File extensions (compared case-insensitively) treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.heic')


def _write_caption(image_path, txt_path):
    """Caption ``image_path`` with process_image and write the text to ``txt_path``."""
    caption = process_image(image_path)
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(caption)


def process_folder(folder_path):
    """Caption every image in a folder, writing one ``.txt`` file per image.

    For each image file a text file with the same base name is written next
    to it. Images whose text file already exists are skipped. Animated
    images with more than one frame are split into PNG frames (saved in the
    same folder) and each frame gets its own caption file.

    Files are visited in sorted name order so the report is deterministic.
    A file that cannot be read is reported as failed and does not abort the
    rest of the batch.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A newline-separated report of processed, skipped and failed files,
        ``"Invalid folder path."`` if ``folder_path`` is empty or not a
        directory, or a fixed message when nothing was processed or skipped.
    """
    if not folder_path or not os.path.isdir(folder_path):
        return "Invalid folder path."

    processed_files = []
    skipped_files = []
    failed_files = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(_IMAGE_EXTENSIONS):
            continue

        image_path = os.path.join(folder_path, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_path = os.path.join(folder_path, txt_filename)

        # Never overwrite an existing caption file.
        if os.path.exists(txt_path):
            skipped_files.append(f"Skipped {filename} (text file already exists)")
            continue

        try:
            # Only peek at the frame count here; the file is reopened by
            # extract_frames / process_image as needed.
            with Image.open(image_path) as img:
                multi_frame = getattr(img, "is_animated", False) and img.n_frames > 1

            if not multi_frame:
                _write_caption(image_path, txt_path)
                processed_files.append(f"Processed {filename} -> {txt_filename}")
                continue

            for frame_path in extract_frames(image_path, folder_path):
                frame_name = os.path.basename(frame_path)
                frame_txt_filename = os.path.splitext(frame_name)[0] + '.txt'
                frame_txt_path = os.path.join(folder_path, frame_txt_filename)

                if os.path.exists(frame_txt_path):
                    skipped_files.append(f"Skipped {frame_name} (text file already exists)")
                    continue

                _write_caption(frame_path, frame_txt_path)
                processed_files.append(f"Processed {frame_name} -> {frame_txt_filename}")
        except OSError as exc:
            # Unreadable, unsupported or corrupt files (and write failures)
            # surface as OSError; report them and move on to the next file.
            failed_files.append(f"Failed {filename}: {exc}")

    result = "\n".join(processed_files + skipped_files + failed_files)
    return result if result else "No image files found or all files were skipped in the specified folder."

# Stylesheet passed to gr.Blocks. It targets an element with id "output";
# no component created below sets that id.
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

with gr.Blocks(css=css) as demo:
    # Page header.
    gr.Markdown(value=TITLE)
    gr.Markdown(value=DESCRIPTION)

    # Tab 1: caption a single picture.
    with gr.Tab("Single Image Processing"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button("Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Example pictures, referenced by relative file name.
        gr.Examples(
            examples=[
                [name]
                for name in (
                    "image1.jpg",
                    "image2.jpg",
                    "image3.png",
                    "image4.jpg",
                    "image5.jpg",
                    "image6.PNG",
                )
            ],
            inputs=[input_img],
            outputs=[output_text],
            fn=process_image,
            label='Try captioning on below examples',
        )

        submit_btn.click(fn=process_image, inputs=[input_img], outputs=[output_text])

    # Tab 2: caption every image in a folder given by its path.
    with gr.Tab("Batch Processing"):
        with gr.Row():
            folder_input = gr.Textbox(label="Input Folder Path")
            batch_submit_btn = gr.Button("Process Folder")
        batch_output = gr.Textbox(label="Batch Processing Results", lines=10)

        batch_submit_btn.click(fn=process_folder, inputs=[folder_input], outputs=[batch_output])

# Start the app; runs as soon as this module is executed.
demo.launch(debug=True)