"""Run Kosmos-2.5 on a sample receipt image and print the generated text.

The script loads the ``microsoft/kosmos-2.5`` checkpoint onto a CUDA device
in bfloat16, downloads a sample image, runs generation with a task prompt,
and prints the first decoded sequence.
"""

import re

import requests
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5"
device = "cuda:0"
dtype = torch.bfloat16

model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    repo, device_map=device, torch_dtype=dtype
)
processor = AutoProcessor.from_pretrained(repo)

# Sample image. The Hub's "/resolve/" path serves the raw file; the "/blob/"
# path serves the HTML file viewer, which PIL cannot open as an image.
url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
response = requests.get(url, stream=True, timeout=30)
# Fail early on HTTP errors instead of handing an error page to PIL.
response.raise_for_status()
image = Image.open(response.raw)

# Task prompt: "<md>" requests markdown output.
# NOTE(review): "<ocr>" is the documented alternative for text + bounding
# boxes - confirm which task is intended here.
prompt = "<md>"
inputs = processor(text=prompt, images=image, return_tensors="pt")

# "height" and "width" are removed from the inputs before they are forwarded
# to generate(); they are only used here to derive the scale factors.
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Ratio of the original image size to the processor-reported size. Not used
# below; presumably kept for mapping coordinates back to the original image.
scale_height = raw_height / height
scale_width = raw_width / width

# Move every tensor to the target device, leaving None entries untouched.
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
# Cast the image patches to the dtype the model weights were loaded in.
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text[0])