"""
LongCLIP processor for preprocessing images and text.

This module provides a processor that combines image and text preprocessing
for LongCLIP models.
"""

from typing import List, Optional, Union

from transformers import CLIPImageProcessor, CLIPTokenizer
from transformers.processing_utils import ProcessorMixin


class LongCLIPProcessor(ProcessorMixin):
    """
    Processor for LongCLIP that combines image and text preprocessing.

    This processor wraps CLIPImageProcessor and CLIPTokenizer to provide
    a unified interface for preprocessing inputs for LongCLIP models.

    Args:
        image_processor (CLIPImageProcessor): Image processor for preprocessing images.
        tokenizer (CLIPTokenizer): Tokenizer for preprocessing text.

    Attributes:
        image_processor_class (str): Name of the image processor class.
        tokenizer_class (str): Name of the tokenizer class.

    Example:
        ```python
        >>> from long_clip_hf import LongCLIPProcessor
        >>> from transformers import CLIPImageProcessor, CLIPTokenizer
        >>> from PIL import Image
        >>>
        >>> # Initialize the processor
        >>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
        >>>
        >>> # Process inputs
        >>> image = Image.open("path/to/image.jpg")
        >>> text = "a photo of a cat"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", padding=True, max_length=248)
        >>>
        >>> # `inputs` now contains 'input_ids', 'attention_mask', and 'pixel_values'
        ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = "CLIPTokenizer"

    def __init__(
        self,
        image_processor: Optional[CLIPImageProcessor] = None,
        tokenizer: Optional[CLIPTokenizer] = None,
        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[str, List[str], None] = None,
        images=None,
        return_tensors: Optional[str] = "pt",
        padding: Union[bool, str] = True,
        max_length: Optional[int] = 248,
        truncation: Optional[bool] = True,
        **kwargs,
    ):
        """
        Preprocess text and images for a LongCLIP model.

        Args:
            text (str, List[str], optional): Text or list of texts to process.
            images: Image or list of images to process. Can be a PIL Image, numpy array, or tensor.
            return_tensors (str, optional): Type of tensors to return ("pt" for PyTorch).
            padding (bool or str, optional): Padding strategy. Defaults to True.
            max_length (int, optional): Maximum sequence length. Defaults to 248, LongCLIP's extended context.
            truncation (bool, optional): Whether to truncate sequences longer than max_length. Defaults to True.
            **kwargs: Additional keyword arguments forwarded to the tokenizer.

        Returns:
            dict: Dictionary containing the processed inputs with keys:
                - input_ids: Tokenized text (if text is provided)
                - attention_mask: Attention mask for the text (if text is provided)
                - pixel_values: Processed images (if images are provided)

        Raises:
            ValueError: If neither `text` nor `images` is provided.
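
        Example:
            A minimal text-only call (a sketch; the exact token count depends
            on the tokenizer, but the returned keys are fixed):

            ```python
            >>> batch = processor(text="a photo of a cat")
            >>> sorted(batch.keys())
            ['attention_mask', 'input_ids']
            ```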
        """
        if text is None and images is None:
            raise ValueError("You have to specify either `text` or `images`.")

        if text is not None:
            # Tokenize with LongCLIP's extended 248-token context by default.
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_length,
                truncation=truncation,
                **kwargs,
            )
        else:
            text_inputs = {}

        if images is not None:
            image_inputs = self.image_processor(
                images,
                return_tensors=return_tensors,
            )
        else:
            image_inputs = {}

        # Merge both modalities into a single mapping.
        return {**text_inputs, **image_inputs}

    def batch_decode(self, *args, **kwargs):
        """
        Decode batches of token IDs back to text.

        All arguments are forwarded to the tokenizer's `batch_decode` method.
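
        Example:
            A round-trip sketch (assumes `processor` is set up as in the
            class docstring; CLIP's tokenizer lowercases its input):

            ```python
            >>> ids = processor(text="a photo of a cat")["input_ids"]
            >>> processor.batch_decode(ids, skip_special_tokens=True)
            ['a photo of a cat']
            ```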
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Decode a single sequence of token IDs back to text.

        All arguments are forwarded to the tokenizer's `decode` method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Get the names of the inputs the model expects.

        Returns:
            List[str]: Input names from the tokenizer and the image
            processor, in order and with duplicates removed.
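
        Example:
            Typical values for CLIP-style components (the exact list depends
            on the configured tokenizer and image processor):

            ```python
            >>> processor.model_input_names
            ['input_ids', 'attention_mask', 'pixel_values']
            ```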
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys preserves insertion order while dropping duplicates.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
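

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the module's API. The checkpoint name and
# image path are illustrative assumptions; running this requires network
# access (or a cached checkpoint) and a local image file.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from PIL import Image

    image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)

    # Text-only: sequences are padded per batch and truncated at 248 tokens.
    text_batch = processor(text=["a photo of a cat", "a photo of a dog"])
    print(sorted(text_batch.keys()))  # ['attention_mask', 'input_ids']

    # Joint text + image preprocessing (the image path is a placeholder).
    image = Image.open("path/to/image.jpg")
    inputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
    print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']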