RSCLIP Collections
A collection of Remote Sensing CLIP models, published in a production-ready style for use both as huggingface/transformers models and as huggingface/diffusers text encoders.
This model is a mirror/redistribution of the original lcybuaa/Git-RSCLIP model.
Git-RSCLIP is pre-trained on the Git-10M dataset, a global-scale remote sensing image-text pair dataset consisting of 10 million pairs. It uses a structure similar to SigLIP and is designed for tasks like image-text retrieval and zero-shot classification in the remote sensing domain.
transformers
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load model and processor
model = CLIPModel.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")
processor = CLIPProcessor.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")

# Load and process image
image = Image.open("path/to/your/image.jpg")
inputs = processor(
    text=["a photo of a building", "a photo of vegetation", "a photo of water"],
    images=image,
    return_tensors="pt",
    padding=True
)

# Get image-text similarity scores
with torch.inference_mode():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(f"Similarity scores: {probs}")
Zero-shot image classification:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")
processor = CLIPProcessor.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")

# Define candidate labels
candidate_labels = [
    "a satellite image of urban area",
    "a satellite image of forest",
    "a satellite image of agricultural land",
    "a satellite image of water body"
]

image = Image.open("path/to/your/image.jpg")
inputs = processor(
    text=candidate_labels,
    images=image,
    return_tensors="pt",
    padding=True
)

with torch.inference_mode():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)

# Get the predicted label
predicted_idx = probs.argmax().item()
print(f"Predicted label: {candidate_labels[predicted_idx]}")
print(f"Confidence: {probs[0][predicted_idx]:.4f}")
Extracting individual features:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")
processor = CLIPProcessor.from_pretrained("BiliSakura/Git-RSCLIP-ViT-L-16")

# Get image features only
image = Image.open("path/to/your/image.jpg")
image_inputs = processor(images=image, return_tensors="pt")
with torch.inference_mode():
    image_features = model.get_image_features(**image_inputs)

# Get text features only
text_inputs = processor(
    text=["a satellite image of urban area"],
    return_tensors="pt",
    padding=True,
    truncation=True
)
with torch.inference_mode():
    text_features = model.get_text_features(**text_inputs)

print(f"Image features shape: {image_features.shape}")
print(f"Text features shape: {text_features.shape}")
diffusers
This model's text encoder can be used with Stable Diffusion and other diffusion models:
from transformers import CLIPTextModel, CLIPTokenizer
import torch

# Load the text encoder and tokenizer
# (adjust the subfolder name if the repo layout differs)
text_encoder = CLIPTextModel.from_pretrained(
    "BiliSakura/Git-RSCLIP-ViT-L-16",
    subfolder="text_encoder",
    torch_dtype=torch.float16
)
tokenizer = CLIPTokenizer.from_pretrained(
    "BiliSakura/Git-RSCLIP-ViT-L-16"
)

# Encode text prompt
prompt = "a satellite image of a city with buildings and roads"
text_inputs = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt"
)

with torch.inference_mode():
    text_outputs = text_encoder(text_inputs.input_ids)
    text_embeddings = text_outputs.last_hidden_state

print(f"Text embeddings shape: {text_embeddings.shape}")
Using with Stable Diffusion:
from diffusers import StableDiffusionPipeline
import torch

# Load pipeline with the custom text encoder
# (`text_encoder` and `tokenizer` come from the snippet above)
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

# Generate image
prompt = "a high-resolution satellite image of urban area"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
image.save("generated_image.png")
If you use this model in your research, please cite the original work:
@misc{liu2025text2earthunlockingtextdrivenremote,
  title={Text2Earth: Unlocking Text-driven Remote Sensing Image Generation with a Global-Scale Dataset and a Foundation Model},
  author={Chenyang Liu and Keyan Chen and Rui Zhao and Zhengxia Zou and Zhenwei Shi},
  year={2025},
  eprint={2501.00895},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2501.00895},
}