# Source: orynxml-agents/app/agent/huggingface_agent.py
# Uploaded by Speedofmastery using huggingface_hub
# Commit 88f3fce (verified)
"""
Hugging Face Agent Integration for OpenManus
Extends the main AI agent with access to thousands of HuggingFace models
"""
import os
import re
from typing import Any, Dict, List, Optional

from app.agent.base import BaseAgent
from app.huggingface_models import ModelCategory
from app.logger import logger
from app.tool.huggingface_models_tool import HuggingFaceModelsTool
# Error text returned by every public method when no HF token was configured.
# Kept at module level (not as a class attribute) so it cannot interfere with
# whatever attribute handling BaseAgent applies to its subclasses.
_HF_UNAVAILABLE_MESSAGE = "HuggingFace integration not available"


def _hf_unavailable() -> Dict[str, Any]:
    """Build a fresh error payload for the "no HuggingFace tool" case."""
    return {"error": _HF_UNAVAILABLE_MESSAGE}


def _has_stem(text: str, *stems: str) -> bool:
    """Return True if any of ``stems`` begins a word in ``text``.

    ``"embed"`` matches "embed" and "embeddings"; ``"motion"`` does not match
    "emotion" because the match must start on a word boundary.
    """
    return any(re.search(rf"\b{re.escape(stem)}", text) for stem in stems)


def _has_word(text: str, *words: str) -> bool:
    """Return True if any of ``words`` occurs in ``text`` as a whole word.

    A trailing plural ("s"/"es") is tolerated, so ``"app"`` matches "app" and
    "apps" but not "apple" or "happy". Used for short or ambiguous tokens
    ("ar", "vr", "api", "form", ...) that produce false hits as prefixes.
    """
    return any(
        re.search(rf"\b{re.escape(word)}(?:e?s)?\b", text) for word in words
    )


def _sniff_content_type(content: Any) -> str:
    """Classify ``content`` as "text", "image" or "audio".

    Non-bytes input is "text". Bytes are "image" when the first 20 bytes carry
    a PNG, JPEG, GIF or WebP signature; any other bytes are assumed to be audio.
    """
    if not isinstance(content, bytes):
        return "text"
    header = content[:20]
    looks_like_image = (
        b"PNG" in header
        or b"JFIF" in header
        or b"GIF" in header
        # JPEG start-of-image marker: also covers Exif JPEGs without "JFIF".
        or header.startswith(b"\xff\xd8\xff")
        # WebP: "RIFF" <size> "WEBP" (WAV audio uses "WAVE" in the same slot).
        or b"WEBP" in header
    )
    return "image" if looks_like_image else "audio"


class HuggingFaceAgent(BaseAgent):
    """AI agent with integrated HuggingFace model access.

    Each ``*_with_hf`` coroutine delegates to ``self.hf_tool`` and returns the
    tool's result unchanged. When no token was configured, ``self.hf_tool`` is
    ``None`` and those methods return ``{"error": ...}`` instead of raising.
    """

    def __init__(self, **config):
        """Initialise the base agent and, if a token is available, the HF tool.

        The token is taken from the ``HUGGINGFACE_TOKEN`` environment variable
        first and from ``config["huggingface_token"]`` second. All of
        ``config`` is forwarded to ``BaseAgent.__init__``.
        """
        super().__init__(**config)

        hf_token = os.getenv("HUGGINGFACE_TOKEN") or config.get("huggingface_token")
        if hf_token:
            self.hf_tool = HuggingFaceModelsTool(hf_token)
        else:
            logger.warning(
                "No Hugging Face token provided. HF models will not be available."
            )
            self.hf_tool = None

        # Model used by each task helper when the caller does not name one.
        self.default_models = {
            "text_generation": "MiniMax-M2",  # Latest high-performance model
            "image_generation": "FLUX.1 Dev",  # Best quality image generation
            "speech_recognition": "Whisper Large v3",  # Best multilingual ASR
            "text_to_speech": "Kokoro 82M",  # High quality, lightweight TTS
            "image_classification": "ViT Base Patch16",  # General image classification
            "embeddings": "Sentence Transformers All MiniLM",  # Fast embeddings
            "translation": "M2M100 1.2B",  # Multilingual translation
            "summarization": "PEGASUS XSum",  # Abstractive summarization
        }

    # ------------------------------------------------------------------
    # Core task helpers
    # ------------------------------------------------------------------

    async def generate_text_with_hf(
        self,
        prompt: str,
        model_name: Optional[str] = None,
        max_tokens: int = 200,
        temperature: float = 0.7,
        stream: bool = False,
    ) -> Dict[str, Any]:
        """Generate text via ``hf_tool.text_generation``.

        Args:
            prompt: Input prompt.
            model_name: Model to use; defaults to the "text_generation" default.
            max_tokens: Forwarded to the tool.
            temperature: Forwarded to the tool.
            stream: Forwarded to the tool.

        Returns:
            The tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.text_generation(
            model_name=model_name or self.default_models["text_generation"],
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )

    async def generate_image_with_hf(
        self,
        prompt: str,
        model_name: Optional[str] = None,
        negative_prompt: Optional[str] = None,
        width: int = 1024,
        height: int = 1024,
    ) -> Dict[str, Any]:
        """Generate an image via ``hf_tool.generate_image``.

        ``model_name`` defaults to the "image_generation" default; the other
        arguments are forwarded unchanged. Returns the tool's result, or an
        error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.generate_image(
            model_name=model_name or self.default_models["image_generation"],
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
        )

    async def transcribe_audio_with_hf(
        self,
        audio_data: bytes,
        model_name: Optional[str] = None,
        language: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Transcribe audio via ``hf_tool.transcribe_audio``.

        ``model_name`` defaults to the "speech_recognition" default. Returns
        the tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.transcribe_audio(
            model_name=model_name or self.default_models["speech_recognition"],
            audio_data=audio_data,
            language=language,
        )

    async def synthesize_speech_with_hf(
        self,
        text: str,
        model_name: Optional[str] = None,
        voice_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Synthesize speech via ``hf_tool.text_to_speech``.

        ``model_name`` defaults to the "text_to_speech" default. Returns the
        tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.text_to_speech(
            model_name=model_name or self.default_models["text_to_speech"],
            text=text,
            voice_id=voice_id,
        )

    async def classify_image_with_hf(
        self, image_data: bytes, model_name: Optional[str] = None, task: str = "general"
    ) -> Dict[str, Any]:
        """Classify an image via ``hf_tool.classify_image``.

        Args:
            image_data: Raw image bytes.
            model_name: Used only when ``task`` is not one of the special
                tasks below; defaults to the "image_classification" default.
            task: "nsfw", "emotions" or "deepfake" force a dedicated model
                (overriding ``model_name``); any other value means general
                classification.

        Returns:
            The tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()

        task_models = {
            "nsfw": "NSFW Image Detection",
            "emotions": "Facial Emotions Detection",
            "deepfake": "Deepfake Detection",
        }
        if task in task_models:
            model_name = task_models[task]
        else:
            model_name = model_name or self.default_models["image_classification"]

        return await self.hf_tool.classify_image(
            model_name=model_name, image_data=image_data
        )

    async def get_text_embeddings_with_hf(
        self, texts: List[str], model_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Embed ``texts`` via ``hf_tool.get_embeddings``.

        ``model_name`` defaults to the "embeddings" default. Returns the
        tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.get_embeddings(
            model_name=model_name or self.default_models["embeddings"], texts=texts
        )

    async def translate_with_hf(
        self,
        text: str,
        target_language: str,
        source_language: Optional[str] = None,
        model_name: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Translate ``text`` via ``hf_tool.translate_text``.

        ``model_name`` defaults to the "translation" default. Returns the
        tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.translate_text(
            model_name=model_name or self.default_models["translation"],
            text=text,
            source_language=source_language,
            target_language=target_language,
        )

    async def summarize_with_hf(
        self, text: str, model_name: Optional[str] = None, max_length: int = 150
    ) -> Dict[str, Any]:
        """Summarize ``text`` via ``hf_tool.summarize_text``.

        ``model_name`` defaults to the "summarization" default. Returns the
        tool's result, or an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.summarize_text(
            model_name=model_name or self.default_models["summarization"],
            text=text,
            max_length=max_length,
        )

    def get_available_hf_models(self, category: Optional[str] = None) -> Dict[str, Any]:
        """Return ``hf_tool.list_available_models(category=category)``.

        Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return self.hf_tool.list_available_models(category=category)

    # ------------------------------------------------------------------
    # Model selection and task routing
    # ------------------------------------------------------------------

    async def smart_model_selection(
        self, task_description: str, content_type: str = "text"
    ) -> str:
        """Pick a model name for a free-text task description.

        Keywords are matched case-insensitively on word boundaries, so a
        keyword embedded in another word ("ar" in "summarize", "quest" in
        "question") does not trigger a category. Categories are checked in the
        order written below and the first match wins: topical domains first,
        then specific tasks, then generic text generation.

        Args:
            task_description: Description of what the user wants to do.
            content_type: Type of the input content ("text", "image",
                "audio"). Only "image" and "audio" influence the result.

        Returns:
            A model display name; always a string ("MiniMax-M2" when nothing
            more specific matches).
        """
        text = task_description.lower()

        def stem(*stems: str) -> bool:
            return _has_stem(text, *stems)

        def word(*words: str) -> bool:
            return _has_word(text, *words)

        # Video generation and processing
        if stem("video", "movie", "animation", "motion", "sequence", "frames") or word(
            "gif"
        ):
            if stem("generate", "create"):
                return "Stable Video Diffusion"
            if stem("analyze", "describe"):
                return "Video ChatGPT"
            return "AnimateDiff"

        # Code and app development
        if stem(
            "code",
            "programming",
            "application",
            "software",
            "develop",
            "build",
            "function",
            "database",
            "website",
            "frontend",
            "backend",
        ) or word("app", "class", "api"):
            if word("app") or stem("application"):
                return "CodeLlama 34B Instruct"  # Best for full applications
            if stem("python"):
                return "WizardCoder 34B"  # Python specialist
            if word("api"):
                return "StarCoder2 15B"  # Good for APIs
            if stem("explain", "comment"):
                return "Phind CodeLlama"  # Best for code explanation
            return "DeepSeek Coder V2"  # General coding

        # 3D and AR/VR content
        if stem(
            "three dimensional",
            "mesh",
            "model",
            "augmented reality",
            "virtual reality",
            "texture",
            "material",
        ) or word("3d", "obj", "stl", "ar", "vr"):
            # Whole-word "text" so that "texture" does not count as text input.
            if word("text") and (word("3d") or stem("model")):
                return "Shap-E"
            if stem("image") and word("3d"):
                return "DreamFusion"
            return "Point-E"

        # Document processing and OCR
        if stem(
            "document",
            "scan",
            "extract text",
            "handwriting",
            "layout",
            "invoice",
            "receipt",
            "contract",
        ) or word("ocr", "pdf", "form", "table"):
            if stem("handwriting", "handwritten"):
                return "TrOCR Handwritten"
            if word("table"):
                return "TableTransformer"
            if word("form"):
                return "FormNet"
            return "TrOCR Large"

        # Multimodal AI
        if stem(
            "visual question",
            "image question",
            "describe image",
            "multimodal",
            "vision language",
            "image text",
            "cross modal",
        ):
            if stem("chat", "conversation"):
                return "GPT-4V"
            if stem("question"):
                return "LLaVA"
            return "BLIP-2"

        # Creative content
        if stem(
            "story",
            "creative",
            "poem",
            "poetry",
            "novel",
            "screenplay",
            "script",
            "blog",
            "article",
            "marketing",
            "copy",
            "advertising",
        ):
            if stem("story", "novel"):
                return "Novel AI"
            if stem("poem", "poetry"):
                return "Poet Assistant"
            if stem("marketing", "copy"):
                return "Marketing Copy AI"
            return "GPT-3.5 Creative"

        # Game development
        if stem(
            "game", "character", "dialogue", "asset", "gameplay", "mechanic"
        ) or word("npc", "level", "quest", "unity", "unreal"):
            if stem("character"):
                return "Character AI"
            if word("level") or stem("environment"):
                return "Level Designer"
            if stem("dialogue", "conversation"):
                return "Dialogue Writer"
            return "Asset Creator"

        # Science and research
        if stem(
            "research",
            "scientific",
            "paper",
            "analysis",
            "data",
            "protein",
            "molecule",
            "chemistry",
            "biology",
            "physics",
            "experiment",
        ):
            if stem("protein", "folding"):
                return "AlphaFold"
            if stem("molecule", "chemistry"):
                return "ChemBERTa"
            if stem("data") and stem("analysis"):
                return "Data Analyst"
            return "SciBERT"

        # Business and productivity
        if stem(
            "email",
            "business",
            "report",
            "presentation",
            "meeting",
            "project",
            "plan",
            "proposal",
            "letter",
            "professional",
        ) or word("memo"):
            if stem("email"):
                return "Email Assistant"
            if stem("presentation"):
                return "Presentation AI"
            if stem("report"):
                return "Report Writer"
            if stem("meeting"):
                return "Meeting Summarizer"
            return "Project Planner"

        # Explicit transcription requests. Checked before the "audio" keyword
        # of the specialized branch, which would otherwise pick an image model.
        if stem("transcribe", "speech to text"):
            return "Whisper Large v3"

        # Specialized AI (music, audio, image restoration/editing)
        if stem(
            "music",
            "audio",
            "sound",
            "voice clone",
            "enhance",
            "restore",
            "upscale",
            "remove background",
            "inpaint",
            "style transfer",
        ):
            if stem("music"):
                return "MusicGen"
            if stem("voice") and stem("clone"):
                return "Voice Cloner"
            if stem("upscale", "enhance"):
                return "Real-ESRGAN"
            if stem("background") and stem("remove"):
                return "Background Remover"
            if stem("restore", "face"):
                return "GFPGAN"
            return "LaMa"

        # Image analysis (only when the input actually is an image). Checked
        # before image generation, whose "image" keyword would shadow it.
        if content_type == "image" and stem(
            "classify", "analyze image", "detect", "recognize"
        ):
            if stem("nsfw", "safe", "unsafe"):
                return "NSFW Image Detection"
            if stem("emotion", "face"):
                return "Facial Emotions Detection"
            if stem("deepfake", "fake"):
                return "Deepfake Detection"
            return "ViT Base Patch16"  # General classification

        # Image generation
        if stem(
            "image", "picture", "draw", "photo", "visual", "artwork", "artistic"
        ) or word("art"):
            if stem("fast", "quick"):
                return "FLUX.1 Schnell"
            return "FLUX.1 Dev"

        # Speech recognition on audio input without an explicit "transcribe".
        # Non-audio input falls through to the remaining categories instead of
        # ending the selection without a result.
        if content_type == "audio" and stem("recognize"):
            return "Whisper Large v3"

        # Text-to-speech
        if stem("speak", "voice", "text to speech") or word("tts"):
            if stem("fast"):
                return "Kokoro 82M"  # Lightweight and fast
            return "VibeVoice 1.5B"  # High quality

        # Translation
        if stem("translate", "language", "convert"):
            return "M2M100 1.2B"  # Multilingual translation

        # Summarization
        if stem("summarize", "summary", "abstract", "brief"):
            return "PEGASUS XSum"  # Best summarization

        # Embeddings/similarity
        if stem("similar", "embed", "vector", "search", "match"):
            return "Sentence Transformers All MiniLM"  # Fast embeddings

        # Generic chat. Checked last so that "generate"/"create"/"write"
        # requests for a specific modality are resolved by the branches above.
        if stem("chat", "conversation"):
            return "Llama 3.1 8B Instruct"

        # Generic generation and default fallback
        return "MiniMax-M2"  # Best general-purpose model

    async def execute_hf_task(
        self, task: str, content: Any, model_name: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Route a free-text task to the matching ``*_with_hf`` helper.

        Args:
            task: Task description (e.g. "generate image", "transcribe audio").
            content: Input content (text, image bytes, audio bytes).
            model_name: Specific model to use; when omitted it is chosen by
                ``smart_model_selection``.
            **kwargs: Extra arguments forwarded to the selected helper (not
                forwarded on the embeddings route).

        Returns:
            The helper's result. Any exception raised while selecting or
            running the task is logged and returned as ``{"error": ...}``.
        """
        if not self.hf_tool:
            return _hf_unavailable()

        try:
            task_lower = task.lower()
            content_type = _sniff_content_type(content)

            # Auto-select model if not specified
            if not model_name:
                model_name = await self.smart_model_selection(task, content_type)

            # Route to the appropriate helper based on the task wording
            if "generate" in task_lower and (
                "image" in task_lower or "picture" in task_lower
            ):
                return await self.generate_image_with_hf(content, model_name, **kwargs)
            if "transcribe" in task_lower or "speech to text" in task_lower:
                return await self.transcribe_audio_with_hf(
                    content, model_name, **kwargs
                )
            if "text to speech" in task_lower or "tts" in task_lower:
                return await self.synthesize_speech_with_hf(
                    content, model_name, **kwargs
                )
            if "classify" in task_lower and content_type == "image":
                return await self.classify_image_with_hf(content, model_name, **kwargs)
            if "embed" in task_lower or "vector" in task_lower:
                texts = [content] if isinstance(content, str) else content
                return await self.get_text_embeddings_with_hf(texts, model_name)
            if "translate" in task_lower:
                return await self.translate_with_hf(
                    content, model_name=model_name, **kwargs
                )
            if "summarize" in task_lower:
                return await self.summarize_with_hf(content, model_name, **kwargs)

            # Default to text generation
            return await self.generate_text_with_hf(content, model_name, **kwargs)
        except Exception as e:
            # Boundary handler: callers receive an error payload, never a raise.
            logger.error(f"HuggingFace task execution failed: {e}")
            return {"error": f"Task execution failed: {str(e)}"}

    async def chat_with_hf_models(
        self, message: str, conversation_history: Optional[List[Dict]] = None
    ) -> Dict[str, Any]:
        """Answer a chat message, using HuggingFace models where they apply.

        Order of handling:
          1. "list ... model/hf" requests return the available-model listing.
          2. Messages containing an HF keyword are run through
             ``execute_hf_task`` with the message as both task and content.
          3. Everything else goes to ``super().chat`` (assumes BaseAgent
             provides an async ``chat(message, history)`` - TODO confirm);
             a dict response gains an ``"hf_suggestion"`` entry when the
             message mentions generating an image.
        """
        message_lower = message.lower()

        # Handle model listing requests
        if "list" in message_lower and (
            "model" in message_lower or "hf" in message_lower
        ):
            return self.get_available_hf_models()

        # Handle specific model requests
        hf_keywords = [
            "generate image",
            "create image",
            "draw",
            "picture",
            "transcribe",
            "speech to text",
            "audio",
            "text to speech",
            "speak",
            "voice",
            "translate",
            "language",
            "classify image",
            "embed",
            "vector",
            "similarity",
            "summarize",
        ]
        if any(keyword in message_lower for keyword in hf_keywords):
            return await self.execute_hf_task(message, message)

        # Regular chat: get a response from the base agent first
        base_response = await super().chat(message, conversation_history)

        # Only annotate dict responses; other response types are returned as-is.
        if (
            isinstance(base_response, dict)
            and "image" in message_lower
            and "generate" in message_lower
        ):
            base_response["hf_suggestion"] = {
                "action": "generate_image",
                "models": ["FLUX.1 Dev", "FLUX.1 Schnell", "Stable Diffusion XL"],
                "message": "I can also generate images for you using HuggingFace models. Just ask!",
            }
        return base_response

    # ------------------------------------------------------------------
    # Helpers for the expanded model categories
    # ------------------------------------------------------------------

    async def generate_video_with_hf(
        self, prompt: str, model_name: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Generate video via ``hf_tool.text_to_video``.

        ``model_name`` defaults to "Stable Video Diffusion"; ``kwargs`` are
        forwarded. Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.text_to_video(
            model_name=model_name or "Stable Video Diffusion", prompt=prompt, **kwargs
        )

    async def generate_code_with_hf(
        self,
        prompt: str,
        language: str = "python",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate code via ``hf_tool.code_generation``.

        ``model_name`` defaults to "CodeLlama 34B Instruct"; ``language`` and
        ``kwargs`` are forwarded. Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.code_generation(
            model_name=model_name or "CodeLlama 34B Instruct",
            prompt=prompt,
            language=language,
            **kwargs,
        )

    async def generate_app_with_hf(
        self,
        description: str,
        app_type: str = "web_app",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate an application via ``hf_tool.code_generation``.

        The prompt sent to the tool is
        ``"Create a {app_type} application: {description}"``. ``model_name``
        defaults to "CodeLlama 34B Instruct". Returns an error dict when HF is
        unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        enhanced_prompt = f"Create a {app_type} application: {description}"
        return await self.hf_tool.code_generation(
            model_name=model_name or "CodeLlama 34B Instruct",
            prompt=enhanced_prompt,
            **kwargs,
        )

    async def generate_3d_model_with_hf(
        self, prompt: str, model_name: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Generate a 3D model via ``hf_tool.text_to_3d``.

        ``model_name`` defaults to "Shap-E". Returns an error dict when HF is
        unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.text_to_3d(
            model_name=model_name or "Shap-E", prompt=prompt, **kwargs
        )

    async def process_document_with_hf(
        self,
        document_data: bytes,
        task_type: str = "ocr",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run OCR or document analysis on ``document_data``.

        ``task_type == "ocr"`` calls ``hf_tool.ocr`` (default model
        "TrOCR Large"); any other value calls ``hf_tool.document_analysis``
        (default model "LayoutLMv3"). Returns an error dict when HF is
        unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        if task_type == "ocr":
            return await self.hf_tool.ocr(
                model_name=model_name or "TrOCR Large",
                image_data=document_data,
                **kwargs,
            )
        return await self.hf_tool.document_analysis(
            model_name=model_name or "LayoutLMv3",
            document_data=document_data,
            **kwargs,
        )

    async def multimodal_chat_with_hf(
        self, image_data: bytes, text: str, model_name: Optional[str] = None, **kwargs
    ) -> Dict[str, Any]:
        """Ask about an image via ``hf_tool.vision_language``.

        ``model_name`` defaults to "BLIP-2". Returns an error dict when HF is
        unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.vision_language(
            model_name=model_name or "BLIP-2",
            image_data=image_data,
            text=text,
            **kwargs,
        )

    async def generate_music_with_hf(
        self,
        prompt: str,
        duration: int = 30,
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate music via ``hf_tool.music_generation``.

        ``model_name`` defaults to "MusicGen"; ``duration`` is forwarded
        unchanged. Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.music_generation(
            model_name=model_name or "MusicGen",
            prompt=prompt,
            duration=duration,
            **kwargs,
        )

    async def enhance_image_with_hf(
        self,
        image_data: bytes,
        task_type: str = "super_resolution",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Enhance an image.

        Supported ``task_type`` values and the tool call each one makes:
          * "super_resolution"   -> ``hf_tool.super_resolution`` ("Real-ESRGAN")
          * "background_removal" -> ``hf_tool.background_removal``
            ("Background Remover")
          * "face_restoration"   -> ``hf_tool.super_resolution`` ("GFPGAN")

        Returns:
            The tool's result; an error dict when HF is unavailable or when
            ``task_type`` is not one of the values above.
        """
        if not self.hf_tool:
            return _hf_unavailable()

        if task_type == "super_resolution":
            return await self.hf_tool.super_resolution(
                model_name=model_name or "Real-ESRGAN", image_data=image_data, **kwargs
            )
        if task_type == "background_removal":
            return await self.hf_tool.background_removal(
                model_name=model_name or "Background Remover",
                image_data=image_data,
                **kwargs,
            )
        if task_type == "face_restoration":
            # Face restoration goes through the super-resolution endpoint.
            return await self.hf_tool.super_resolution(
                model_name=model_name or "GFPGAN", image_data=image_data, **kwargs
            )
        # Unknown task types get an explicit error, not an implicit None.
        return {"error": f"Unsupported image enhancement task: {task_type}"}

    async def generate_creative_content_with_hf(
        self,
        prompt: str,
        content_type: str = "story",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate creative writing via ``hf_tool.creative_writing``.

        The prompt sent to the tool is ``"Write a {content_type}: {prompt}"``.
        ``model_name`` defaults to "GPT-3.5 Creative". Returns an error dict
        when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        enhanced_prompt = f"Write a {content_type}: {prompt}"
        return await self.hf_tool.creative_writing(
            model_name=model_name or "GPT-3.5 Creative",
            prompt=enhanced_prompt,
            **kwargs,
        )

    async def generate_game_content_with_hf(
        self,
        description: str,
        content_type: str = "character",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate game content via ``hf_tool.creative_writing``.

        The prompt sent to the tool is
        ``"Create game {content_type}: {description}"``. ``model_name``
        defaults to "Character AI". Returns an error dict when HF is
        unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        enhanced_prompt = f"Create game {content_type}: {description}"
        return await self.hf_tool.creative_writing(
            model_name=model_name or "Character AI",
            prompt=enhanced_prompt,
            **kwargs,
        )

    async def generate_business_document_with_hf(
        self,
        context: str,
        document_type: str = "email",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate a business document via ``hf_tool.business_document``.

        ``model_name`` defaults to "Email Assistant" for every
        ``document_type``. Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        return await self.hf_tool.business_document(
            model_name=model_name or "Email Assistant",
            document_type=document_type,
            context=context,
            **kwargs,
        )

    async def research_assistance_with_hf(
        self,
        topic: str,
        research_type: str = "analysis",
        model_name: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate research text via ``hf_tool.text_generation``.

        The prompt sent to the tool is
        ``"Research {research_type} on: {topic}"``. ``model_name`` defaults to
        "SciBERT". Returns an error dict when HF is unavailable.
        """
        if not self.hf_tool:
            return _hf_unavailable()
        enhanced_prompt = f"Research {research_type} on: {topic}"
        return await self.hf_tool.text_generation(
            model_name=model_name or "SciBERT", prompt=enhanced_prompt, **kwargs
        )