Cactus supports vision-language models (VLMs) that can process both images and text in the same conversation. These models understand visual content and can answer questions, describe images, and perform visual reasoning.
# Single-turn request: attach two images to one user message and ask the
# model to compare them.
turn = {
    "role": "user",
    "content": "Compare these two images. What are the differences?",
    "images": ["image1.jpg", "image2.jpg"],
}
messages = json.dumps([turn])
raw = cactus_complete(model, messages, None, None, None)
result = json.loads(raw)
print(result["response"])
# Multi-turn chat: the image is sent once; later turns rely on the
# accumulated conversation history instead of re-attaching it.
conversation = []

# Turn 1: present the image and ask an open question.
conversation.append({
    "role": "user",
    "content": "What's in this image?",
    "images": ["scene.jpg"],
})
messages = json.dumps(conversation)
result = json.loads(cactus_complete(model, messages, None, None, None))
conversation.append({"role": "assistant", "content": result["response"]})

# Turn 2: follow-up question — no new image is needed.
conversation.append({"role": "user", "content": "What color is the car?"})
messages = json.dumps(conversation)
result = json.loads(cactus_complete(model, messages, None, None, None))
print(result["response"])
# Ask several independent questions about the same photo, one call each.
tasks = [
    "How many people are in this photo?",
    "What is the dominant color?",
    "Is this photo taken indoors or outdoors?",
    "What time of day is it?",
]

for question in tasks:
    payload = [{"role": "user", "content": question, "images": ["photo.jpg"]}]
    messages = json.dumps(payload)
    result = json.loads(cactus_complete(model, messages, None, None, None))
    print(f"Q: {question}")
    print(f"A: {result['response']}\n")
# Detection-style prompt: enumerate visible objects and where they appear.
request = [{
    "role": "user",
    "content": "List all objects visible in this image with their approximate locations.",
    "images": ["scene.jpg"],
}]
messages = json.dumps(request)
result = json.loads(cactus_complete(model, messages, None, None, None))
print(result["response"])
# Two failure channels are handled: the call may raise RuntimeError, and a
# completed call may still report failure via the "success" flag in its JSON.
try:
    raw = cactus_complete(model, messages, None, None, None)
    result = json.loads(raw)
    if not result["success"]:
        print(f"Error: {result['error']}")
except RuntimeError as e:
    print(f"Failed to process image: {e}")