• About Us
  • Disclaimer
  • Contact Us
  • Privacy Policy
Thursday, March 26, 2026
mGrowTech
No Result
View All Result
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions
No Result
View All Result
mGrowTech
No Result
View All Result
Home Al, Analytics and Automation

How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

by Josh
March 26, 2026
in Al, Analytics and Automation
0
How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction


def parse_click_coords(action_str):
   """
   Extract normalised (x, y) coordinates from a click action string.

   e.g., 'click(0.45, 0.32)' -> (0.45, 0.32)

   The click pattern is searched for anywhere in ``action_str``, so text
   before or after the call is tolerated. Each coordinate must be a
   well-formed unsigned decimal ('1', '0.45', '.5' or '1.'); malformed
   numbers such as '1.2.3' or '.' are treated as "not a click" rather than
   raising ValueError from float().

   Returns a tuple of two floats, or None if no well-formed click is found.
   """
   # Strict decimal: digits with an optional fraction, or a bare fraction.
   number = r"(\d+(?:\.\d*)?|\.\d+)"
   match = re.search(
      r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)", action_str
   )
   if match is None:
      return None
   return float(match.group(1)), float(match.group(2))




def parse_action_details(action_str):
   """
   Parse a MolmoWeb action string into a structured dict.

   Leading/trailing whitespace is stripped and every pattern is matched at
   the start of the string. Anything that matches no known action
   (including a click whose coordinates are not well-formed decimals, e.g.
   'click(1.2.3, 0.5)') is returned as "unknown" instead of raising.

   Returns:  {"type": "click", "x": 0.45, "y": 0.32}
             {"type": "goto", "url": "https://..."}
             {"type": "type", "text": "query text"}
             {"type": "scroll", "direction": "down"}
             {"type": "press", "key": "Enter"}
             {"type": "send_msg", "message": "The answer is ..."}
             {"type": "new_tab" | "go_back" | "switch_tab"}  (plus "tab": int
                 when a numeric argument is given)
             {"type": "unknown", "raw": "..."}
   """
   action_str = action_str.strip()

   # Strict decimal so that float() below can never raise ValueError.
   number = r"(\d+(?:\.\d*)?|\.\d+)"
   m = re.match(
      r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)", action_str
   )
   if m:
      return {"type": "click", "x": float(m.group(1)), "y": float(m.group(2))}

   # Actions that take one quoted string argument: (name, result key, flags).
   # Only send_msg uses DOTALL, so only its argument may span several lines.
   quoted_arg_actions = (
      ("goto", "url", 0),
      ("type", "text", 0),
      ("press", "key", 0),
      ("send_msg", "message", re.DOTALL),
   )
   for action_name, field, flags in quoted_arg_actions:
      m = re.match(
         action_name + r'\(\s*["\'](.+?)["\']\s*\)', action_str, flags
      )
      if m:
         return {"type": action_name, field: m.group(1)}

   # The scroll direction may be quoted or bare.
   m = re.match(r'scroll\(\s*["\']?(up|down)["\']?\s*\)', action_str)
   if m:
      return {"type": "scroll", "direction": m.group(1)}

   # Tab/navigation actions take an optional integer argument.
   m = re.match(r'(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)', action_str)
   if m:
      result = {"type": m.group(1)}
      if m.group(2):
         result["tab"] = int(m.group(2))
      return result

   return {"type": "unknown", "raw": action_str}




def visualise_click(image, action_str, title="MolmoWeb Prediction"):
   """
   Show the screenshot with the predicted action overlaid.

   If ``action_str`` contains a click, its normalised (0-1) coordinates are
   scaled by ``image.size`` to pixels and marked with a red circle, a cross
   and an annotated label. Otherwise the raw action string is shown in a
   blue banner near the bottom of the axes.
   """
   click_point = parse_click_coords(action_str)

   _, axes = plt.subplots(1, 1, figsize=(12, 7))
   axes.imshow(image)
   axes.set_title(title, fontsize=14)

   if not click_point:
      # Not a click: just caption the screenshot with the action text.
      banner_style = dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8)
      axes.text(
         0.5, 0.02, f"Action: {action_str}",
         transform=axes.transAxes,
         fontsize=12, ha="center", color="white",
         bbox=banner_style,
      )
   else:
      x_norm, y_norm = click_point
      width, height = image.size
      # Normalised coordinates -> pixel coordinates.
      x_px = x_norm * width
      y_px = y_norm * height

      marker = patches.Circle(
         (x_px, y_px),
         radius=18,
         linewidth=3,
         edgecolor="red",
         facecolor="none",
      )
      axes.add_patch(marker)
      axes.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

      label_style = dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.8)
      arrow_style = dict(arrowstyle="->", color="red", lw=2)
      axes.annotate(
         f"click({x_norm:.3f}, {y_norm:.3f})",
         (x_px, y_px),
         xytext=(x_px + 25, y_px - 25),
         fontsize=11, color="white",
         bbox=label_style,
         arrowprops=arrow_style,
      )

   axes.axis("off")
   plt.tight_layout()
   plt.show()




def download_image(url, size=(1280, 720)):
   """
   Download an image from a URL and resize to browser viewport dimensions.

   Args:
      url: Address of the image to fetch (15 second request timeout).
      size: Target (width, height) in pixels; defaults to 1280x720.

   Returns:
      The downloaded image converted to RGB and resized to ``size`` with
      LANCZOS resampling.

   Raises:
      requests.HTTPError: If the server answers with a 4xx/5xx status.
   """
   response = requests.get(url, timeout=15)
   # Fail fast on HTTP errors; otherwise an error page would be handed to
   # PIL and surface as a confusing image-decoding failure.
   response.raise_for_status()
   img = Image.open(BytesIO(response.content)).convert("RGB")
   return img.resize(size, Image.LANCZOS)




def create_synthetic_webpage(title="Example Page", elements=None, size=(1280, 720)):
   """
   Create a synthetic webpage screenshot for testing.

   The image has a white background with a grey browser bar across the top
   (three round window buttons and an address box showing
   https://www.example.com), the page title below it, and then the
   requested elements.

   Args:
      title: Text drawn as the page heading.
      elements: Optional list of dicts
         {"type": "button"|"input"|"text"|"link", "text": str, "pos": (x, y)}.
         Elements with any other "type" are drawn as nothing.
      size: (width, height) of the image in pixels; defaults to 1280x720.
         Only the canvas and the browser bar follow the width; the address
         box and element positions are fixed pixel coordinates.

   Returns:
      The rendered RGB image.
   """
   width, _height = size
   img = Image.new("RGB", size, color=(255, 255, 255))
   draw = ImageDraw.Draw(img)

   # Browser chrome: toolbar strip, address box and URL text.
   draw.rectangle([0, 0, width, 50], fill=(240, 240, 240))
   draw.rectangle([180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white")
   draw.text((200, 16), "https://www.example.com", fill=(100, 100, 100))

   # Three round window-control buttons.
   for cx in (30, 60, 90):
      draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

   draw.text((50, 70), title, fill="black")

   for el in elements or ():
      x, y = el["pos"]
      kind = el["type"]
      if kind == "button":
         draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
         draw.text((x + 10, y + 8), el["text"], fill="white")
      elif kind == "input":
         draw.rectangle([x, y, x + 300, y + 35], outline=(180, 180, 180), width=2)
         draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
      elif kind == "text":
         draw.text((x, y), el["text"], fill="black")
      elif kind == "link":
         draw.text((x, y), el["text"], fill=(66, 133, 244))

   return img




print("Helper functions defined successfully.")




# ---------------------------------------------------------------------
# Section 5: a single inference step starting from an empty page.
# build_prompt, run_inference and parse_thought_and_action are not
# defined in this part of the file; they must already be in scope.
# ---------------------------------------------------------------------
print("\n" + "=" * 70)
print("SECTION 5: Single-step inference - blank page (cold start)")
print("=" * 70)
print("The agent starts at about:blank and must decide its first action.\n")


# Plain white 1280x720 RGB image used as the screenshot for about:blank.
blank_image = Image.new("RGB", (1280, 720), color="white")


# Natural-language task given to the agent.
task = "Go to arxiv.org and find the latest paper about Molmo from Ai2"


# Build the model prompt from the task and the current page URL/index.
prompt = build_prompt(
   task_description=task,
   page_url="about:blank",
   page_index=0,
)


print(f"Task: {task}")
print("Screenshot: blank white image (about:blank)")
print("Running inference...\n")


# Run the model on the prompt plus the blank screenshot.
raw_output = run_inference(prompt, blank_image)


print(f"Raw model output:\n{raw_output}\n")


# The parsed result is read with the keys "thought" and "action".
parsed = parse_thought_and_action(raw_output)
print(f"Thought: {parsed['thought']}")
print(f"Action:  {parsed['action']}")


# Turn the action string into a structured dict (type plus arguments).
action_details = parse_action_details(parsed["action"])
print(f"Parsed:  {action_details}")



Source_link

READ ALSO

Apple Is Finally Rebuilding Siri From the Ground Up. But Will It Be Any Good This Time?

Wristband enables wearers to control a robotic hand with their own movements | MIT News

Related Posts

Apple Is Finally Rebuilding Siri From the Ground Up. But Will It Be Any Good This Time?
Al, Analytics and Automation

Apple Is Finally Rebuilding Siri From the Ground Up. But Will It Be Any Good This Time?

March 25, 2026
Wristband enables wearers to control a robotic hand with their own movements | MIT News
Al, Analytics and Automation

Wristband enables wearers to control a robotic hand with their own movements | MIT News

March 25, 2026
NVIDIA AI Introduces PivotRL: A New AI Framework Achieving High Agentic Accuracy With 4x Fewer Rollout Turns Efficiently
Al, Analytics and Automation

NVIDIA AI Introduces PivotRL: A New AI Framework Achieving High Agentic Accuracy With 4x Fewer Rollout Turns Efficiently

March 25, 2026
On algorithms, life, and learning | MIT News
Al, Analytics and Automation

On algorithms, life, and learning | MIT News

March 25, 2026
Paged Attention in Large Language Models LLMs
Al, Analytics and Automation

Paged Attention in Large Language Models LLMs

March 24, 2026
How to create “humble” AI | MIT News
Al, Analytics and Automation

How to create “humble” AI | MIT News

March 24, 2026
Next Post
T-Mobile customers have a week to sign up for a free year of MLB.TV

T-Mobile customers have a week to sign up for a free year of MLB.TV

POPULAR NEWS

Trump ends trade talks with Canada over a digital services tax

Trump ends trade talks with Canada over a digital services tax

June 28, 2025
Communication Effectiveness Skills For Business Leaders

Communication Effectiveness Skills For Business Leaders

June 10, 2025
15 Trending Songs on TikTok in 2025 (+ How to Use Them)

15 Trending Songs on TikTok in 2025 (+ How to Use Them)

June 18, 2025
App Development Cost in Singapore: Pricing Breakdown & Insights

App Development Cost in Singapore: Pricing Breakdown & Insights

June 22, 2025
Comparing the Top 7 Large Language Models LLMs/Systems for Coding in 2025

Comparing the Top 7 Large Language Models LLMs/Systems for Coding in 2025

November 4, 2025

EDITOR'S PICK

How to Print Fun Custom Canva T-Shirts (+ Canva T-Shirt Templates)

How to Print Fun Custom Canva T-Shirts (+ Canva T-Shirt Templates)

June 5, 2025
Young Republicans group chat: Why the racist, sexist, antisemitic messages matter.

Young Republicans group chat: Why the racist, sexist, antisemitic messages matter.

October 25, 2025
Gen Z and the Future of Loyalty in 2026

Gen Z and the Future of Loyalty in 2026

March 4, 2026
Nvidia CEO pushes back against report that his company’s $100B OpenAI investment has stalled

Nvidia CEO pushes back against report that his company’s $100B OpenAI investment has stalled

January 31, 2026

About

We bring you the best Premium WordPress Themes that are perfect for news, magazine, personal blog, etc. Check our landing page for details.

Follow us

Categories

  • Account Based Marketing
  • Ad Management
  • Al, Analytics and Automation
  • Brand Management
  • Channel Marketing
  • Digital Marketing
  • Direct Marketing
  • Event Management
  • Google Marketing
  • Marketing Attribution and Consulting
  • Marketing Automation
  • Mobile Marketing
  • PR Solutions
  • Social Media Management
  • Technology And Software
  • Uncategorized

Recent Posts

  • T-Mobile customers have a week to sign up for a free year of MLB.TV
  • How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction
  • Think Of Positioning Your Organization’s Brand Like Raising A Child
  • Forex Trading App Development UAE
  • About Us
  • Disclaimer
  • Contact Us
  • Privacy Policy
No Result
View All Result
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions