• About Us
  • Disclaimer
  • Contact Us
  • Privacy Policy
Thursday, June 25, 2026
mGrowTech
No Result
View All Result
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions
No Result
View All Result
mGrowTech
No Result
View All Result
Home AI, Analytics and Automation

How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

Josh by Josh
March 26, 2026
in AI, Analytics and Automation
0
How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction


def parse_click_coords(action_str):
    """
    Extract normalised (x, y) coordinates from a click action string.

    e.g., 'click(0.45, 0.32)' -> (0.45, 0.32)

    Args:
        action_str: Action text; the ``click(x, y)`` call may appear
            anywhere inside it.

    Returns:
        A ``(x, y)`` tuple of floats, or None if the string contains no
        well-formed click call (including clicks whose arguments are not
        valid decimal numbers).
    """
    # Each coordinate must be a single non-negative decimal: "1", "1.",
    # "0.5" or ".5". A loose class such as [\d.]+ would also accept "1.2.3"
    # or ".", which float() then rejects with ValueError.
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    match = re.search(
        r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))




def parse_action_details(action_str):
    """
    Parse a MolmoWeb action string into a structured dict.

    Leading and trailing whitespace is ignored and the action must start
    at the beginning of the string; anything after the closing parenthesis
    is ignored.

    Returns one of:
        {"type": "click", "x": 0.45, "y": 0.32}
        {"type": "goto", "url": "https://..."}
        {"type": "type", "text": "query text"}
        {"type": "scroll", "direction": "down"}
        {"type": "press", "key": "Enter"}
        {"type": "send_msg", "message": "The answer is ..."}
        {"type": "new_tab" | "go_back" | "switch_tab"}  (plus "tab": int
            when a numeric argument is given)
        {"type": "unknown", "raw": "..."}  for anything else, including
            clicks whose coordinates are not valid decimal numbers.
    """
    action_str = action_str.strip()

    # Strict decimal ("1", "1.", "0.5", ".5"). A loose [\d.]+ would accept
    # "1.2.3" and make float() raise instead of falling through to "unknown".
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    m = re.match(
        r"click\(\s*" + number + r"\s*,\s*" + number + r"\s*\)",
        action_str,
    )
    if m:
        return {"type": "click", "x": float(m.group(1)), "y": float(m.group(2))}

    m = re.match(r'scroll\(\s*["\']?(up|down)["\']?\s*\)', action_str)
    if m:
        return {"type": "scroll", "direction": m.group(1)}

    # Actions taking one quoted string argument: (action name, result key,
    # regex flags). Only send_msg lets the argument span multiple lines.
    quoted_actions = (
        ("goto", "url", 0),
        ("type", "text", 0),
        ("press", "key", 0),
        ("send_msg", "message", re.DOTALL),
    )
    for name, key, flags in quoted_actions:
        m = re.match(name + r'\(\s*["\'](.+?)["\']\s*\)', action_str, flags)
        if m:
            return {"type": name, key: m.group(1)}

    # Tab/navigation actions with an optional numeric tab index.
    m = re.match(r'(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)', action_str)
    if m:
        result = {"type": m.group(1)}
        if m.group(2):
            result["tab"] = int(m.group(2))
        return result

    return {"type": "unknown", "raw": action_str}




def visualise_click(image, action_str, title="MolmoWeb Prediction"):
    """
    Show the screenshot with the predicted action overlaid.

    If ``action_str`` contains a click, its normalised coordinates are
    scaled by the image width and height and marked with a red circle,
    crosshair and label. Otherwise the raw action text is shown in a blue
    banner near the bottom of the axes.

    Args:
        image: Screenshot to display; must expose ``.size`` as
            ``(width, height)`` and be accepted by ``ax.imshow``.
        action_str: Action string predicted by the model.
        title: Title shown above the figure.
    """
    click_point = parse_click_coords(action_str)

    _, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.imshow(image)
    ax.set_title(title, fontsize=14)

    if not click_point:
        # Non-click action: just report it as text over the screenshot.
        banner_style = dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8)
        ax.text(
            0.5, 0.02, f"Action: {action_str}", transform=ax.transAxes,
            fontsize=12, ha="center", color="white",
            bbox=banner_style,
        )
    else:
        x_norm, y_norm = click_point
        width, height = image.size
        # Normalised -> pixel coordinates.
        x_px = x_norm * width
        y_px = y_norm * height

        marker = patches.Circle(
            (x_px, y_px), radius=18, linewidth=3,
            edgecolor="red", facecolor="none"
        )
        ax.add_patch(marker)
        ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)

        label_box = dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.8)
        arrow_style = dict(arrowstyle="->", color="red", lw=2)
        ax.annotate(
            f"click({x_norm:.3f}, {y_norm:.3f})",
            (x_px, y_px), xytext=(x_px + 25, y_px - 25),
            fontsize=11, color="white",
            bbox=label_box,
            arrowprops=arrow_style,
        )

    ax.axis("off")
    plt.tight_layout()
    plt.show()




def download_image(url, size=(1280, 720)):
    """Download an image from a URL and resize to browser viewport dimensions.

    Args:
        url: Address of the image to fetch.
        size: Target ``(width, height)`` in pixels; the image is resized to
            exactly this size without preserving aspect ratio.

    Returns:
        The downloaded image converted to RGB and resized with LANCZOS
        resampling.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=15)
    # Fail on HTTP errors here; otherwise the error page body would be handed
    # to Image.open and surface as a misleading image-decoding failure.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img.resize(size, Image.LANCZOS)




def create_synthetic_webpage(title="Example Page", elements=None):
    """
    Create a synthetic webpage screenshot for testing.

    The result is a 1280x720 white RGB image with a grey browser bar
    (three round window buttons and an address box showing
    "https://www.example.com"), the page title, and the requested elements.

    Args:
        title: Text drawn as the page heading at (50, 70).
        elements: Optional list of dicts of the form
            {"type": "button"|"input"|"text"|"link", "text": str,
             "pos": (x, y)}, where "pos" is the top-left corner in pixels.
            Elements with any other "type" are ignored.

    Returns:
        The rendered image.
    """
    img = Image.new("RGB", (1280, 720), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Browser chrome: toolbar strip, address box and its URL text.
    draw.rectangle([0, 0, 1280, 50], fill=(240, 240, 240))
    draw.rectangle([180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white")
    draw.text((200, 16), "https://www.example.com", fill=(100, 100, 100))

    # Three round window-control buttons at the left of the toolbar.
    for cx in [30, 60, 90]:
        draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

    draw.text((50, 70), title, fill="black")

    for el in elements or []:
        x, y = el["pos"]
        if el["type"] == "button":
            # Filled blue rectangle with a white label.
            draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
            draw.text((x + 10, y + 8), el["text"], fill="white")
        elif el["type"] == "input":
            # Outlined box with grey placeholder-style text.
            draw.rectangle([x, y, x + 300, y + 35], outline=(180, 180, 180), width=2)
            draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
        elif el["type"] == "text":
            draw.text((x, y), el["text"], fill="black")
        elif el["type"] == "link":
            draw.text((x, y), el["text"], fill=(66, 133, 244))

    return img




# Confirmation message printed after the helper definitions above.
print("Helper functions defined successfully.")




# ---------------------------------------------------------------------------
# Section 5: single-step inference starting from a blank page.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70)
print("SECTION 5: Single-step inference - blank page (cold start)")
print("=" * 70)
print("The agent starts at about:blank and must decide its first action.\n")


# Plain white 1280x720 image standing in for a screenshot of about:blank.
blank_image = Image.new("RGB", (1280, 720), color="white")


task = "Go to arxiv.org and find the latest paper about Molmo from Ai2"


# build_prompt is defined outside this section.
# NOTE(review): presumably it formats the task and page state into the
# model's expected prompt - confirm against its definition.
prompt = build_prompt(
   task_description=task,
   page_url="about:blank",
   page_index=0,
)


print(f"Task: {task}")
print("Screenshot: blank white image (about:blank)")
print("Running inference...\n")


# run_inference is defined outside this section; its result is printed below
# as the raw model output.
raw_output = run_inference(prompt, blank_image)


print(f"Raw model output:\n{raw_output}\n")


# parse_thought_and_action is defined outside this section; the result is
# indexed with the keys 'thought' and 'action' below.
parsed = parse_thought_and_action(raw_output)
print(f"Thought: {parsed['thought']}")
print(f"Action:  {parsed['action']}")


# Turn the action string into a structured dict (type plus arguments).
action_details = parse_action_details(parsed["action"])
print(f"Parsed:  {action_details}")



Source_link

READ ALSO

Baidu Releases Unlimited OCR, a 3B Model That Keeps the KV Cache Flat for Long-Document Parsing

Context Windows Are Not Memory: What AI Agent Developers Need to Understand

Related Posts

Baidu Releases Unlimited OCR, a 3B Model That Keeps the KV Cache Flat for Long-Document Parsing
Al, Analytics and Automation

Baidu Releases Unlimited OCR, a 3B Model That Keeps the KV Cache Flat for Long-Document Parsing

June 25, 2026
Al, Analytics and Automation

Context Windows Are Not Memory: What AI Agent Developers Need to Understand

June 25, 2026
Using Graphify and NetworkX to Map Python Codebase Structure with God Nodes, Communities, and Architecture Visualizations
Al, Analytics and Automation

Using Graphify and NetworkX to Map Python Codebase Structure with God Nodes, Communities, and Architecture Visualizations

June 24, 2026
Audio Data Collection & Annotation: Challenges and Best Practices
Al, Analytics and Automation

Audio Data Collection & Annotation: Challenges and Best Practices

June 24, 2026
Exploring the societal impacts of AI | MIT News
Al, Analytics and Automation

Exploring the societal impacts of AI | MIT News

June 24, 2026
Datalab Releases lift: A 9B Open-Weights Vision Model That Extracts Structured JSON From PDFs Using Schemas
Al, Analytics and Automation

Datalab Releases lift: A 9B Open-Weights Vision Model That Extracts Structured JSON From PDFs Using Schemas

June 23, 2026
Next Post
T-Mobile customers have a week to sign up for a free year of MLB.TV

T-Mobile customers have a week to sign up for a free year of MLB.TV

POPULAR NEWS

Trump ends trade talks with Canada over a digital services tax

Trump ends trade talks with Canada over a digital services tax

June 28, 2025
15 Trending Songs on TikTok in 2025 (+ How to Use Them)

15 Trending Songs on TikTok in 2025 (+ How to Use Them)

June 18, 2025
Communication Effectiveness Skills For Business Leaders

Communication Effectiveness Skills For Business Leaders

June 10, 2025
App Development Cost in Singapore: Pricing Breakdown & Insights

App Development Cost in Singapore: Pricing Breakdown & Insights

June 22, 2025
Comparing the Top 7 Large Language Models LLMs/Systems for Coding in 2025

Comparing the Top 7 Large Language Models LLMs/Systems for Coding in 2025

November 4, 2025

EDITOR'S PICK

WTO Says AI Could Lift Global Commerce by ~40%—If the World Can Share the Spoils

WTO Says AI Could Lift Global Commerce by ~40%—If the World Can Share the Spoils

September 17, 2025
Electronics Weekly Announces Two New Virtual Events 2026

Electronics Weekly Announces Two New Virtual Events 2026

January 28, 2026
A Guide for Using Social Media to Pay Off Your Debt Faster

A Guide for Using Social Media to Pay Off Your Debt Faster

June 16, 2025
The Future of Content Marketing: Embracing Liquid Content in a Dynamic Landscape

The Future of Content Marketing: Embracing Liquid Content in a Dynamic Landscape

March 20, 2026

About

We bring you the best premium WordPress themes that are perfect for news, magazine, and personal blog sites. Check our landing page for details.

Follow us

Categories

  • Account Based Marketing
  • Ad Management
  • Al, Analytics and Automation
  • Brand Management
  • Channel Marketing
  • Digital Marketing
  • Direct Marketing
  • Event Management
  • Google Marketing
  • Marketing Attribution and Consulting
  • Marketing Automation
  • Mobile Marketing
  • PR Solutions
  • Social Media Management
  • Technology And Software
  • Uncategorized

Recent Posts

  • AI Is A Platform Shift, Not A Brand Advantage
  • Cost to Develop an App Like Herfy in 2026
  • Yahoo’s Anna Nicholson on Mentorship, Betting on You
  • Read our white paper on a pragmatic approach to AI governance in America.
  • About Us
  • Disclaimer
  • Contact Us
  • Privacy Policy
No Result
View All Result
  • Technology And Software
    • Account Based Marketing
    • Channel Marketing
    • Marketing Automation
      • Al, Analytics and Automation
      • Ad Management
  • Digital Marketing
    • Social Media Management
    • Google Marketing
  • Direct Marketing
    • Brand Management
    • Marketing Attribution and Consulting
  • Mobile Marketing
  • Event Management
  • PR Solutions