Skip to content

Implement OmniMCP for Claude computer control #946

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
3 changes: 3 additions & 0 deletions deploy/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=
10 changes: 10 additions & 0 deletions deploy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
```
# First time setup
cd deploy
uv venv
source .venv/bin/activate
uv pip install -e .

# Subsequent usage
python deploy/models/omniparser/deploy.py start
```
20 changes: 20 additions & 0 deletions deploy/deploy/models/omniparser/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
__pycache__
*.pyc
*.pyo
*.pyd
.Python
env
pip-log.txt
pip-delete-this-directory.txt
.tox
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
.pytest_cache
.env
.venv
.DS_Store
59 changes: 59 additions & 0 deletions deploy/deploy/models/omniparser/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Image that serves the OmniParser HTTP API on port 8000 using CUDA.
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

# System packages: git-lfs (pulls in git) for the repository checkout, wget for
# the Miniconda installer, libgl1/libglib2.0-0 as shared libraries for OpenCV.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    git-lfs \
    wget \
    libgl1 \
    libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install

# Install Miniconda into /opt/conda and expose it on PATH.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
    bash miniconda.sh -b -p /opt/conda && \
    rm miniconda.sh
ENV PATH="/opt/conda/bin:$PATH"

# Create the "omni" environment. `-y` is required: a docker build has no
# interactive stdin, so conda's confirmation prompt cannot be answered.
RUN conda create -y -n omni python=3.12 && \
    echo "source activate omni" > ~/.bashrc
ENV CONDA_DEFAULT_ENV=omni
# Put the environment's bin directory first so `python3` in CMD resolves to it.
ENV PATH="/opt/conda/envs/omni/bin:$PATH"

WORKDIR /app

# Fetch the OmniParser sources, including any LFS-tracked files.
RUN git clone https://github.com/microsoft/OmniParser.git && \
    cd OmniParser && \
    git lfs install && \
    git lfs pull

WORKDIR /app/OmniParser

# Python dependencies. OpenCV is pinned to the headless build before the
# project's requirements are installed; the server additionally needs
# huggingface_hub, fastapi and uvicorn.
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    pip uninstall -y opencv-python opencv-python-headless && \
    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
    pip install -r requirements.txt && \
    pip install huggingface_hub fastapi uvicorn

# Download V2 weights
# The caption weights are downloaded as weights/icon_caption and then renamed
# to weights/icon_caption_florence, the path passed to the server in CMD.
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    mkdir -p /app/OmniParser/weights && \
    cd /app/OmniParser && \
    rm -rf weights/icon_detect weights/icon_caption weights/icon_caption_florence && \
    for folder in icon_caption icon_detect; do \
        huggingface-cli download microsoft/OmniParser-v2.0 --local-dir weights --repo-type model --include "$folder/*"; \
    done && \
    mv weights/icon_caption weights/icon_caption_florence

# Pre-download OCR models during build
# Instantiating the readers here triggers their model downloads at build time
# so the running container does not need to fetch them on first request.
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    cd /app/OmniParser && \
    python3 -c "import easyocr; reader = easyocr.Reader(['en']); print('Downloaded EasyOCR model')" && \
    python3 -c "from paddleocr import PaddleOCR; ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False, show_log=False); print('Downloaded PaddleOCR model')"

# Start the OmniParser server on all interfaces, port 8000.
# NOTE(review): the flag is spelled "--BOX_TRESHOLD" here; presumably this
# matches the upstream server script's argument name - confirm before renaming.
CMD ["python3", "/app/OmniParser/omnitool/omniparserserver/omniparserserver.py", \
    "--som_model_path", "/app/OmniParser/weights/icon_detect/model.pt", \
    "--caption_model_path", "/app/OmniParser/weights/icon_caption_florence", \
    "--device", "cuda", \
    "--BOX_TRESHOLD", "0.05", \
    "--host", "0.0.0.0", \
    "--port", "8000"]
128 changes: 128 additions & 0 deletions deploy/deploy/models/omniparser/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Client module for interacting with the OmniParser server."""

import base64
import fire
import requests

from loguru import logger
from PIL import Image, ImageDraw


def image_to_base64(image_path: str) -> str:
    """Read an image file and return its bytes as base64 text.

    Args:
        image_path: Location of the image file on disk.

    Returns:
        The file's raw bytes, base64-encoded and decoded as a UTF-8 string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def plot_results(
    original_image_path: str,
    som_image_base64: str,
    parsed_content_list: list[dict],
) -> None:
    """Draw the parsed bounding boxes and labels on the original image.

    The annotated image is displayed with the default image viewer; nothing
    is written back to ``original_image_path``.

    Args:
        original_image_path: Path to the original image.
        som_image_base64: Base64 encoded SOM image. Currently unused; kept so
            existing callers do not break.
        parsed_content_list: Parsed elements. Each item must provide ``bbox``
            as ``[x1, y1, x2, y2]`` in coordinates normalized to the image
            size, and may provide ``content`` as the label to draw.
    """
    # The context manager guarantees the underlying file is released even if
    # drawing fails part-way through.
    with Image.open(original_image_path) as image:
        width, height = image.size
        draw = ImageDraw.Draw(image)

        for item in parsed_content_list:
            # Scale normalized coordinates to pixel coordinates.
            x1, y1, x2, y2 = item["bbox"]
            left = int(x1 * width)
            top = int(y1 * height)
            right = int(x2 * width)
            bottom = int(y2 * height)

            draw.rectangle([(left, top), (right, bottom)], outline="red", width=2)

            # A missing or None label must not abort the whole plot; the box
            # is still drawn and only the text is skipped.
            content = item.get("content")
            label = "" if content is None else str(content)
            if not label:
                continue

            # White background, padded by 2px, keeps the red text readable.
            text_bbox = draw.textbbox((left, top), label)
            draw.rectangle(
                [text_bbox[0] - 2, text_bbox[1] - 2, text_bbox[2] + 2, text_bbox[3] + 2],
                fill="white",
            )
            draw.text((left, top), label, fill="red")

        image.show()


def parse_image(
    image_path: str,
    server_url: str,
    timeout: float = 60.0,
) -> None:
    """Parse an image using the OmniParser server and display the result.

    The server's ``/probe/`` endpoint is checked first, then the image is
    posted to ``/parse/``. The returned elements are plotted on the original
    image and the latency reported by the server is logged. Request and
    response errors are logged rather than raised.

    Args:
        image_path: Path to the image file.
        server_url: URL of the OmniParser server (a trailing slash is ignored).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout an unresponsive server would block forever.

    Raises:
        OSError: If the image file cannot be read; the file is read before
            the error-handling block is entered.
    """
    # Remove trailing slash from server_url if present
    server_url = server_url.rstrip("/")

    # Convert image to base64
    base64_image = image_to_base64(image_path)

    # Prepare request
    url = f"{server_url}/parse/"
    payload = {"base64_image": base64_image}

    try:
        # First, check if the server is available
        probe_url = f"{server_url}/probe/"
        probe_response = requests.get(probe_url, timeout=timeout)
        probe_response.raise_for_status()
        logger.info("Server is available")

        # Make request to API
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()

        # Parse response
        result = response.json()
        som_image_base64 = result["som_image_base64"]
        parsed_content_list = result["parsed_content_list"]

        # Plot results
        plot_results(image_path, som_image_base64, parsed_content_list)

        # Print latency
        logger.info(f"API Latency: {result['latency']:.2f} seconds")

    except requests.exceptions.ConnectionError:
        logger.error(f"Error: Could not connect to server at {server_url}")
        logger.error("Please check if the server is running and the URL is correct")
    except requests.exceptions.RequestException as e:
        # Also covers timeouts and non-2xx responses from raise_for_status().
        logger.error(f"Error making request to API: {e}")
    except Exception as e:
        # CLI boundary: report unexpected failures (e.g. a malformed response)
        # instead of surfacing a traceback.
        logger.error(f"Error: {e}")


def main() -> None:
    """Expose ``parse_image`` as a command-line interface via Fire."""
    fire.Fire(component=parse_image)


# Only start the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading
Loading