Skip to content

Commit 6aa145a

Browse files
authored
Merge pull request #163 from bytedance/chore/publish-python-lib
feat: init ui-tars python library
2 parents 64caa72 + db1407b commit 6aa145a

File tree

14 files changed

+959
-161
lines changed

14 files changed

+959
-161
lines changed

.github/workflows/test.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name: Test
2+
3+
on:
4+
pull_request:
5+
types: [opened, synchronize, reopened]
6+
branches:
7+
- "**"
8+
push:
9+
branches:
10+
- "main"
11+
12+
jobs:
13+
test_ui_tars:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v4
17+
- name: Set up Python
18+
uses: actions/setup-python@v5
19+
with:
20+
python-version: "3.11"
21+
- name: Install dependencies
22+
working-directory: codes
23+
run: |
24+
python -m pip install --upgrade pip uv
25+
uv sync
26+
- name: Run unit tests
27+
working-directory: codes
28+
run: |
29+
make test

codes/.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Python-generated files
2+
__pycache__/
3+
.pytest_cache/
4+
*.py[oc]
5+
build/
6+
dist/
7+
wheels/
8+
*.egg-info
9+
10+
# Virtual environments
11+
.venv
12+
.DS_Store

codes/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.10

codes/README.md

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# ui-tars
2+
3+
A Python package for parsing LLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing.
4+
5+
---
6+
7+
## Introduction
8+
9+
`ui-tars` is a Python package for parsing LLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing.
10+
11+
- Supports multiple LLM output formats (e.g., Qwen, Doubao)
12+
- Automatically handles coordinate scaling and format conversion
13+
- One-click generation of pyautogui automation scripts
14+
15+
---
16+
17+
## Quick Start
18+
19+
### Installation
20+
21+
```bash
22+
pip install ui-tars
23+
# or
24+
uv pip install ui-tars
25+
```
26+
27+
### Parse LLM output into structured actions
28+
29+
```python
30+
from ui_tars.action_parser import parse_action_to_structure_output
31+
32+
response = "Thought: Click the button\nAction: click(start_box='(0.1,0.2,0.1,0.2)')"
33+
original_image_width, original_image_height = 1920, 1080
34+
parsed_dict = parse_action_to_structure_output(
35+
response,
36+
factor=1000,
37+
origin_resized_height=original_image_height,
38+
origin_resized_width=original_image_width,
39+
model_type="doubao"
40+
)
41+
print(parsed_dict)
42+
```
43+
44+
### Generate pyautogui automation script
45+
46+
```python
47+
from ui_tars.action_parser import parsing_response_to_pyautogui_code
48+
49+
pyautogui_code = parsing_response_to_pyautogui_code(parsed_dict, original_image_height, original_image_width)
50+
print(pyautogui_code)
51+
```
52+
53+
### Visualize coordinates on the image (optional)
54+
55+
```python
56+
from PIL import Image, ImageDraw
57+
import numpy as np
58+
import matplotlib.pyplot as plt
59+
60+
image = Image.open("your_image_path.png")
61+
start_box = parsed_dict[0]["action_inputs"]["start_box"]
62+
coordinates = eval(start_box)
63+
x1 = int(coordinates[0] * original_image_width)
64+
y1 = int(coordinates[1] * original_image_height)
65+
draw = ImageDraw.Draw(image)
66+
radius = 5
67+
draw.ellipse((x1 - radius, y1 - radius, x1 + radius, y1 + radius), fill="red", outline="red")
68+
plt.imshow(np.array(image))
69+
plt.axis("off")
70+
plt.show()
71+
```
72+
73+
---
74+
75+
## API Documentation
76+
77+
### parse_action_to_structure_output
78+
79+
```python
80+
def parse_action_to_structure_output(
81+
text: str,
82+
factor: int,
83+
origin_resized_height: int,
84+
origin_resized_width: int,
85+
model_type: str = "qwen25vl",
86+
max_pixels: int = 16384 * 28 * 28,
87+
min_pixels: int = 100 * 28 * 28
88+
) -> list[dict]:
89+
...
90+
```
91+
92+
**Description:**
93+
Parses LLM output action instructions into structured dictionaries, automatically handling coordinate scaling and box/point format conversion.
94+
95+
**Parameters:**
96+
- `text`: The LLM output string
97+
- `factor`: Scaling factor
98+
- `origin_resized_height`/`origin_resized_width`: Original image height/width
99+
- `model_type`: Model type (e.g., "qwen25vl", "doubao")
100+
- `max_pixels`/`min_pixels`: Upper/lower limits on the image's pixel count
101+
102+
**Returns:**
103+
A list of structured actions, each as a dict with fields like `action_type`, `action_inputs`, `thought`, etc.
104+
105+
---
106+
107+
### parsing_response_to_pyautogui_code
108+
109+
```python
110+
def parsing_response_to_pyautogui_code(
111+
responses: dict | list[dict],
112+
image_height: int,
113+
image_width: int,
114+
input_swap: bool = True
115+
) -> str:
116+
...
117+
```
118+
119+
**Description:**
120+
Converts structured actions into a pyautogui script string, supporting click, type, hotkey, drag, scroll, and more.
121+
122+
**Parameters:**
123+
- `responses`: Structured actions (dict or list of dicts)
124+
- `image_height`/`image_width`: Image height/width
125+
- `input_swap`: Whether to use clipboard paste for typing (default True)
126+
127+
**Returns:**
128+
A pyautogui script string, ready for automation execution.
129+
130+
---
131+
132+
## Contribution
133+
134+
Contributions, issues, and suggestions are welcome!
135+
136+
---
137+
138+
## License
139+
140+
Apache-2.0 License

codes/action_parser.py

Lines changed: 0 additions & 13 deletions
This file was deleted.

codes/makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
test:
2+
uv run python3 -m unittest discover tests '*_test.py'

codes/prompts.py

Lines changed: 0 additions & 59 deletions
This file was deleted.

codes/pyproject.toml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[project]
2+
name = "ui-tars"
3+
version = "0.1.3"
4+
description = "Parsing LLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing."
5+
readme = "README.md"
6+
authors = [
7+
{ name = "liangshihao.0828", email = "[email protected]" },
8+
{ name = "jinxin001", email = "[email protected]" }
9+
]
10+
requires-python = ">=3.10,<4.0"
11+
dependencies = []
12+
13+
[build-system]
14+
requires = ["hatchling"]
15+
build-backend = "hatchling.build"
16+
17+
[tool.hatch.envs.test.scripts]
18+
test = "python -m unittest discover tests '*_test.py'"
19+
publish = "python -m unittest discover tests '*_test.py' && uv build && uv publish"
20+
21+
[tool.black]
22+
line-length = 88
23+
target-version = ['py310']
24+
include = '\.pyi?$'
25+
26+
[tool.hatch.build]
27+
include = [
28+
"ui_tars/**/*.py",
29+
"!ui_tars/**/tests/*.py",
30+
"!ui_tars/**/tests.py"
31+
]
32+
33+
[tool.uv]
34+
dev-dependencies = [
35+
"matplotlib>=3.10.3",
36+
"pillow>=11.2.1",
37+
]

codes/tests/action_parser_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import unittest
2+
3+
import os
4+
import sys
5+
6+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7+
8+
from ui_tars.action_parser import (
9+
parsing_response_to_pyautogui_code,
10+
parse_action,
11+
parse_action_to_structure_output,
12+
)
13+
14+
15+
class TestActionParser(unittest.TestCase):
16+
def test_parse_action(self):
17+
action_str = "click(start_box='(10,20,30,40)')"
18+
result = parse_action(action_str)
19+
self.assertEqual(result['function'], 'click')
20+
self.assertEqual(result['args']['start_box'], '(10,20,30,40)')
21+
22+
def test_parse_action_to_structure_output(self):
23+
text = "Thought: test\nAction: click(start_box='(10,20,30,40)')"
24+
actions = parse_action_to_structure_output(
25+
text, factor=28, origin_resized_height=224, origin_resized_width=224
26+
)
27+
self.assertEqual(actions[0]['action_type'], 'click')
28+
self.assertIn('start_box', actions[0]['action_inputs'])
29+
30+
def test_parsing_response_to_pyautogui_code(self):
31+
responses = {"action_type": "hotkey", "action_inputs": {"hotkey": "ctrl v"}}
32+
code = parsing_response_to_pyautogui_code(responses, 224, 224)
33+
self.assertIn('pyautogui.hotkey', code)
34+
35+
36+
if __name__ == '__main__':
37+
unittest.main()

0 commit comments

Comments
 (0)