Skip to content

gh-131507: Refactor screen and cursor position calculations #131547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 49 additions & 75 deletions Lib/_pyrepl/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@

from contextlib import contextmanager
from dataclasses import dataclass, field, fields
import unicodedata
from _colorize import can_colorize, ANSIColors


from . import commands, console, input
from .utils import wlen, unbracket, str_width
from .utils import wlen, unbracket, disp_str
from .trace import trace


Expand All @@ -39,36 +38,6 @@
from .types import Callback, SimpleContextManager, KeySpec, CommandName


def disp_str(buffer: str) -> tuple[str, list[int]]:
"""disp_str(buffer:string) -> (string, [int])

Return the string that should be the printed representation of
|buffer| and a list detailing where the characters of |buffer|
get used up. E.g.:

>>> disp_str(chr(3))
('^C', [1, 0])

"""
b: list[int] = []
s: list[str] = []
for c in buffer:
if c == '\x1a':
s.append(c)
b.append(2)
elif ord(c) < 128:
s.append(c)
b.append(1)
elif unicodedata.category(c).startswith("C"):
c = r"\u%04x" % ord(c)
s.append(c)
b.append(len(c))
else:
s.append(c)
b.append(str_width(c))
return "".join(s), b


# syntax classes:

SYNTAX_WHITESPACE, SYNTAX_WORD, SYNTAX_SYMBOL = range(3)
Expand Down Expand Up @@ -347,14 +316,12 @@ def calc_screen(self) -> list[str]:
pos -= offset

prompt_from_cache = (offset and self.buffer[offset - 1] != "\n")

lines = "".join(self.buffer[offset:]).split("\n")

cursor_found = False
lines_beyond_cursor = 0
for ln, line in enumerate(lines, num_common_lines):
ll = len(line)
if 0 <= pos <= ll:
line_len = len(line)
if 0 <= pos <= line_len:
self.lxy = pos, ln
cursor_found = True
elif cursor_found:
Expand All @@ -368,34 +335,34 @@ def calc_screen(self) -> list[str]:
prompt_from_cache = False
prompt = ""
else:
prompt = self.get_prompt(ln, ll >= pos >= 0)
prompt = self.get_prompt(ln, line_len >= pos >= 0)
while "\n" in prompt:
pre_prompt, _, prompt = prompt.partition("\n")
last_refresh_line_end_offsets.append(offset)
screen.append(pre_prompt)
screeninfo.append((0, []))
pos -= ll + 1
prompt, lp = self.process_prompt(prompt)
l, l2 = disp_str(line)
wrapcount = (wlen(l) + lp) // self.console.width
if wrapcount == 0:
offset += ll + 1 # Takes all of the line plus the newline
pos -= line_len + 1
prompt, prompt_len = self.process_prompt(prompt)
chars, char_widths = disp_str(line)
wrapcount = (sum(char_widths) + prompt_len) // self.console.width
trace("wrapcount = {wrapcount}", wrapcount=wrapcount)
if wrapcount == 0 or not char_widths:
offset += line_len + 1 # Takes all of the line plus the newline
last_refresh_line_end_offsets.append(offset)
screen.append(prompt + l)
screeninfo.append((lp, l2))
screen.append(prompt + "".join(chars))
screeninfo.append((prompt_len, char_widths))
else:
i = 0
while l:
prelen = lp if i == 0 else 0
pre = prompt
prelen = prompt_len
for wrap in range(wrapcount + 1):
index_to_wrap_before = 0
column = 0
for character_width in l2:
if column + character_width >= self.console.width - prelen:
for char_width in char_widths:
if column + char_width + prelen >= self.console.width:
break
index_to_wrap_before += 1
column += character_width
pre = prompt if i == 0 else ""
if len(l) > index_to_wrap_before:
column += char_width
if len(chars) > index_to_wrap_before:
offset += index_to_wrap_before
post = "\\"
after = [1]
Expand All @@ -404,11 +371,14 @@ def calc_screen(self) -> list[str]:
post = ""
after = []
last_refresh_line_end_offsets.append(offset)
screen.append(pre + l[:index_to_wrap_before] + post)
screeninfo.append((prelen, l2[:index_to_wrap_before] + after))
l = l[index_to_wrap_before:]
l2 = l2[index_to_wrap_before:]
i += 1
render = pre + "".join(chars[:index_to_wrap_before]) + post
render_widths = char_widths[:index_to_wrap_before] + after
screen.append(render)
screeninfo.append((prelen, render_widths))
chars = chars[index_to_wrap_before:]
char_widths = char_widths[index_to_wrap_before:]
pre = ""
prelen = 0
self.screeninfo = screeninfo
self.cxy = self.pos2xy()
if self.msg:
Expand Down Expand Up @@ -537,9 +507,9 @@ def setpos_from_xy(self, x: int, y: int) -> None:
pos = 0
i = 0
while i < y:
prompt_len, character_widths = self.screeninfo[i]
offset = len(character_widths) - character_widths.count(0)
in_wrapped_line = prompt_len + sum(character_widths) >= self.console.width
prompt_len, char_widths = self.screeninfo[i]
offset = len(char_widths)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this breaking 0-width characters (like combining marks)?

Copy link
Contributor Author

@ambv ambv Mar 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this is fixing 0-width characters.

Edit: to explain more clearly. The code I removed looks like it was meant to help with 0-width characters, but the code was actually the wrong way round. The reason why was that originally the "0-width" was historically used in pyrepl to show characters like ^C as two-wide by returning [1, 0] from disp_str(). You can still see that in the docstring I edited. But either we made a change to undo that when originally merging pyrepl, or they changed it over the course of history so it didn't work anyway. So those 0-counts weren't doing anything productive anyway, but were only adding to our confusion. As proof, all zero-width characters are currently printed out as part of the Other unicodedata group:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Screenshot 2025-03-21 at 17 47 16

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, the screenshot represents the state before this PR, and this PR is not affecting it in any way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way I discovered all this was that I originally wanted disp_str() to emit syntax highlighting as separate zero-width characters and then it turned out that the removal of the .count(0) was necessary for the cursor positioning to work correctly. And even that was insufficient, since having zero-width characters in the stream was causing wrapping on narrow terminals to fall in infinite loops, there's an open issue about this (#126685).

This PR fixes #126685.

in_wrapped_line = prompt_len + sum(char_widths) >= self.console.width
if in_wrapped_line:
pos += offset - 1 # -1 cause backslash is not in buffer
else:
Expand All @@ -560,29 +530,33 @@ def setpos_from_xy(self, x: int, y: int) -> None:

def pos2xy(self) -> tuple[int, int]:
"""Return the x, y coordinates of position 'pos'."""
# this *is* incomprehensible, yes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😆

p, y = 0, 0
l2: list[int] = []

prompt_len, y = 0, 0
char_widths: list[int] = []
pos = self.pos
assert 0 <= pos <= len(self.buffer)

# optimize for the common case: typing at the end of the buffer
if pos == len(self.buffer) and len(self.screeninfo) > 0:
y = len(self.screeninfo) - 1
p, l2 = self.screeninfo[y]
return p + sum(l2) + l2.count(0), y
prompt_len, char_widths = self.screeninfo[y]
return prompt_len + sum(char_widths), y

for prompt_len, char_widths in self.screeninfo:
offset = len(char_widths)
in_wrapped_line = prompt_len + sum(char_widths) >= self.console.width
if in_wrapped_line:
offset -= 1 # need to remove line-wrapping backslash

for p, l2 in self.screeninfo:
l = len(l2) - l2.count(0)
in_wrapped_line = p + sum(l2) >= self.console.width
offset = l - 1 if in_wrapped_line else l # need to remove backslash
if offset >= pos:
break

if p + sum(l2) >= self.console.width:
pos -= l - 1 # -1 cause backslash is not in buffer
else:
pos -= l + 1 # +1 cause newline is in buffer
if not in_wrapped_line:
offset += 1 # there's a newline in buffer

pos -= offset
Comment on lines +554 to +557
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why these two are equivalent :(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is equivalent, but it's subtle. Note that even in the old version, the if test in line 580 is the same as what's under the variable in_wrapped_line.

In the new version, we're reusing the offset variable that already had -1 applied if in_wrapped_line. So we only need to handle the newline case, which we do.

There's less computation in the new version, so it reads better to me. It almost could be shortened further, but the break check is awkwardly in the middle there, so I kept it for equivalence.

y += 1
return p + sum(l2[:pos]), y
return prompt_len + sum(char_widths[:pos]), y

def insert(self, text: str | list[str]) -> None:
"""Insert 'text' at the insertion point."""
Expand Down
14 changes: 8 additions & 6 deletions Lib/_pyrepl/types.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from collections.abc import Callable, Iterator

Callback = Callable[[], object]
SimpleContextManager = Iterator[None]
KeySpec = str # like r"\C-c"
CommandName = str # like "interrupt"
EventTuple = tuple[CommandName, str]
Completer = Callable[[str, int], str | None]
type Callback = Callable[[], object]
type SimpleContextManager = Iterator[None]
type KeySpec = str # like r"\C-c"
type CommandName = str # like "interrupt"
type EventTuple = tuple[CommandName, str]
type Completer = Callable[[str, int], str | None]
type CharBuffer = list[str]
type CharWidths = list[int]
39 changes: 39 additions & 0 deletions Lib/_pyrepl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import unicodedata
import functools

from .types import CharBuffer, CharWidths
from .trace import trace

ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
Expand Down Expand Up @@ -36,3 +39,39 @@ def unbracket(s: str, including_content: bool = False) -> str:
if including_content:
return ZERO_WIDTH_BRACKET.sub("", s)
return s.translate(ZERO_WIDTH_TRANS)


def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
r"""Decompose the input buffer into a printable variant.

Returns a tuple of two lists:
- the first list is the input buffer, character by character;
- the second list is the visible width of each character in the input
buffer.

Examples:
>>> utils.disp_str("a = 9")
(['a', ' ', '=', ' ', '9'], [1, 1, 1, 1, 1])
"""
chars: CharBuffer = []
char_widths: CharWidths = []

if not buffer:
return chars, char_widths

for c in buffer:
if c == "\x1a": # CTRL-Z on Windows
chars.append(c)
char_widths.append(2)
elif ord(c) < 128:
chars.append(c)
char_widths.append(1)
elif unicodedata.category(c).startswith("C"):
c = r"\u%04x" % ord(c)
chars.append(c)
char_widths.append(len(c))
else:
chars.append(c)
char_widths.append(str_width(c))
trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
return chars, char_widths
Comment on lines +44 to +77
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check if you like something of this version with some improvements:

Suggested change
def disp_str(buffer: str) -> tuple[CharBuffer, CharWidths]:
r"""Decompose the input buffer into a printable variant.
Returns a tuple of two lists:
- the first list is the input buffer, character by character;
- the second list is the visible width of each character in the input
buffer.
Examples:
>>> utils.disp_str("a = 9")
(['a', ' ', '=', ' ', '9'], [1, 1, 1, 1, 1])
"""
chars: CharBuffer = []
char_widths: CharWidths = []
if not buffer:
return chars, char_widths
for c in buffer:
if c == "\x1a": # CTRL-Z on Windows
chars.append(c)
char_widths.append(2)
elif ord(c) < 128:
chars.append(c)
char_widths.append(1)
elif unicodedata.category(c).startswith("C"):
c = r"\u%04x" % ord(c)
chars.append(c)
char_widths.append(len(c))
else:
chars.append(c)
char_widths.append(str_width(c))
trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
return chars, char_widths
CharBuffer = List[str]
CharWidths = List[int]
CTRL_Z = "\x1a"
ASCII_MAX = 127
CTRL_Z_WIDTH = 2
ASCII_WIDTH = 1
@lru_cache(maxsize=1024)
def cached_category(c: str) -> str:
"""Cache unicodedata.category calls for better performance."""
return unicodedata.category(c)
def disp_str(buffer: str) -> Tuple[CharBuffer, CharWidths]:
r"""Decompose the input buffer into a printable variant.
Returns a tuple of two lists:
- the first list is the input buffer, character by character;
- the second list is the visible width of each character in the input buffer.
Examples:
>>> utils.disp_str("a = 9")
(['a', ' ', '=', ' ', '9'], [1, 1, 1, 1, 1])
"""
# Fast path for empty buffer
if not buffer:
return [], []
# Optimization for common case: pure ASCII text
if all(ord(c) < ASCII_MAX for c in buffer):
chars = list(buffer)
char_widths = [ASCII_WIDTH] * len(buffer)
trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
return chars, char_widths
chars: CharBuffer = []
char_widths: CharWidths = []
# Pre-allocate lists for better performance
chars = [None] * len(buffer)
char_widths = [0] * len(buffer)
for i, c in enumerate(buffer):
if c == CTRL_Z: # CTRL-Z on Windows
chars[i] = c
char_widths[i] = CTRL_Z_WIDTH
elif ord(c) < ASCII_MAX:
chars[i] = c
char_widths[i] = ASCII_WIDTH
elif cached_category(c).startswith("C"):
unicode_repr = f"\\u{ord(c):04x}"
chars[i] = unicode_repr
char_widths[i] = len(unicode_repr)
else:
chars[i] = c
try:
char_widths[i] = str_width(c)
except Exception:
# Fallback if str_width fails
char_widths[i] = 1
trace("disp_str({buffer}) = {s}, {b}", buffer=repr(buffer), s=chars, b=char_widths)
return chars, char_widths

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like caching character categories, will add this to the third PR in the sequence. The other optimization you added would have to be removed anyway when adding color codes.

Loading