Skip to content

Improve speed of textwrap.dedent #131791

Closed
Closed
@Marius-Juston

Description

@Marius-Juston

Feature or enhancement

Proposal:

Current code:

# Lines containing only spaces/tabs: normalized away before margin detection.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the run of leading spaces/tabs on each line that has real content.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    Entirely blank lines are normalized to a newline character.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.  Whitespace-only lines are blanked first so they never
    # shrink the margin.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Find the largest common whitespace between current line and previous
        # winner.  A mismatch is guaranteed within the shorter string, since
        # the prefix cases above were already excluded.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break

    # margin contains only [ \t], so it is regex-safe without escaping.
    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text

The following can speed up processing of large files by up to 4x:

def dedent_faster(text: str, only_whitespace: bool = True) -> str:
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.

    If `only_whitespace` is true (the default), only leading runs of
    spaces and tabs are considered.  Otherwise the longest leading text
    common to all non-blank lines is removed.

    Entirely blank lines are normalized to a newline character.
    """
    # Early return for empty input.
    if not text:
        return text

    lines = text.splitlines(True)

    # Compute the margin by folding the common prefix over ALL non-blank
    # lines.  (Using only the first two non-blank lines is incorrect: their
    # common prefix need not be common to later lines, e.g.
    # "    x\n    y\n  z\n".)
    margin = None
    for line in lines:
        # A line is "blank" if it contains only spaces/tabs (plus an
        # optional trailing newline); blank lines never affect the margin.
        body = line.lstrip(" \t")
        if body in ("", "\n"):
            continue

        if only_whitespace:
            # The leading run of spaces/tabs on this line.
            candidate = line[:len(line) - len(body)]
        else:
            # Whole line content may be shared; exclude the newline so it
            # can never be swallowed into the margin.
            candidate = line[:-1] if line.endswith("\n") else line

        if margin is None:
            margin = candidate
        elif candidate.startswith(margin):
            # Deeper than the current winner: no change.
            pass
        elif margin.startswith(candidate):
            # Shallower but consistent: new winner.
            margin = candidate
        else:
            # Keep only the longest common prefix of the two; a mismatch is
            # guaranteed within the shorter string here.
            for i, (x, y) in enumerate(zip(margin, candidate)):
                if x != y:
                    margin = margin[:i]
                    break

    margin_len = len(margin) if margin else 0

    # Build the result in one pass; a single join beats incremental
    # string concatenation.
    result = []
    append = result.append
    for line in lines:
        if line.lstrip(" \t") in ("", "\n"):
            # Blank lines normalize to a bare newline (or nothing at EOF).
            append("\n" if line.endswith("\n") else "")
        elif margin_len:
            # Every non-blank line starts with `margin` by construction,
            # so a plain slice is safe.
            append(line[margin_len:])
        else:
            append(line)

    return "".join(result)

which produces the following benchmark output:

if __name__ == '__main__':
    # Imports required by the benchmark (missing in the original snippet,
    # which raised NameError on `timeit` and `indent`).
    import timeit
    from textwrap import indent

    # Benchmark corpus: a large real-world source file.
    with open("../Objects/unicodeobject.c") as f:
        raw_text = f.read()

    tests = []
    test_names = ['', "    ", "\t", "abc  \t", ' \t  abc', ' \t  abc  \t  ']

    # Reference output from the stock implementation.
    temp = dedent(raw_text)

    for indent_v in test_names:
        text = indent(raw_text, indent_v)
        tests.append(text)

        output = dedent_faster(text, only_whitespace=False)
        print("Validating large text with not only whitespace indentation:", indent_v.encode("ascii"), output == temp)

    # Basic indented text with empty lines
    text = """
        def hello():
            print("Hello, world!")


        """

    tests.append(text)
    test_names.append("Basic indented text with empty lines")

    # Text with mixed indentation and blank lines
    text = """
        This is a test.

        Another line.

    """

    tests.append(text)
    test_names.append("Text with mixed indentation and blank lines")

    # No indentation (edge case)
    text = """No indents here.
    Just normal text.

    With empty lines."""

    tests.append(text)
    test_names.append("No indentation (edge case)")

    # Only blank lines (should preserve them)
    text = """

    """

    tests.append(text)
    test_names.append("Only blank lines")

    # Edge case: No common prefix to remove
    text = """hello
        world
    """

    tests.append(text)
    test_names.append("Edge case: No common prefix to remove")

    # Edge case: Single indented line
    text = """    Only one indented line"""

    tests.append(text)
    test_names.append("Edge case: Single indented line")

    # Edge case: Single indented line only
    text = """    """

    tests.append(text)
    test_names.append("Edge case: Single indented line only")

    # Edge case: Empty text
    text = """"""

    tests.append(text)
    test_names.append("Edge case: Empty text")

    for text, name in zip(tests, test_names):
        print(f"========= Case: {name.encode('ascii')} =========")

        a = dedent(text)

        for func in [dedent_faster, dedent]:
            single_test = func(text)

            print(func.__name__, a == single_test)

            it = timeit.Timer(lambda: func(text))
            result = it.repeat(number=10_000)
            result.sort()
            # timeit reports seconds; convert so the "msec" label is truthful.
            print(f"{func.__name__} Min: {result[0] * 1000:.4f}msec")

Returning the following:

Validating large text with not only whitespace indentation: b'' True
Validating large text with not only whitespace indentation: b'    ' True
Validating large text with not only whitespace indentation: b'\t' True
Validating large text with not only whitespace indentation: b'abc  \t' True
Validating large text with not only whitespace indentation: b' \t  abc' True
Validating large text with not only whitespace indentation: b' \t  abc  \t  ' True
========= Case: b'' =========
dedent_faster True
dedent_faster Min: 1.5848msec
dedent True
dedent Min: 6.6143msec
========= Case: b'    ' =========
dedent_faster True
dedent_faster Min: 2.5275msec
dedent True
dedent Min: 10.6884msec
========= Case: b'\t' =========
dedent_faster True
dedent_faster Min: 2.3215msec
dedent True
dedent Min: 9.9588msec
========= Case: b'abc  \t' =========
dedent_faster True
dedent_faster Min: 1.5026msec
dedent True
dedent Min: 6.7075msec
========= Case: b' \t  abc' =========
dedent_faster True
dedent_faster Min: 2.5997msec
dedent True
dedent Min: 10.6693msec
========= Case: b' \t  abc  \t  ' =========
dedent_faster True
dedent_faster Min: 2.6204msec
dedent True
dedent Min: 11.7285msec
========= Case: b'Basic indented text with empty lines' =========
dedent_faster True
dedent_faster Min: 0.0016msec
dedent True
dedent Min: 0.0022msec
========= Case: b'Text with mixed indentation and blank lines' =========
dedent_faster True
dedent_faster Min: 0.0016msec
dedent True
dedent Min: 0.0020msec
========= Case: b'No indentation (edge case)' =========
dedent_faster True
dedent_faster Min: 0.0008msec
dedent True
dedent Min: 0.0013msec
========= Case: b'Only blank lines' =========
dedent_faster True
dedent_faster Min: 0.0006msec
dedent True
dedent Min: 0.0005msec
========= Case: b'Edge case: No common prefix to remove' =========
dedent_faster True
dedent_faster Min: 0.0007msec
dedent True
dedent Min: 0.0008msec
========= Case: b'Edge case: Single indented line' =========
dedent_faster True
dedent_faster Min: 0.0004msec
dedent True
dedent Min: 0.0013msec
========= Case: b'Edge case: Single indented line only' =========
dedent_faster True
dedent_faster Min: 0.0002msec
dedent True
dedent Min: 0.0003msec
========= Case: b'Edge case: Empty text' =========
dedent_faster True
dedent_faster Min: 0.0001msec
dedent True
dedent Min: 0.0002msec

This yields roughly a 4x performance improvement for large files. Another advantage is that this method optionally allows removing the entire common leading prefix, not just leading whitespace.

( This was optimized iteratively using Claude + ChatGPT models )

Has this already been discussed elsewhere?

No response given

Links to previous discussion of this feature:

No response

Linked PRs

Metadata

Metadata

Assignees

No one assigned

    Labels

    performancePerformance or resource usagestdlibPython modules in the Lib dirtype-featureA feature request or enhancement

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions