Skip to content

gh-113317: Add libclinic.block_parser and libclinic.language modules #116819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
338 changes: 2 additions & 336 deletions Tools/clinic/clinic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
#
from __future__ import annotations

import abc
import argparse
import ast
import builtins as bltns
import collections
import contextlib
import dataclasses as dc
import enum
Expand Down Expand Up @@ -57,6 +55,8 @@
ClassDict, ModuleDict, FunctionKind,
CALLABLE, STATIC_METHOD, CLASS_METHOD, METHOD_INIT, METHOD_NEW,
GETTER, SETTER)
from libclinic.language import Language, PythonLanguage
from libclinic.block_parser import Block, BlockParser


# TODO:
Expand Down Expand Up @@ -144,96 +144,6 @@ def __init__(self) -> None:
self.unlock: list[str] = []


class Language(metaclass=abc.ABCMeta):
    """
    Abstract base for a host language in which Clinic blocks are embedded.

    The class attributes below are marker templates that subclasses
    override.  validate() requires start_line and stop_line to contain
    {dsl_name} exactly once, and checksum_line to contain {dsl_name}
    plus exactly one of {arguments} / {checksum}.
    """

    start_line = ""
    body_prefix = ""
    stop_line = ""
    checksum_line = ""

    def __init__(self, filename: str) -> None:
        self.filename = filename

    @abc.abstractmethod
    def render(
        self,
        clinic: Clinic,
        signatures: Iterable[Module | Class | Function]
    ) -> str:
        """Return the rendered text for *signatures*; subclasses must override."""

    def parse_line(self, line: str) -> None:
        """Per-line hook; does nothing in the base class."""

    def validate(self) -> None:
        """
        Check that the marker templates of this language are well formed.

        Every problem is reported through fail().
        """
        owner = self.__class__.__name__

        def assert_only_one(attr: str, *additional_fields: str) -> None:
            """
            Ensure that the template stored at getattr(self, attr) uses
            every permitted replacement field exactly once and uses no
            other field.  The permitted fields are 'dsl_name' plus
            whatever is given in additional_fields.

            e.g.
                self.fmt = "{dsl_name} {a} {b}"

                # passes
                assert_only_one('fmt', 'a', 'b')

                # fails: the template contains {b}, which is not permitted
                assert_only_one('fmt', 'a')

                # fails: the template has no {c}
                assert_only_one('fmt', 'a', 'b', 'c')

                # fails: {a} appears twice, it must appear exactly once
                self.fmt2 = '{dsl_name} {a} {a}'
                assert_only_one('fmt2', 'a')
            """
            allowed = ['dsl_name', *additional_fields]
            template: str = getattr(self, attr)

            # Formatting the template is done only for its side effect:
            # afterwards counter.counts maps field name -> occurrences.
            counter = libclinic.FormatCounterFormatter()
            counter.format(template)

            def require(field_name: str) -> None:
                fail("{} {} must contain {{{}}} exactly once!".format(
                    owner, attr, field_name))

            def forbid(field_name: str) -> None:
                fail("{} {} must not contain {{{}}}!".format(
                    owner, attr, field_name))

            # First pass: look at the fields the template actually uses.
            for seen, occurrences in counter.counts.items():
                if seen not in allowed:
                    forbid(seen)
                elif occurrences > 1:
                    require(seen)

            # Second pass: every permitted field has to be present once.
            for expected in allowed:
                if counter.counts.get(expected) != 1:
                    require(expected)

        for marker in ('start_line', 'stop_line'):
            assert_only_one(marker)

        if "{arguments}" in self.checksum_line:
            extra = "arguments"
        else:
            extra = "checksum"
        assert_only_one('checksum_line', extra)



class PythonLanguage(Language):
    """
    Marker templates for Clinic blocks embedded in Python source.

    Every marker, and the prefix expected on each body line, begins
    with '#'.
    """

    language = 'Python'

    # Prefix carried by each line of the block body.
    body_prefix = "#"

    # Block delimiters; {dsl_name} and {arguments} are replacement fields.
    start_line = "#/*[{dsl_name} input]"
    stop_line = "#[{dsl_name} start generated code]*/"
    checksum_line = "#/*[{dsl_name} end generated code: {arguments}]*/"


# Type alias: a variable-length tuple whose items are all "Parameter"
# (written as a string, i.e. a forward reference to the class name).
ParamTuple = tuple["Parameter", ...]


Expand Down Expand Up @@ -1646,250 +1556,6 @@ def render_function(
return clinic.get_destination('block').dump()


@dc.dataclass(slots=True, repr=False)
class Block:
r"""
Represents a single block of text embedded in
another file. If dsl_name is None, the block represents
verbatim text, raw original text from the file, in
which case "input" will be the only non-false member.
If dsl_name is not None, the block represents a Clinic
block.

input is always str, with embedded \n characters.
input represents the original text from the file;
if it's a Clinic block, it is the original text with
the body_prefix and redundant leading whitespace removed.

dsl_name is either str or None. If str, it's the text
found on the start line of the block between the square
brackets.

signatures is a list.
It may only contain clinic.Module, clinic.Class, and
clinic.Function objects. At the moment it should
contain at most one of each.

output is either str or None. If str, it's the output
from this block, with embedded '\n' characters.

indent is a str. It's the leading whitespace
that was found on every line of input. (If body_prefix is
not empty, this is the indent *after* removing the
body_prefix.)

"indent" is different from the concept of "preindent"
(which is not stored as state on Block objects).
"preindent" is the whitespace that
was found in front of every line of input *before* the
"body_prefix" (see the Language object). If body_prefix
is empty, preindent must always be empty too.

To illustrate the difference between "indent" and "preindent":

Assume that '_' represents whitespace.
If the block processed was in a Python file, and looked like this:
____#/*[python]
____#__for a in range(20):
____#____print(a)
____#[python]*/
"preindent" would be "____" and "indent" would be "__".

"""
input: str
dsl_name: str | None = None
signatures: list[Module | Class | Function] = dc.field(default_factory=list)
output: Any = None # TODO: Very dynamic; probably untypeable in its current form?
indent: str = ''

def __repr__(self) -> str:
dsl_name = self.dsl_name or "text"
def summarize(s: object) -> str:
s = repr(s)
if len(s) > 30:
return s[:26] + "..." + s[0]
return s
parts = (
repr(dsl_name),
f"input={summarize(self.input)}",
f"output={summarize(self.output)}"
)
return f"<clinic.Block {' '.join(parts)}>"


class BlockParser:
    """
    Block-oriented parser for Argument Clinic.
    Iterator, yields Block objects.

    Text outside Clinic markers is yielded as verbatim Blocks
    (dsl_name is None); text between a start line and a stop line is
    yielded as a Clinic Block, together with any previously generated
    output that is followed by a checksum line.
    """

    def __init__(
            self,
            input: str,
            language: Language,
            *,
            verify: bool = True
    ) -> None:
        r"""
        "input" should be a str object
        with embedded \n characters.

        "language" should be a Language object.

        "verify" selects whether the checksum stored on a checksum
        line is compared against the checksum of the existing output.
        """
        language.validate()

        # Lines are stored reversed so that pop() hands out the next line
        # and extend() can push lines back in front of the remaining input.
        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
        self.block_start_line_number = self.line_number = 0

        self.language = language
        before, marker, after = language.start_line.partition('{dsl_name}')
        assert marker == '{dsl_name}'
        # NOTE: find_start_re is built here but not used by any method of
        # this class.
        self.find_start_re = libclinic.create_regex(before, after,
                                                    whole_line=False)
        self.start_re = libclinic.create_regex(before, after)
        self.verify = verify
        # Checksum regex of the most recently parsed dsl_name, cached so
        # consecutive blocks of the same DSL do not rebuild it.
        self.last_checksum_re: re.Pattern[str] | None = None
        self.last_dsl_name: str | None = None
        # Set by parse_verbatim_block() when it runs into a start line;
        # tells __next__() that a Clinic block comes next.
        self.dsl_name: str | None = None
        self.first_block = True

    def __iter__(self) -> BlockParser:
        return self

    def __next__(self) -> Block:
        """
        Return the next Block, or raise StopIteration once the input
        is exhausted.

        A ClinicError raised while parsing a Clinic block gets the
        language's filename and the current line number attached
        before it is re-raised.
        """
        while True:
            if not self.input:
                raise StopIteration

            if self.dsl_name:
                try:
                    return_value = self.parse_clinic_block(self.dsl_name)
                except ClinicError as exc:
                    exc.filename = self.language.filename
                    exc.lineno = self.line_number
                    raise
                self.dsl_name = None
                self.first_block = False
                return return_value
            block = self.parse_verbatim_block()
            # Skip an empty verbatim block at the very start of the file
            # (i.e. the file begins with a start line).
            if self.first_block and not block.input:
                continue
            self.first_block = False
            return block

    def is_start_line(self, line: str) -> str | None:
        """Return the dsl_name if *line* is a start line, else None."""
        match = self.start_re.match(line.lstrip())
        return match.group(1) if match else None

    def _line(self, lookahead: bool = False) -> str:
        """
        Pop and return the next input line, advancing line_number.

        Unless *lookahead* is true, the line is also passed to
        language.parse_line().
        """
        self.line_number += 1
        line = self.input.pop()
        if not lookahead:
            self.language.parse_line(line)
        return line

    def parse_verbatim_block(self) -> Block:
        """
        Collect lines up to (not including) the next start line and
        return them as a verbatim Block.  The start line is consumed
        and its dsl_name is remembered in self.dsl_name.
        """
        lines = []
        self.block_start_line_number = self.line_number

        while self.input:
            line = self._line()
            dsl_name = self.is_start_line(line)
            if dsl_name:
                self.dsl_name = dsl_name
                break
            lines.append(line)

        return Block("".join(lines))

    def parse_clinic_block(self, dsl_name: str) -> Block:
        """
        Parse the Clinic block whose start line was just consumed.

        The body is read up to the stop line (or a new start line).
        Then the following lines are scanned for a checksum line: when
        one is found the lines in between become the Block's output
        and, if self.verify is set, their checksum is checked;
        otherwise the scanned lines are pushed back onto the input and
        the Block's output is None.

        Problems (garbage after / whitespace before the stop line, a
        mangled checksum line, a checksum mismatch) are reported
        through fail().
        """
        in_lines = []
        self.block_start_line_number = self.line_number + 1
        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)

        def is_stop_line(line: str) -> bool:
            # make sure to recognize stop line even if it
            # doesn't end with EOL (it could be the very end of the file)
            if line.startswith(stop_line):
                remainder = line.removeprefix(stop_line)
                if remainder and not remainder.isspace():
                    fail(f"Garbage after stop line: {remainder!r}")
                return True
            else:
                # gh-92256: don't allow incorrectly formatted stop lines
                if line.lstrip().startswith(stop_line):
                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
                return False

        # consume body of program
        while self.input:
            line = self._line()
            if is_stop_line(line) or self.is_start_line(line):
                break
            if body_prefix:
                line = line.lstrip()
                assert line.startswith(body_prefix)
                line = line.removeprefix(body_prefix)
            in_lines.append(line)

        # consume output and checksum line, if present.
        if self.last_dsl_name == dsl_name:
            checksum_re = self.last_checksum_re
        else:
            # Fill in dsl_name but keep {arguments} as a literal marker,
            # then build a regex that captures whatever takes its place.
            template = self.language.checksum_line.format(
                dsl_name=dsl_name, arguments='{arguments}')
            before, marker, after = template.partition('{arguments}')
            assert marker == '{arguments}'
            checksum_re = libclinic.create_regex(before, after, word=False)
            self.last_dsl_name = dsl_name
            self.last_checksum_re = checksum_re
        assert checksum_re is not None

        # scan forward for checksum line
        out_lines = []
        arguments = None
        while self.input:
            line = self._line(lookahead=True)
            match = checksum_re.match(line.lstrip())
            arguments = match.group(1) if match else None
            if arguments:
                break
            out_lines.append(line)
            # Reaching another start line means this block has no
            # checksum line; the scanned lines are pushed back below.
            if self.is_start_line(line):
                break

        output: str | None
        output = "".join(out_lines)
        if arguments:
            d = {}
            for field in shlex.split(arguments):
                name, equals, value = field.partition('=')
                if not equals:
                    fail(f"Mangled Argument Clinic marker line: {line!r}")
                d[name.strip()] = value.strip()

            if self.verify:
                # A marker that has an 'input' field keeps the output
                # checksum under 'output'; otherwise it is under 'checksum'.
                key = 'output' if 'input' in d else 'checksum'
                checksum = d.get(key)
                if checksum is None:
                    # Well-formed key=value pairs, but the checksum field is
                    # absent: report it like any other mangled marker line
                    # instead of letting a KeyError escape.
                    fail(f"Mangled Argument Clinic marker line: {line!r}")

                computed = libclinic.compute_checksum(output, len(checksum))
                if checksum != computed:
                    fail("Checksum mismatch! "
                         f"Expected {checksum!r}, computed {computed!r}. "
                         "Suggested fix: remove all generated code including "
                         "the end marker, or use the '-f' option.")
        else:
            # put back output
            output_lines = output.splitlines(keepends=True)
            self.line_number -= len(output_lines)
            self.input.extend(reversed(output_lines))
            output = None

        return Block("".join(in_lines), dsl_name, output=output)


@dc.dataclass(slots=True, frozen=True)
class Include:
"""
Expand Down
Loading