diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..fb95f884 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: "pypi/pyparsing" +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 00000000..4abec8a8 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +Pyparsing itself has no known security vulnerabilities. It does not +itself access any risk-inherent methods like `exec` or `eval`, nor does it import +any modules not part of the Python standard library. + +Parsers written with pyparsing *may* introduce security vulnerabilities. If so, this +information should be forwarded to the maintainer of those parsers. + +If you find that pyparsing itself has a security vulnerability, please report it to +https://tidelift.com/security. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..39b78521 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,48 @@ +name: Continuous Integration +on: + push: + branches: + - master + + pull_request: + paths: + - .github/workflows/ci.yml + - pyparsing/* + - pyproject.toml + - tox.ini + +permissions: + contents: read + +jobs: + tests: + name: Unit tests + runs-on: ${{ matrix.os || 'ubuntu-latest' }} + strategy: + matrix: + os: ["ubuntu-latest"] + toxenv: [py, pyparsing_packaging] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + include: + - python-version: "3.11" + os: macos-latest + - python-version: "3.11" + toxenv: mypy-test + - python-version: "pypy-3.9" + env: + TOXENV: ${{ matrix.toxenv || 'py' }} + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox railroad-diagrams Jinja2 + + - name: Test + run: tox diff --git a/.gitignore b/.gitignore index 98ea5c75..68c9efcc 100644 --- a/.gitignore +++ b/.gitignore @@ -7,22 +7,7 @@ working/* # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff: -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/dictionaries - -# Sensitive or high-churn files: -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml - -# Gradle: -.idea/**/gradle.xml -.idea/**/libraries +.idea # CMake cmake-build-debug/ @@ -171,3 +156,8 @@ venv.bak/ .mypy_cache/ # End of https://www.gitignore.io/api/python,pycharm + +# For developers on OSX +.DS_Store + +examples/verilog/ diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 4fa25d8f..00000000 --- 
a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 6c993b77..00000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 076a4ebe..00000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/pyparsing.iml b/.idea/pyparsing.iml deleted file mode 100644 index 5f2f3dc6..00000000 --- a/.idea/pyparsing.iml +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7f..00000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..799985da --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.6 diff --git a/.scrutinizer.yml b/.scrutinizer.yml deleted file mode 100644 index 871ceeec..00000000 --- a/.scrutinizer.yml +++ /dev/null @@ -1,39 +0,0 @@ -# This file contains the configuration for Scrutinizer-CI, a tool we use for software quality purposes. -build: - environment: - python: 3.6.3 - - variables: - PYTHON_VERSIONS: 'jython-2.7.1 pypy2.7-5.10.0 pypy3.5-5.10.1 2.7.15 3.3.6 3.4.5 3.5.6' - - dependencies: - override: - - 'SCRIPT_PATH=$PWD/scrutinizer-pyenv.sh' - - - 'pushd . ' - - 'cd $HOME' - - - command: '$SCRIPT_PATH' - not_if: 'exists-in-cache repository "$PYTHON_VERSIONS"' - idle_timeout: 3000 - - command: 'store-in-cache repository "$PYTHON_VERSIONS" .pyenv' - not_if: 'exists-in-cache repository "$PYTHON_VERSIONS"' - - - command: 'restore-from-cache repository "$PYTHON_VERSIONS"' - only_if: 'exists-in-cache repository "$PYTHON_VERSIONS"' - - - 'popd' - - - 'pip install -r requirements-dev.txt' - - tests: - override: - - 'pyenv local $PYTHON_VERSIONS' - # Following command generates .coverage file, the output of the "coverage" tool. - - command: 'tox && coverage combine' - coverage: # This section instructs Scrutinizer-CI this command produces test coverage output. - file: '.coverage' - format: 'py-cc' - - 'py-scrutinizer-run' - - - true # to have scrutinizer-ci not infer any commands. diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0d3fccf6..00000000 --- a/.travis.yml +++ /dev/null @@ -1,31 +0,0 @@ -sudo: false - -language: python - -matrix: - include: - - python: 2.7 - - python: 3.4 - - python: 3.5 - - python: 3.6 - - python: 3.7 - dist: xenial - sudo: true - fast_finish: true - -install: - - pip install codecov - -script: - - python unitTests.py - - python simple_unit_tests.py - - PYTHONPATH=. python examples/numerics.py - - PYTHONPATH=. python examples/TAP.py - - PYTHONPATH=. python examples/romanNumerals.py - - PYTHONPATH=. python examples/sexpParser.py - - PYTHONPATH=. python examples/oc.py - - PYTHONPATH=. python examples/delta_time.py - - PYTHONPATH=. 
python examples/eval_arith.py - -after_success: - - codecov diff --git a/BUILDING.md b/BUILDING.md new file mode 100644 index 00000000..a91e06fb --- /dev/null +++ b/BUILDING.md @@ -0,0 +1,38 @@ +# BUILDING + +pyparsing uses the [flit](https://flit.readthedocs.io/) build system +that is compliant with [PEP 517](https://www.python.org/dev/peps/pep-0517/). +Therefore, any PEP 517-compliant tools can be used to build it. + + +## Building using flit + +To build the distribution files using flit, type: + +``` +$ flit build +``` + +The generated sdist and wheel will be placed in `dist/` directory. + + +## Building using build + +[build](https://github.com/pypa/build) is a generic builder for PEP 517 +projects. To build the distribution files using build, type: + +``` +$ pyproject-build +``` + +The generated sdist and wheel will be placed in `dist/` directory. + + +## Testing + +pyparsing uses [tox](https://tox.wiki/en/latest/) to run tests. +In order to run the complete test suite, type: + +``` +$ tox +``` diff --git a/CHANGES b/CHANGES index 328e0c4b..74cf5a2d 100644 --- a/CHANGES +++ b/CHANGES @@ -2,26 +2,1208 @@ Change Log ========== -Version 2.4.2a - July, 2019 +NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as +`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should +get released some time later in 2023. I currently plan to completely +drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until +at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to +the new function names before the old functions are completely removed. (Big +help from Devin J. Pohly in structuring the code to enable this peaceful transition.) + +Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7. + +Version 3.1.0 - June, 2023 +-------------------------- +- Added `tag_emitter.py` to examples. This example demonstrates how to insert + tags into your parsed results that are not part of the original parsed text. + + +Version 3.1.0b2 - May, 2023 --------------------------- -It turns out I got the meaning of `[...]` absolutely backwards, -so I've deleted 2.4.1 and am repushing this release as 2.4.2a -for people to give it a try before I call it ready to go. +- Updated `create_diagram()` code to be compatible with railroad-diagrams package + version 3.0. Fixes Issue #477 (railroad diagrams generated with black bars), + reported by Sam Morley-Short. -The `expr[...]` notation was pushed out to be synonymous with -`OneOrMore(expr)`, but this is really counter to most Python -notations (and even other internal pyparsing notations as well). +- Fixed bug in `NotAny`, where parse actions on the negated expr were not being run. + This could cause `NotAny` to incorrectly fail if the expr would normally match, + but would fail to match if a condition used as a parse action returned False. + Fixes Issue #482, raised by byaka, thank you! -It also seems that I introduced an ugly bug in the changes made -to Or, so 2.4.1 really needs to be unreleased. So sorry, -everyone! +- Fixed `create_diagram()` to accept keyword args, to be passed through to the + `template.render()` method to generate the output HTML (PR submitted by Aussie Schnore, + good catch!) -(Updated) -- A new shorthand notation has been added for repetition - expressions: expr[min, max], with '...' valid as a min - or max value: - - expr[...] and expr[0, ...] are equivalent to - ZeroOrMore(expr) +- Fixed bug in `python_quoted_string` regex. 
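+
+  As a quick illustration of `python_quoted_string` (a sketch, not taken
+  from the unit tests):
+
+      import pyparsing as pp
+
+      # matches single-, double-, and triple-quoted Python strings
+      for s in ['"abc"', "'abc'", '"""multi\nline"""']:
+          print(pp.python_quoted_string.parse_string(s))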
+
+- Added `examples/bf.py` Brainf*ck parser/executor example. Illustrates using
+  a pyparsing grammar to parse language syntax, and attach executable AST nodes to
+  the parsed results.
+
+
+Version 3.1.0b1 - April, 2023
+-----------------------------
+- Added support for Python 3.12.
+
+- API CHANGE: A slight change has been implemented when unquoting a quoted string
+  parsed using the `QuotedString` class. Formerly, when unquoting and processing
+  whitespace markers such as \t and \n, these substitutions would occur first, and
+  then any additional '\' escaping would be done on the resulting string. This would
+  parse "\\n" as "\<newline>". Now escapes and whitespace markers are all processed
+  in a single pass working left to right, so the quoted string "\\n" would get unquoted
+  to "\n" (a backslash followed by "n"). Fixes issue #474 raised by jakeanq,
+  thanks!
+
+- Added named field "url" to `pyparsing.common.url`, returning the entire
+  parsed URL string.
+
+- Fixed bug in which, when parse actions returned an empty string for an expression
+  that had a results name, the results name was not saved. That is:
+
+      expr = Literal("X").add_parse_action(lambda tokens: "")("value")
+      result = expr.parse_string("X")
+      print(result["value"])
+
+  would raise a `KeyError`. Now empty strings will be saved with the associated
+  results name. Raised in Issue #470 by Nicco Kunzmann, thank you.
+
+- Fixed bug in `SkipTo` where ignore expressions were not properly handled while
+  scanning for the target expression. Issue #475, reported by elkniwt, thanks
+  (this bug has been there for a looooong time!).
+
+- Updated `ci.yml` permissions to limit default access to source - submitted by Joyce
+  Brum of Google. Thanks so much!
+
+- Updated the `lucene_grammar.py` example (better support for '*' and '?' wildcards)
+  and corrected the test cases - brought to my attention by Elijah Nicol, good catch!
+
+
+Version 3.1.0a1 - March, 2023
+-----------------------------
+- API ENHANCEMENT: `Optional(expr)` may now be written as `expr | ""`
+
+  This will make this code:
+
+      "{" + Optional(Literal("A") | Literal("a")) + "}"
+
+  writable as:
+
+      "{" + (Literal("A") | Literal("a") | "") + "}"
+
+  Some related changes implemented as part of this work:
+  - `Literal("")` now internally generates an `Empty()` (and no longer raises an exception)
+  - `Empty` is now a subclass of `Literal`
+
+  Suggested by Antony Lee (issue #412), PR (#413) by Devin J. Pohly.
+
+- Added new class property `identifier` to all Unicode set classes in `pyparsing.unicode`,
+  using the class's values for `cls.identchars` and `cls.identbodychars`. Now Unicode-aware
+  parsers that formerly wrote:
+
+      ppu = pyparsing.unicode
+      ident = Word(ppu.Greek.identchars, ppu.Greek.identbodychars)
+
+  can now write:
+
+      ident = ppu.Greek.identifier
+      # or
+      # ident = ppu.Ελληνικά.identifier
+
+- `ParseResults` now has a new method `deepcopy()`, in addition to the current
+  `copy()` method. `copy()` only makes a shallow copy - any contained `ParseResults`
+  are copied as references - changes in the copy will be seen as changes in the original.
+  In many cases, a shallow copy is sufficient, but some applications require a deep copy.
+  `deepcopy()` makes a deeper copy: any contained `ParseResults` or other mappings or
+  containers are built with copies from the original, and do not get changed if the
+  original is later changed. Addresses issue #463, reported by Bryn Pickering.
+
+- Reworked `delimited_list` function into the new `DelimitedList` class.
+  `DelimitedList` has the same constructor interface as `delimited_list`, and
+  in this release, `delimited_list` changes from a function to a synonym for
+  `DelimitedList`. `delimited_list` and the older `delimitedList` method will be
+  deprecated in a future release, in favor of `DelimitedList`.
+
+- Error messages from `MatchFirst` and `Or` expressions will try to give more details
+  if one of the alternatives matches better than the others, but still fails.
+  Question raised in Issue #464 by msdemlei, thanks!
+
+- Added new class method `ParserElement.using_each`, to simplify code
+  that creates a sequence of `Literals`, `Keywords`, or other `ParserElement`
+  subclasses.
+
+  For instance, to define suppressible punctuation, you would previously
+  write:
+
+      LPAR, RPAR, LBRACE, RBRACE, SEMI = map(Suppress, "(){};")
+
+  You can now write:
+
+      LPAR, RPAR, LBRACE, RBRACE, SEMI = Suppress.using_each("(){};")
+
+  `using_each` will also accept optional keyword args, which it will
+  pass through to the class initializer. Here is an expression for
+  single-letter variable names that might be used in an algebraic
+  expression:
+
+      algebra_var = MatchFirst(
+          Char.using_each(string.ascii_lowercase, as_keyword=True)
+      )
+
+- Added new builtin `python_quoted_string`, which will match any form
+  of single-line or multiline quoted strings defined in Python. (Inspired
+  by discussion with Andreas Schörgenhumer in Issue #421.)
+
+- Extended `expr[]` notation for repetition of `expr` to accept a
+  slice, where the slice's stop value indicates a `stop_on`
+  expression:
+
+      test = "BEGIN aaa bbb ccc END"
+      BEGIN, END = Keyword.using_each("BEGIN END".split())
+      body_word = Word(alphas)
+
+      expr = BEGIN + Group(body_word[...:END]) + END
+      # equivalent to
+      # expr = BEGIN + Group(ZeroOrMore(body_word, stop_on=END)) + END
+
+      print(expr.parse_string(test))
+
+  Prints:
+
+      ['BEGIN', ['aaa', 'bbb', 'ccc'], 'END']
+
+- `ParserElement.validate()` is deprecated. It predates the support for left-recursive
+  parsers, and was prone to false positives (warning that a grammar was invalid when
+  it was in fact valid). It will be removed in a future pyparsing release. In its
+  place, developers should use debugging and analytical tools, such as `ParserElement.set_debug()`
+  and `ParserElement.create_diagram()`.
+  (Raised in Issue #444, thanks Andrea Micheli!)
+
+- Added bool `embed` argument to `ParserElement.create_diagram()`.
+  When passed as True, the resulting diagram will omit the `<DOCTYPE>`,
+  `<HEAD>`, and `<BODY>` tags so that it can be embedded in other
+  HTML source. (Useful when embedding a call to `create_diagram()` in
+  a PyScript HTML page.)
+
+- Added `recurse` argument to `ParserElement.set_debug` to set the
+  debug flag on an expression and all of its sub-expressions. Requested
+  by multimeric in Issue #399.
+
+- Added '·' (Unicode MIDDLE DOT) to the set of Latin1.identbodychars.
+
+- Fixed bug in `Word` when `max=2`. Also added performance enhancement
+  when specifying `exact` argument. Reported in issue #409 by
+  panda-34, nice catch!
+
+- `Word` arguments are now validated when `min` and `max` are both
+  given, to ensure that `min` <= `max`; raises `ValueError` if not.
+
+- Fixed bug in srange, when parsing escaped '/' and '\' inside a
+  range set.
+
+- Fixed exception messages for some `ParserElements` with custom names,
+  which instead showed their contained expression names.
+
+- Fixed bug in pyparsing.common.url, when input URL is not alone
+  on an input line. Fixes Issue #459, reported by David Kennedy.
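+
+  The two `pyparsing.common.url` changes in this release - the new "url"
+  named field, and matching a URL embedded in other text - can be seen
+  together in this sketch (illustrative, not from the test suite):
+
+      import pyparsing as pp
+      ppc = pp.pyparsing_common
+
+      text = "docs live at https://pyparsing-docs.readthedocs.io these days"
+      for match in ppc.url.search_string(text):
+          print(match.url)    # the full matched URL string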
+ +- Multiple added and corrected type annotations. With much help from + Stephen Rosen, thanks! + +- Some documentation and error message clarifications on pyparsing's + keyword logic, cited by Basil Peace. + +- General docstring cleanup for Sphinx doc generation, PRs submitted + by Devin J. Pohly. A dirty job, but someone has to do it - much + appreciated! + +- `invRegex.py` example renamed to `inv_regex.py` and updated to PEP-8 + variable and method naming. PR submitted by Ross J. Duff, thanks! + +- Removed examples `sparser.py` and `pymicko.py`, since each included its + own GPL license in the header. Since this conflicts with pyparsing's + MIT license, they were removed from the distribution to avoid + confusion among those making use of them in their own projects. + + +Version 3.0.9 - May, 2022 +------------------------- +- Added Unicode set `BasicMultilingualPlane` (may also be referenced + as `BMP`) representing the Basic Multilingual Plane (Unicode + characters up to code point 65535). Can be used to parse + most language characters, but omits emojis, wingdings, etc. + Raised in discussion with Dave Tapley (issue #392). + +- To address mypy confusion of `pyparsing.Optional` and `typing.Optional` + resulting in `error: "_SpecialForm" not callable` message + reported in issue #365, fixed the import in `exceptions.py`. Nice + sleuthing by Iwan Aucamp and Dominic Davis-Foster, thank you! + (Removed definitions of `OptionalType`, `DictType`, and `IterableType` + and replaced them with `typing.Optional`, `typing.Dict`, and + `typing.Iterable` throughout.) + +- Fixed typo in jinja2 template for railroad diagrams, thanks for the + catch Nioub (issue #388). + +- Removed use of deprecated `pkg_resources` package in + railroad diagramming code (issue #391). + +- Updated `bigquery_view_parser.py` example to parse examples at + https://cloud.google.com/bigquery/docs/reference/legacy-sql + + +Version 3.0.8 - April, 2022 +--------------------------- +- API CHANGE: modified `pyproject.toml` to require Python version + 3.6.8 or later for pyparsing 3.x. Earlier minor versions of 3.6 + fail in evaluating the `version_info` class (implemented using + `typing.NamedTuple`). If you are using an earlier version of Python + 3.6, you will need to use pyparsing 2.4.7. + +- Improved pyparsing import time by deferring regex pattern compiles. + PR submitted by Anthony Sottile to fix issue #362, thanks! + +- Updated build to use flit, PR by Michał Górny, added `BUILDING.md` + doc and removed old Windows build scripts - nice cleanup work! + +- More type-hinting added for all arithmetic and logical operator + methods in `ParserElement`. PR from Kazantcev Andrey, thank you. + +- Fixed `infix_notation`'s definitions of `lpar` and `rpar`, to accept + parse expressions such that they do not get suppressed in the parsed + results. PR submitted by Philippe Prados, nice work. + +- Fixed bug in railroad diagramming with expressions containing `Combine` + elements. Reported by Jeremy White, thanks! + +- Added `show_groups` argument to `create_diagram` to highlight grouped + elements with an unlabeled bounding box. + +- Added `unicode_denormalizer.py` to the examples as a demonstration + of how Python's interpreter will accept Unicode characters in + identifiers, but normalizes them back to ASCII so that identifiers + `print` and `𝕡𝓻ᵢ𝓃𝘁` and `𝖕𝒓𝗂𝑛ᵗ` are all equivalent. + +- Removed imports of deprecated `sre_constants` module for catching + exceptions when compiling regular expressions. 
PR submitted by + Serhiy Storchaka, thank you. + + +Version 3.0.7 - January, 2022 +----------------------------- +- Fixed bug #345, in which delimitedList changed expressions in place + using `expr.streamline()`. Reported by Kim Gräsman, thanks! + +- Fixed bug #346, when a string of word characters was passed to WordStart + or `WordEnd` instead of just taking the default value. Originally posted + as a question by Parag on StackOverflow, good catch! + +- Fixed bug #350, in which `White` expressions could fail to match due to + unintended whitespace-skipping. Reported by Fu Hanxi, thank you! + +- Fixed bug #355, when a `QuotedString` is defined with characters in its + quoteChar string containing regex-significant characters such as ., *, + ?, [, ], etc. + +- Fixed bug in `ParserElement.run_tests` where comments would be displayed + using `with_line_numbers`. + +- Added optional "min" and "max" arguments to `delimited_list`. PR + submitted by Marius, thanks! + +- Added new API change note in `whats_new_in_pyparsing_3_0_0`, regarding + a bug fix in the `bool()` behavior of `ParseResults`. + + Prior to pyparsing 3.0.x, the `ParseResults` class implementation of + `__bool__` would return `False` if the `ParseResults` item list was empty, + even if it contained named results. In 3.0.0 and later, `ParseResults` will + return `True` if either the item list is not empty *or* if the named + results dict is not empty. + + # generate an empty ParseResults by parsing a blank string with + # a ZeroOrMore + result = Word(alphas)[...].parse_string("") + print(result.as_list()) + print(result.as_dict()) + print(bool(result)) + + # add a results name to the result + result["name"] = "empty result" + print(result.as_list()) + print(result.as_dict()) + print(bool(result)) + + Prints: + + [] + {} + False + + [] + {'name': 'empty result'} + True + + In previous versions, the second call to `bool()` would return `False`. + +- Minor enhancement to Word generation of internal regular expression, to + emit consecutive characters in range, such as "ab", as "ab", not "a-b". + +- Fixed character ranges for search terms using non-Western characters + in booleansearchparser, PR submitted by tc-yu, nice work! + +- Additional type annotations on public methods. + + +Version 3.0.6 - November, 2021 +------------------------------ +- Added `suppress_warning()` method to individually suppress a warning on a + specific ParserElement. Used to refactor `original_text_for` to preserve + internal results names, which, while undocumented, had been adopted by + some projects. + +- Fix bug when `delimited_list` was called with a str literal instead of a + parse expression. + + +Version 3.0.5 - November, 2021 +------------------------------ +- Added return type annotations for `col`, `line`, and `lineno`. + +- Fixed bug when `warn_ungrouped_named_tokens_in_collection` warning was raised + when assigning a results name to an `original_text_for` expression. + (Issue #110, would raise warning in packaging.) + +- Fixed internal bug where `ParserElement.streamline()` would not return self if + already streamlined. + +- Changed `run_tests()` output to default to not showing line and column numbers. + If line numbering is desired, call with `with_line_numbers=True`. Also fixed + minor bug where separating line was not included after a test failure. 
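+
+  For example, line and column numbering in `run_tests()` output is now
+  opt-in (a sketch):
+
+      import pyparsing as pp
+
+      integer = pp.Word(pp.nums)
+      integer.run_tests("""
+          100
+          42
+          """, with_line_numbers=True)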
+ + +Version 3.0.4 - October, 2021 +----------------------------- +- Fixed bug in which `Dict` classes did not correctly return tokens as nested + `ParseResults`, reported by and fix identified by Bu Sun Kim, many thanks!!! + +- Documented API-changing side-effect of converting `ParseResults` to use `__slots__` + to pre-define instance attributes. This means that code written like this (which + was allowed in pyparsing 2.4.7): + + result = Word(alphas).parseString("abc") + result.xyz = 100 + + now raises this Python exception: + + AttributeError: 'ParseResults' object has no attribute 'xyz' + + To add new attribute values to ParseResults object in 3.0.0 and later, you must + assign them using indexed notation: + + result["xyz"] = 100 + + You will still be able to access this new value as an attribute or as an + indexed item. + +- Fixed bug in railroad diagramming where the vertical limit would count all + expressions in a group, not just those that would create visible railroad + elements. + + +Version 3.0.3 - October, 2021 +----------------------------- +- Fixed regex typo in `one_of` fix for `as_keyword=True`. + +- Fixed a whitespace-skipping bug, Issue #319, introduced as part of the revert + of the `LineStart` changes. Reported by Marc-Alexandre Côté, + thanks! + +- Added header column labeling > 100 in `with_line_numbers` - some input lines + are longer than others. + + +Version 3.0.2 - October, 2021 +----------------------------- +- Reverted change in behavior with `LineStart` and `StringStart`, which changed the + interpretation of when and how `LineStart` and `StringStart` should match when + a line starts with spaces. In 3.0.0, the `xxxStart` expressions were not + really treated like expressions in their own right, but as modifiers to the + following expression when used like `LineStart() + expr`, so that if there + were whitespace on the line before `expr` (which would match in versions prior + to 3.0.0), the match would fail. + + 3.0.0 implemented this by automatically promoting `LineStart() + expr` to + `AtLineStart(expr)`, which broke existing parsers that did not expect `expr` to + necessarily be right at the start of the line, but only be the first token + found on the line. This was reported as a regression in Issue #317. + + In 3.0.2, pyparsing reverts to the previous behavior, but will retain the new + `AtLineStart` and `AtStringStart` expression classes, so that parsers can chose + whichever behavior applies in their specific instance. Specifically: + + # matches expr if it is the first token on the line + # (allows for leading whitespace) + LineStart() + expr + + # matches only if expr is found in column 1 + AtLineStart(expr) + +- Performance enhancement to `one_of` to always generate an internal `Regex`, + even if `caseless` or `as_keyword` args are given as `True` (unless explicitly + disabled by passing `use_regex=False`). + +- `IndentedBlock` class now works with `recursive` flag. By default, the + results parsed by an `IndentedBlock` are grouped. This can be disabled by constructing + the `IndentedBlock` with `grouped=False`. + + +Version 3.0.1 - October, 2021 +----------------------------- +- Fixed bug where `Word(max=n)` did not match word groups less than length 'n'. + Thanks to Joachim Metz for catching this! + +- Fixed bug where `ParseResults` accidentally created recursive contents. + Joachim Metz on this one also! + +- Fixed bug where `warn_on_multiple_string_args_to_oneof` warning is raised + even when not enabled. 
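+
+  A quick sketch of the corrected `Word(max=n)` behavior described above
+  (illustrative):
+
+      import pyparsing as pp
+
+      word3 = pp.Word(pp.alphas, max=3)
+      print(word3.parse_string("ab"))     # shorter matches now succeed -> ['ab']
+      print(word3.parse_string("abcde"))  # still capped at 3 -> ['abc']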
+ + +Version 3.0.0 - October, 2021 +----------------------------- +- A consolidated list of all the changes in the 3.0.0 release can be found in + `docs/whats_new_in_3_0_0.rst`. + (https://github.com/pyparsing/pyparsing/blob/master/docs/whats_new_in_3_0_0.rst) + + +Version 3.0.0.final - October, 2021 +----------------------------------- +- Added support for python `-W` warning option to call `enable_all_warnings`() at startup. + Also detects setting of `PYPARSINGENABLEALLWARNINGS` environment variable to any non-blank + value. (If using `-Wd` for testing, but wishing to disable pyparsing warnings, add + `-Wi:::pyparsing`.) + +- Fixed named results returned by `url` to match fields as they would be parsed + using `urllib.parse.urlparse`. + +- Early response to `with_line_numbers` was positive, with some requested enhancements: + . added a trailing "|" at the end of each line (to show presence of trailing spaces); + can be customized using `eol_mark` argument + . added expand_tabs argument, to control calling str.expandtabs (defaults to True + to match `parseString`) + . added mark_spaces argument to support display of a printing character in place of + spaces, or Unicode symbols for space and tab characters + . added mark_control argument to support highlighting of control characters using + '.' or Unicode symbols, such as "␍" and "␊". + +- Modified helpers `common_html_entity` and `replace_html_entity()` to use the HTML + entity definitions from `html.entities.html5`. + +- Updated the class diagram in the pyparsing docs directory, along with the supporting + .puml file (PlantUML markup) used to create the diagram. + +- Added global method `autoname_elements()` to call `set_name()` on all locally + defined `ParserElements` that haven't been explicitly named using `set_name()`, using + their local variable name. Useful for setting names on multiple elements when + creating a railroad diagram. + + a = pp.Literal("a") + b = pp.Literal("b").set_name("bbb") + pp.autoname_elements() + + `a` will get named "a", while `b` will keep its name "bbb". + + +Version 3.0.0rc2 - October, 2021 +-------------------------------- +- Added `url` expression to `pyparsing_common`. (Sample code posted by Wolfgang Fahl, + very nice!) + + This new expression has been added to the `urlExtractorNew.py` example, to show how + it extracts URL fields into separate results names. + +- Added method to `pyparsing_test` to help debugging, `with_line_numbers`. + Returns a string with line and column numbers corresponding to values shown + when parsing with expr.set_debug(): + + data = """\ + A + 100""" + expr = pp.Word(pp.alphanums).set_name("word").set_debug() + print(ppt.with_line_numbers(data)) + expr[...].parseString(data) + + prints: + + 1 + 1234567890 + 1: A + 2: 100 + Match word at loc 3(1,4) + A + ^ + Matched word -> ['A'] + Match word at loc 11(2,7) + 100 + ^ + Matched word -> ['100'] + +- Added new example `cuneiform_python.py` to demonstrate creating a new Unicode + range, and writing a Cuneiform->Python transformer (inspired by zhpy). + +- Fixed issue #272, reported by PhasecoreX, when `LineStart`() expressions would match + input text that was not necessarily at the beginning of a line. + + As part of this fix, two new classes have been added: AtLineStart and AtStringStart. + The following expressions are equivalent: + + LineStart() + expr and AtLineStart(expr) + StringStart() + expr and AtStringStart(expr) + + [`LineStart` and `StringStart` changes reverted in 3.0.2.] 
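+
+  A sketch of the distinction, as these classes behave after the 3.0.2
+  revert (illustrative):
+
+      import pyparsing as pp
+
+      word = pp.Word(pp.alphas)
+
+      # first token on the line, leading whitespace allowed -> ['abc']
+      print((pp.LineStart() + word).parse_string("   abc"))
+
+      # must start in column 1 - this call raises a ParseException
+      pp.AtLineStart(word).parse_string("   abc")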
+ +- Fixed `ParseFatalExceptions` failing to override normal exceptions or expression + matches in `MatchFirst` expressions. Addresses issue #251, reported by zyp-rgb. + +- Fixed bug in which `ParseResults` replaces a collection type value with an invalid + type annotation (as a result of changed behavior in Python 3.9). Addresses issue #276, reported by + Rob Shuler, thanks. + +- Fixed bug in `ParseResults` when calling `__getattr__` for special double-underscored + methods. Now raises `AttributeError` for non-existent results when accessing a + name starting with '__'. Addresses issue #208, reported by Joachim Metz. + +- Modified debug fail messages to include the expression name to make it easier to sync + up match vs success/fail debug messages. + + +Version 3.0.0rc1 - September, 2021 +---------------------------------- +- Railroad diagrams have been reformatted: + . creating diagrams is easier - call + + expr.create_diagram("diagram_output.html") + + create_diagram() takes 3 arguments: + . the filename to write the diagram HTML + . optional 'vertical' argument, to specify the minimum number of items in a path + to be shown vertically; default=3 + . optional 'show_results_names' argument, to specify whether results name + annotations should be shown; default=False + . every expression that gets a name using `setName()` gets separated out as + a separate subdiagram + . results names can be shown as annotations to diagram items + . `Each`, `FollowedBy`, and `PrecededBy` elements get [ALL], [LOOKAHEAD], and [LOOKBEHIND] + annotations + . removed annotations for Suppress elements + . some diagram cleanup when a grammar contains Forward elements + . check out the examples make_diagram.py and railroad_diagram_demo.py + +- Type annotations have been added to most public API methods and classes. + +- Better exception messages to show full word where an exception occurred. + + Word(alphas, alphanums)[...].parseString("ab1 123", parseAll=True) + + Was: + pyparsing.ParseException: Expected end of text, found '1' (at char 4), (line:1, col:5) + Now: + pyparsing.exceptions.ParseException: Expected end of text, found '123' (at char 4), (line:1, col:5) + +- Suppress can be used to suppress text skipped using "...". + + source = "lead in START relevant text END trailing text" + start_marker = Keyword("START") + end_marker = Keyword("END") + find_body = Suppress(...) + start_marker + ... + end_marker + print(find_body.parseString(source).dump()) + + Prints: + + ['START', 'relevant text ', 'END'] + - _skipped: ['relevant text '] + +- New string constants `identchars` and `identbodychars` to help in defining identifier Word expressions + + Two new module-level strings have been added to help when defining identifiers, `identchars` and `identbodychars`. + + Instead of writing:: + + import pyparsing as pp + identifier = pp.Word(pp.alphas + "_", pp.alphanums + "_") + + you will be able to write:: + + identifier = pp.Word(pp.identchars, pp.identbodychars) + + Those constants have also been added to all the Unicode string classes:: + + import pyparsing as pp + ppu = pp.pyparsing_unicode + + cjk_identifier = pp.Word(ppu.CJK.identchars, ppu.CJK.identbodychars) + greek_identifier = pp.Word(ppu.Greek.identchars, ppu.Greek.identbodychars) + +- Added a caseless parameter to the `CloseMatch` class to allow for casing to be + ignored when checking for close matches. (Issue #281) (PR by Adrian Edwards, thanks!) + +- Fixed bug in Located class when used with a results name. 
(Issue #294) + +- Fixed bug in `QuotedString` class when the escaped quote string is not a + repeated character. (Issue #263) + +- `parseFile()` and `create_diagram()` methods now will accept `pathlib.Path` + arguments. + + +Version 3.0.0b3 - August, 2021 +------------------------------ +- PEP-8 compatible names are being introduced in pyparsing version 3.0! + All methods such as `parseString` have been replaced with the PEP-8 + compliant name `parse_string`. In addition, arguments such as `parseAll` + have been renamed to `parse_all`. For backward-compatibility, synonyms for + all renamed methods and arguments have been added, so that existing + pyparsing parsers will not break. These synonyms will be removed in a future + release. + + In addition, the Optional class has been renamed to Opt, since it clashes + with the common typing.Optional type specifier that is used in the Python + type annotations. A compatibility synonym is defined for now, but will be + removed in a future release. + +- HUGE NEW FEATURE - Support for left-recursive parsers! + Following the method used in Python's PEG parser, pyparsing now supports + left-recursive parsers when left recursion is enabled. + + import pyparsing as pp + pp.ParserElement.enable_left_recursion() + + # a common left-recursion definition + # define a list of items as 'list + item | item' + # BNF: + # item_list := item_list item | item + # item := word of alphas + item_list = pp.Forward() + item = pp.Word(pp.alphas) + item_list <<= item_list + item | item + + item_list.run_tests("""\ + To parse or not to parse that is the question + """) + Prints: + + ['To', 'parse', 'or', 'not', 'to', 'parse', 'that', 'is', 'the', 'question'] + + Great work contributed by Max Fischer! + +- `delimited_list` now supports an additional flag `allow_trailing_delim`, + to optionally parse an additional delimiter at the end of the list. + Contributed by Kazantcev Andrey, thanks! + +- Removed internal comparison of results values against b"", which + raised a `BytesWarning` when run with `python -bb`. Fixes issue #271 reported + by Florian Bruhin, thank you! + +- Fixed STUDENTS table in sql2dot.py example, fixes issue #261 reported by + legrandlegrand - much better. + +- Python 3.5 will not be supported in the pyparsing 3 releases. This will allow + for future pyparsing releases to add parameter type annotations, and to take + advantage of dict key ordering in internal results name tracking. + + +Version 3.0.0b2 - December, 2020 +-------------------------------- +- API CHANGE + `locatedExpr` is being replaced by the class `Located`. `Located` has the same + constructor interface as `locatedExpr`, but fixes bugs in the returned + `ParseResults` when the searched expression contains multiple tokens, or + has internal results names. + + `locatedExpr` is deprecated, and will be removed in a future release. + + +Version 3.0.0b1 - November, 2020 +-------------------------------- +- API CHANGE + Diagnostic flags have been moved to an enum, `pyparsing.Diagnostics`, and + they are enabled through module-level methods: + - `pyparsing.enable_diag()` + - `pyparsing.disable_diag()` + - `pyparsing.enable_all_warnings()` + +- API CHANGE + Most previous `SyntaxWarnings` that were warned when using pyparsing + classes incorrectly have been converted to `TypeError` and `ValueError` exceptions, + consistent with Python calling conventions. All warnings warned by diagnostic + flags have been converted from `SyntaxWarnings` to `UserWarnings`. 
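+
+  For example, enabling the diagnostics described above now looks like
+  this (sketch):
+
+      import pyparsing as pp
+
+      # enable a single diagnostic by its Diagnostics enum value...
+      pp.enable_diag(pp.Diagnostics.warn_multiple_tokens_in_named_alternation)
+
+      # ...or enable every warning-type diagnostic at once
+      pp.enable_all_warnings()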
+
+- To support parsers that are intended to generate native Python collection
+  types such as lists and dicts, the `Group` and `Dict` classes now accept
+  additional boolean keyword arguments `aslist` and `asdict`, respectively. See
+  the `jsonParser.py` example in the `pyparsing/examples` source directory for
+  how to return types as `ParseResults` and as Python collection types, and the
+  distinctions in working with the different types.
+
+  In addition, parse actions that must return a value of list type (which would
+  normally be converted internally to a `ParseResults`) can override this default
+  behavior by returning their list wrapped in the new `ParseResults.List` class:
+
+      # this parse action tries to return a list, but pyparsing
+      # will convert to a ParseResults
+      def return_as_list_but_still_get_parse_results(tokens):
+          return tokens.asList()
+
+      # this parse action returns the tokens as a list, and pyparsing will
+      # maintain its list type in the final parsing results
+      def return_as_list(tokens):
+          return ParseResults.List(tokens.asList())
+
+  This is the mechanism used internally by the `Group` class when defined
+  using `aslist=True`.
+
+- A new `IndentedBlock` class is introduced, to eventually replace the
+  current `indentedBlock` helper method. The interface is largely the same,
+  however, the new class manages its own internal indentation stack, so
+  it is no longer necessary to maintain an external `indentStack` variable.
+
+- API CHANGE
+  Added `cache_hit` keyword argument to debug actions. Previously, if packrat
+  parsing was enabled, the debug methods were not called in the event of cache
+  hits. Now these methods will be called, with an added argument
+  `cache_hit=True`.
+
+  If you are using packrat parsing and enable debug on expressions using a
+  custom debug method, you can add the `cache_hit=False` keyword argument,
+  and your method will be called on packrat cache hits. If you choose not
+  to add this keyword argument, the debug methods will fail silently,
+  behaving as they did previously.
+
+- When using `setDebug` with packrat parsing enabled, packrat cache hits will
+  now be included in the output, shown with a leading '*'. (Previously, cache
+  hits and responses were not included in debug output.) For those using custom
+  debug actions, see the previous item regarding an optional API change
+  for those methods.
+
+- `setDebug` output will also show more details about what expression
+  is about to be parsed (the current line of text being parsed, and
+  the current parse position):
+
+      Match integer at loc 0(1,1)
+        1 2 3
+        ^
+      Matched integer -> ['1']
+
+  The current debug location will also be indicated after whitespace
+  has been skipped (was previously inconsistent, reported in Issue #244,
+  by Frank Goyens, thanks!).
+
+- Modified the repr() output for `ParseResults` to include the class
+  name as part of the output. This is to clarify for new pyparsing users
+  who misread the repr output as a tuple of a list and a dict. pyparsing
+  results will now read like:
+
+      ParseResults(['abc', 'def'], {'qty': 100})
+
+  instead of just:
+
+      (['abc', 'def'], {'qty': 100})
+
+- Fixed bugs in Each when passed `OneOrMore` or `ZeroOrMore` expressions:
+  . first expression match could be enclosed in an extra nesting level
+  . out-of-order expressions now handled correctly if mixed with required
+    expressions
+  . results names are maintained correctly for these expressions
+
+- Fixed traceback trimming, and added `ParserElement.verbose_traceback`
+  save/restore to `reset_pyparsing_context()`.
+
+- Default string for `Word` expressions now also includes indications of
+  `min` and `max` length specification, if applicable, similar to regex length
+  specifications:
+
+      Word(alphas)             -> "W:(A-Za-z)"
+      Word(nums)               -> "W:(0-9)"
+      Word(nums, exact=3)      -> "W:(0-9){3}"
+      Word(nums, min=2)        -> "W:(0-9){2,...}"
+      Word(nums, max=3)        -> "W:(0-9){1,3}"
+      Word(nums, min=2, max=3) -> "W:(0-9){2,3}"
+
+  For expressions of the `Char` class (similar to `Word(..., exact=1)`), the
+  expression is simply the character range in parentheses:
+
+      Char(nums)   -> "(0-9)"
+      Char(alphas) -> "(A-Za-z)"
+
+- Removed `copy()` override in `Keyword` class which did not preserve definition
+  of ident chars from the original expression. PR #233 submitted by jgrey4296,
+  thanks!
+
+- In addition to `pyparsing.__version__`, there is now also a `pyparsing.__version_info__`,
+  following the same structure and field names as in `sys.version_info`.
+
+
+Version 3.0.0a2 - June, 2020
+----------------------------
+- Summary of changes for 3.0.0 can be found in "What's New in Pyparsing 3.0.0"
+  documentation.
+
+- API CHANGE
+  Changed result returned when parsing using `countedArray`,
+  the array items are no longer returned in a doubly-nested
+  list.
+
+- An excellent new enhancement is the new railroad diagram
+  generator for documenting pyparsing parsers:
+
+      import pyparsing as pp
+      from pyparsing.diagram import to_railroad, railroad_to_html
+      from pathlib import Path
+
+      # define a simple grammar for parsing street addresses such
+      # as "123 Main Street"
+      #     number word...
+      number = pp.Word(pp.nums).setName("number")
+      name = pp.Word(pp.alphas).setName("word")[1, ...]
+
+      parser = number("house_number") + name("street")
+      parser.setName("street address")
+
+      # construct railroad track diagram for this parser and
+      # save as HTML
+      rr = to_railroad(parser)
+      Path('parser_rr_diag.html').write_text(railroad_to_html(rr))
+
+  Very nice work provided by Michael Milton, thanks a ton!
+
+- Enhanced default strings created for Word expressions, now showing
+  string ranges if possible. `Word(alphas)` would formerly
+  print as `W:(ABCD...)`, now prints as `W:(A-Za-z)`.
+
+- Added `ignoreWhitespace(recurse: bool = True)` and added a
+  recurse argument to `leaveWhitespace`, both added to provide finer
+  control over pyparsing's whitespace skipping. Also contributed
+  by Michael Milton.
+
+- The unicode range definitions for the various languages were
+  recalculated by interrogating the unicodedata module by character
+  name, selecting characters that contained that language in their
+  Unicode name. (Issue #227)
+
+  Also, pyparsing_unicode.Korean was renamed to Hangul (Korean
+  is also defined as a synonym for compatibility).
+
+- Enhanced `ParseResults` dump() to show both results names and list
+  subitems. Fixes bug where adding a results name would hide
+  lower-level structures in the `ParseResults`.
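+
+  For example (a sketch - exact dump() formatting may vary slightly):
+
+      import pyparsing as pp
+
+      name = pp.Word(pp.alphas)("name")
+      qty = pp.Word(pp.nums)("qty")
+      result = (name + qty).parseString("widget 100")
+
+      # dump() now shows the list items and the named results
+      print(result.dump())
+      # ['widget', '100']
+      # - name: 'widget'
+      # - qty: '100'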
+ +- Added new __diag__ warnings: + + "warn_on_parse_using_empty_Forward" - warns that a Forward + has been included in a grammar, but no expression was + attached to it using '<<=' or '<<' + + "warn_on_assignment_to_Forward" - warns that a Forward has + been created, but was probably later overwritten by + erroneously using '=' instead of '<<=' (this is a common + mistake when using Forwards) + (**currently not working on PyPy**) + +- Added `ParserElement`.recurse() method to make it simpler for + grammar utilities to navigate through the tree of expressions in + a pyparsing grammar. + +- Fixed bug in `ParseResults` repr() which showed all matching + entries for a results name, even if `listAllMatches` was set + to False when creating the `ParseResults` originally. Reported + by Nicholas42 on GitHub, good catch! (Issue #205) + +- Modified refactored modules to use relative imports, as + pointed out by setuptools project member jaraco, thank you! + +- Off-by-one bug found in the roman_numerals.py example, a bug + that has been there for about 14 years! PR submitted by + Jay Pedersen, nice catch! + +- A simplified Lua parser has been added to the examples + (lua_parser.py). + +- Added make_diagram.py to the examples directory to demonstrate + creation of railroad diagrams for selected pyparsing examples. + Also restructured some examples to make their parsers importable + without running their embedded tests. + + +Version 3.0.0a1 - April, 2020 +----------------------------- +- Removed Py2.x support and other deprecated features. Pyparsing + now requires Python 3.5 or later. If you are using an earlier + version of Python, you must use a Pyparsing 2.4.x version + + Deprecated features removed: + . `ParseResults.asXML()` - if used for debugging, switch + to using `ParseResults.dump()`; if used for data transfer, + use `ParseResults.asDict()` to convert to a nested Python + dict, which can then be converted to XML or JSON or + other transfer format + + . `operatorPrecedence` synonym for `infixNotation` - + convert to calling `infixNotation` + + . `commaSeparatedList` - convert to using + pyparsing_common.comma_separated_list + + . `upcaseTokens` and `downcaseTokens` - convert to using + `pyparsing_common.upcaseTokens` and `downcaseTokens` + + . __compat__.collect_all_And_tokens will not be settable to + False to revert to pre-2.3.1 results name behavior - + review use of names for `MatchFirst` and Or expressions + containing And expressions, as they will return the + complete list of parsed tokens, not just the first one. + Use `__diag__.warn_multiple_tokens_in_named_alternation` + to help identify those expressions in your parsers that + will have changed as a result. + +- Removed support for running `python setup.py test`. The setuptools + maintainers consider the test command deprecated (see + ). To run the Pyparsing test, + use the command `tox`. + +- API CHANGE: + The staticmethod `ParseException.explain` has been moved to + `ParseBaseException.explain_exception`, and a new `explain` instance + method added to `ParseBaseException`. This will make calls to `explain` + much more natural: + + try: + expr.parseString("...") + except ParseException as pe: + print(pe.explain()) + +- POTENTIAL API CHANGE: + `ZeroOrMore` expressions that have results names will now + include empty lists for their name if no matches are found. + Previously, no named result would be present. Code that tested + for the presence of any expressions using "if name in results:" + will now always return True. 
This code will need to change to + "if name in results and results[name]:" or just + "if results[name]:". Also, any parser unit tests that check the + `asDict()` contents will now see additional entries for parsers + having named `ZeroOrMore` expressions, whose values will be `[]`. + +- POTENTIAL API CHANGE: + Fixed a bug in which calls to `ParserElement.setDefaultWhitespaceChars` + did not change whitespace definitions on any pyparsing built-in + expressions defined at import time (such as `quotedString`, or those + defined in pyparsing_common). This would lead to confusion when + built-in expressions would not use updated default whitespace + characters. Now a call to `ParserElement.setDefaultWhitespaceChars` + will also go and update all pyparsing built-ins to use the new + default whitespace characters. (Note that this will only modify + expressions defined within the pyparsing module.) Prompted by + work on a StackOverflow question posted by jtiai. + +- Expanded __diag__ and __compat__ to actual classes instead of + just namespaces, to add some helpful behavior: + - enable() and .disable() methods to give extra + help when setting or clearing flags (detects invalid + flag names, detects when trying to set a __compat__ flag + that is no longer settable). Use these methods now to + set or clear flags, instead of directly setting to True or + False. + + import pyparsing as pp + pp.__diag__.enable("warn_multiple_tokens_in_named_alternation") + + - __diag__.enable_all_warnings() is another helper that sets + all "warn*" diagnostics to True. + + pp.__diag__.enable_all_warnings() + + - added new warning, "warn_on_match_first_with_lshift_operator" to + warn when using '<<' with a '|' `MatchFirst` operator, which will + create an unintended expression due to precedence of operations. + + Example: This statement will erroneously define the `fwd` expression + as just `expr_a`, even though `expr_a | expr_b` was intended, + since '<<' operator has precedence over '|': + + fwd << expr_a | expr_b + + To correct this, use the '<<=' operator (preferred) or parentheses + to override operator precedence: + + fwd <<= expr_a | expr_b + or + fwd << (expr_a | expr_b) + +- Cleaned up default tracebacks when getting a `ParseException` when calling + `parseString`. Exception traces should now stop at the call in `parseString`, + and not include the internal traceback frames. (If the full traceback + is desired, then set `ParserElement`.verbose_traceback to True.) + +- Fixed `FutureWarnings` that sometimes are raised when '[' passed as a + character to Word. + +- New namespace, assert methods and classes added to support writing + unit tests. + - `assertParseResultsEquals` + - `assertParseAndCheckList` + - `assertParseAndCheckDict` + - `assertRunTestResults` + - `assertRaisesParseException` + - `reset_pyparsing_context` context manager, to restore pyparsing + config settings + +- Enhanced error messages and error locations when parsing fails on + the Keyword or `CaselessKeyword` classes due to the presence of a + preceding or trailing keyword character. Surfaced while + working with metaperl on issue #201. + +- Enhanced the Regex class to be compatible with re's compiled with the + re-equivalent regex module. Individual expressions can be built with + regex compiled expressions using: + + import pyparsing as pp + import regex + + # would use regex for this expression + integer_parser = pp.Regex(regex.compile(r'\d+')) + + Inspired by PR submitted by bjrnfrdnnd on GitHub, very nice! 
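+
+  A short sketch of the `setDefaultWhitespaceChars` fix described earlier in
+  this section (illustrative):
+
+      import pyparsing as pp
+
+      # stop treating newlines as skippable whitespace; with this fix, the
+      # change is applied to pyparsing's built-in expressions as well
+      pp.ParserElement.setDefaultWhitespaceChars(" \t")
+
+      word = pp.Word(pp.alphas)
+      # repetition now stops at the line break instead of skipping over it
+      print(word[...].parseString("abc def\nghi"))  # -> ['abc', 'def']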
+ +- Fixed handling of `ParseSyntaxExceptions` raised as part of Each + expressions, when sub-expressions contain '-' backtrack + suppression. As part of resolution to a question posted by John + Greene on StackOverflow. + +- Potentially *huge* performance enhancement when parsing Word + expressions built from pyparsing_unicode character sets. Word now + internally converts ranges of consecutive characters to regex + character ranges (converting "0123456789" to "0-9" for instance), + resulting in as much as 50X improvement in performance! Work + inspired by a question posted by Midnighter on StackOverflow. + +- Improvements in select_parser.py, to include new SQL syntax + from SQLite. PR submitted by Robert Coup, nice work! + +- Fixed bug in `PrecededBy` which caused infinite recursion, issue #127 + submitted by EdwardJB. + +- Fixed bug in `CloseMatch` where end location was incorrectly + computed; and updated partial_gene_match.py example. + +- Fixed bug in `indentedBlock` with a parser using two different + types of nested indented blocks with different indent values, + but sharing the same indent stack, submitted by renzbagaporo. + +- Fixed bug in Each when using Regex, when Regex expression would + get parsed twice; issue #183 submitted by scauligi, thanks! + +- `BigQueryViewParser.py` added to examples directory, PR submitted + by Michael Smedberg, nice work! + +- booleansearchparser.py added to examples directory, PR submitted + by xecgr. Builds on searchparser.py, adding support for '*' + wildcards and non-Western alphabets. + +- Fixed bug in delta_time.py example, when using a quantity + of seconds/minutes/hours/days > 999. + +- Fixed bug in regex definitions for real and sci_real expressions in + pyparsing_common. Issue #194, reported by Michael Wayne Goodman, thanks! + +- Fixed `FutureWarning` raised beginning in Python 3.7 for Regex expressions + containing '[' within a regex set. + +- Minor reformatting of output from `runTests` to make embedded + comments more visible. + +- And finally, many thanks to those who helped in the restructuring + of the pyparsing code base as part of this release. Pyparsing now + has more standard package structure, more standard unit tests, + and more standard code formatting (using black). Special thanks + to jdufresne, klahnakoski, mattcarmody, and ckeygusuz, to name just + a few. + + +Version 2.4.7 - April, 2020 +--------------------------- +- Backport of selected fixes from 3.0.0 work: + . Each bug with Regex expressions + . And expressions not properly constructing with generator + . Traceback abbreviation + . Bug in delta_time example + . Fix regexen in pyparsing_common.real and .sci_real + . Avoid FutureWarning on Python 3.7 or later + . Cleanup output in runTests if comments are embedded in test string + + +Version 2.4.6 - December, 2019 +------------------------------ +- Fixed typos in White mapping of whitespace characters, to use + correct "\u" prefix instead of "u\". + +- Fix bug in left-associative ternary operators defined using + infixNotation. First reported on StackOverflow by user Jeronimo. + +- Backport of pyparsing_test namespace from 3.0.0, including + TestParseResultsAsserts mixin class defining unittest-helper + methods: + . def assertParseResultsEquals( + self, result, expected_list=None, expected_dict=None, msg=None) + . def assertParseAndCheckList( + self, expr, test_string, expected_list, msg=None, verbose=True) + . def assertParseAndCheckDict( + self, expr, test_string, expected_dict, msg=None, verbose=True) + . 
def assertRunTestResults( + self, run_tests_report, expected_parse_results=None, msg=None) + . def assertRaisesParseException(self, exc_type=ParseException, msg=None) + + To use the methods in this mixin class, declare your unittest classes as: + + from pyparsing import pyparsing_test as ppt + class MyParserTest(ppt.TestParseResultsAsserts, unittest.TestCase): + ... + + +Version 2.4.5 - November, 2019 +------------------------------ +- NOTE: final release compatible with Python 2.x. + +- Fixed issue with reading README.rst as part of setup.py's + initialization of the project's long_description, with a + non-ASCII space character causing errors when installing from + source on platforms where UTF-8 is not the default encoding. + + +Version 2.4.4 - November, 2019 +-------------------------------- +- Unresolved symbol reference in 2.4.3 release was masked by stdout + buffering in unit tests, thanks for the prompt heads-up, Ned + Batchelder! + + +Version 2.4.3 - November, 2019 +------------------------------ +- Fixed a bug in ParserElement.__eq__ that would for some parsers + create a recursion error at parser definition time. Thanks to + Michael Clerx for the assist. (Addresses issue #123) + +- Fixed bug in indentedBlock where a block that ended at the end + of the input string could cause pyparsing to loop forever. Raised + as part of discussion on StackOverflow with geckos. + +- Backports from pyparsing 3.0.0: + . __diag__.enable_all_warnings() + . Fixed bug in PrecededBy which caused infinite recursion, issue #127 + . support for using regex-compiled RE to construct Regex expressions + + +Version 2.4.2 - July, 2019 +-------------------------- +- Updated the shorthand notation that has been added for repetition + expressions: expr[min, max], with '...' valid as a min or max value: + - expr[...] and expr[0, ...] are equivalent to ZeroOrMore(expr) - expr[1, ...] is equivalent to OneOrMore(expr) - expr[n, ...] or expr[n,] is equivalent to expr*n + ZeroOrMore(expr) @@ -32,13 +1214,91 @@ everyone! if more than n exprs exist in the input stream. If this behavior is desired, then write expr[..., n] + ~expr. + Better interpretation of [...] as ZeroOrMore raised by crowsonkb, + thanks for keeping me in line! + + If upgrading from 2.4.1 or 2.4.1.1 and you have used `expr[...]` + for `OneOrMore(expr)`, it must be updated to `expr[1, ...]`. + - The defaults on all the `__diag__` switches have been set to False, to avoid getting alarming warnings. To use these diagnostics, set - them to True after importing pyparsing. Example: + them to True after importing pyparsing. + + Example: import pyparsing as pp pp.__diag__.warn_multiple_tokens_in_named_alternation = True +- Fixed bug introduced by the use of __getitem__ for repetition, + overlooking Python's legacy implementation of iteration + by sequentially calling __getitem__ with increasing numbers until + getting an IndexError. Found during investigation of problem + reported by murlock, merci! + + +Version 2.4.2a1 - July, 2019 +---------------------------- +It turns out I got the meaning of `[...]` absolutely backwards, +so I've deleted 2.4.1 and am repushing this release as 2.4.2a1 +for people to give it a try before I can call it ready to go. + +The `expr[...]` notation was pushed out to be synonymous with +`OneOrMore(expr)`, but this is really counter to most Python +notations (and even other internal pyparsing notations as well). +It should have been defined to be equivalent to ZeroOrMore(expr). + +- Changed [...] 
to emit ZeroOrMore instead of OneOrMore. + +- Removed code that treats ParserElements like iterables. + +- Change all __diag__ switches to False. + + +Version 2.4.1.1 - July 24, 2019 +------------------------------- +This is a re-release of version 2.4.1 to restore the release history +in PyPI, since the 2.4.1 release was deleted. + +There are 3 known issues in this release, which are fixed in +the upcoming 2.4.2: + +- API change adding support for `expr[...]` - the original + code in 2.4.1 incorrectly implemented this as OneOrMore. + Code using this feature under this release should explicitly + use `expr[0, ...]` for ZeroOrMore and `expr[1, ...]` for + OneOrMore. In 2.4.2 you will be able to write `expr[...]` + equivalent to `ZeroOrMore(expr)`. + +- Bug if composing And, Or, MatchFirst, or Each expressions + using an expression. This only affects code which uses + explicit expression construction using the And, Or, etc. + classes instead of using overloaded operators '+', '^', and + so on. If constructing an And using a single expression, + you may get an error that "cannot multiply ParserElement by + 0 or (0, 0)" or a Python `IndexError`. Change code like + + cmd = Or(Word(alphas)) + + to + + cmd = Or([Word(alphas)]) + + (Note that this is not the recommended style for constructing + Or expressions.) + +- Some newly-added `__diag__` switches are enabled by default, + which may give rise to noisy user warnings for existing parsers. + You can disable them using: + + import pyparsing as pp + pp.__diag__.warn_multiple_tokens_in_named_alternation = False + pp.__diag__.warn_ungrouped_named_tokens_in_collection = False + pp.__diag__.warn_name_set_on_empty_Forward = False + pp.__diag__.warn_on_multiple_string_args_to_oneof = False + pp.__diag__.enable_debug_on_named_expressions = False + + In 2.4.2 these will all be set to False by default. + Version 2.4.1 - July, 2019 -------------------------- @@ -150,9 +1410,9 @@ Version 2.4.1 - July, 2019 - warn_ungrouped_named_tokens_in_collection - flag to enable warnings when a results name is defined on a containing expression with ungrouped subexpressions that also have results names (default=True) - - warn_name_set_on_empty_Forward - flag to enable warnings whan a Forward is defined + - warn_name_set_on_empty_Forward - flag to enable warnings when a Forward is defined with a results name, but has no contents defined (default=False) - - warn_on_multiple_string_args_to_oneof - flag to enable warnings whan oneOf is + - warn_on_multiple_string_args_to_oneof - flag to enable warnings when oneOf is incorrectly called with multiple str arguments (default=True) - enable_debug_on_named_expressions - flag to auto-enable debug on all subsequent calls to ParserElement.setName() (default=False) @@ -840,7 +2100,7 @@ Version 2.1.6 - August, 2016 repr form provides important information when debugging parse actions. -Verison 2.1.5 - June, 2016 +Version 2.1.5 - June, 2016 ------------------------------ - Added ParserElement.split() generator method, similar to re.split(). Includes optional arguments maxsplit (to limit the number of splits), @@ -1163,7 +2423,7 @@ Version 2.0.2 - April, 2014 - Added "pprint()" method to ParseResults, to simplify troubleshooting and prettified output. Now instead of importing the pprint module and then writing "pprint.pprint(result)", you can just write - "result.pprint()". This method also accepts addtional positional and + "result.pprint()". 
This method also accepts additional positional and keyword arguments (such as indent, width, etc.), which get passed through directly to the pprint method (see https://docs.python.org/2/library/pprint.html#pprint.pprint). @@ -1295,7 +2555,7 @@ Version 1.5.7 - November, 2012 - Fixed bug in srange when using '\x###' hex character codes. -- Addeed optional 'intExpr' argument to countedArray, so that you +- Added optional 'intExpr' argument to countedArray, so that you can define your own expression that will evaluate to an integer, to be used as the count for the following elements. Allows you to define a countedArray with the count given in hex, for example, @@ -1970,7 +3230,7 @@ Version 1.4.6 - April, 2007 programs, at some cost to performance (3-5%). Suggested by bca48150 on the pyparsing wiki, thanks! -- Enhanced the documentation describing the vagaries and idiosyncracies +- Enhanced the documentation describing the vagaries and idiosyncrasies of parsing strings with embedded tabs, and the impact on: . parse actions . scanString diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b19d7a9..a2da1a57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,46 +29,88 @@ If you have a question on using pyparsing, there are a number of resources avail and Python features. - [submit an issue](https://github.com/pyparsing/pyparsing/issues) - If you have a problem with pyparsing that looks - like an actual bug, or have an idea for a feature to add to pyaprsing please submit an issue on GitHub. Some + like an actual bug, or have an idea for a feature to add to pyparsing please submit an issue on GitHub. Some pyparsing behavior may be counter-intuitive, so try to review some of the other resources first, or some of the other open and closed issues. Or post your question on SO or reddit. But don't wait until you are desperate and frustrated - just ask! :) +## Submitting examples + +If you have an example you wish to submit, please follow these guidelines. + +- **License - Submitted example code must be available for distribution with the rest of pyparsing under the MIT + open source license.** + +- Please follow PEP8 name and coding guidelines, and use the black formatter + to auto-format code. + +- Examples should import pyparsing and the common namespace classes as: + + import pyparsing as pp + # if necessary + ppc = pp.pyparsing_common + ppu = pp.pyparsing_unicode + +- Submitted examples *must* be Python 3.6.8 or later compatible. (It is acceptable if examples use Python + features added after 3.6.) + +- Where possible use operators to create composite parse expressions: + + expr = expr_a + expr_b | expr_c + + instead of: + + expr = pp.MatchFirst([pp.And([expr_a, expr_b]), expr_c]) + + Exception: if using a generator to create an expression: + + import keyword + python_keywords = keyword.kwlist + any_keyword = pp.MatchFirst(pp.Keyword(kw) + for kw in python_keywords) + +- Learn [Common Pitfalls When Writing Parsers](https://github.com/pyparsing/pyparsing/wiki/Common-Pitfalls-When-Writing-Parsers) and + how to avoid them when developing new examples. + +- See additional notes under [Some Coding Points](#some-coding-points). ## Submitting changes If you are considering proposing updates to pyparsing, please bear in mind the following guidelines. 
-Please review [_The Zen of Pyparsing_ and _The Zen of Pyparsing -Development_](https://github.com/pyparsing/pyparsing/wiki/Zen) -article on the pyparsing wiki, to get a general feel for the historical and future approaches to pyparsing's +Please review [_The Zen of Pyparsing_ and _The Zen of Pyparsing +Development_](https://github.com/pyparsing/pyparsing/wiki/Zen) +article on the pyparsing wiki, to get a general feel for the historical and future approaches to pyparsing's design, and intended developer experience as an embedded DSL. +If you are using new Python features or changing usage of the Python stdlib, please check that they work as +intended on prior versions of Python (currently back to Python 3.6.8). + ## Some design points -- Minimize additions to the module namespace. Over time, pyparsing's namespace has acquired a *lot* of names. - New features have been encapsulated into namespace classes to try to hold back the name flooding when importing +- Minimize additions to the module namespace. Over time, pyparsing's namespace has acquired a *lot* of names. + New features have been encapsulated into namespace classes to try to hold back the name flooding when importing pyparsing. -- New operator overloads will need to show broad applicability. +- New operator overloads for ParserElement will need to show broad applicability, and should be related to + parser construction. - Performance tuning should focus on parse time performance. Optimizing parser definition performance is secondary. -- New external dependencies will require substantial justification, and if included, will need to be guarded for +- New external dependencies will require substantial justification, and if included, will need to be guarded for `ImportError`s raised if the external module is not installed. ## Some coding points These coding styles are encouraged whether submitting code for core pyparsing or for submitting an example. -- PEP8 - at this time, pyparsing is very non-compliant with many PEP8 guidelines, especially those regarding +- PEP8 - pyparsing has historically been very non-compliant with many PEP8 guidelines, especially those regarding name casing. I had just finished several years of Java and Smalltalk development, and camel case seemed to be the - future trend in coding styles. There are plans to convert these names to PEP8-conformant snake case, but this will - be done over several releases to provide a migration path for current pyparsing-dependent applications. See more - information at the [PEP8 wiki page](https://github.com/pyparsing/pyparsing/wiki/PEP-8-planning). - - If you wish to submit a new example, please follow PEP8 name and coding guidelines. Example code must be available - for distribution with the rest of pyparsing under the MIT open source license. + future trend in coding styles. As of version 3.0.0, pyparsing is moving over to PEP8 naming, while maintaining + compatibility with existing parser code by defining synonyms using the legacy names. These names will be + retained until a future release (probably 4.0), to provide a migration path for current pyparsing-dependent + applications - DO NOT MODIFY OR REMOVE THESE NAMES. + See more information at the [PEP8 wiki page](https://github.com/pyparsing/pyparsing/wiki/PEP-8-planning). - No backslashes for line continuations. 
Continuation lines for expressions in ()'s should start with the continuing operator: @@ -77,37 +119,17 @@ These coding styles are encouraged whether submitting code for core pyparsing or + some_other_long_thing + even_another_long_thing) -- Changes to core pyparsing must be compatible back to Py3.5 without conditionalizing. Later Py3 features may be +- Maximum line length is 120 characters. (Black will override this.) + +- Changes to core pyparsing must be compatible back to Py3.6 without conditionalizing. Later Py3 features may be used in examples by way of illustration. - str.format() statements should use named format arguments (unless this proves to be a slowdown at parse time). -- List, tuple, and dict literals should include a trailing comma after the last element, which reduces changeset +- List, tuple, and dict literals should include a trailing comma after the last element, which reduces changeset clutter when another element gets added to the end. -- Examples should import pyparsing and the common namespace classes as: - - import pyparsing as pp - # if necessary - ppc = pp.pyparsing_common - ppu = pp.pyparsing_unicode - -- Where possible use operators to create composite parse expressions: - - expr = expr_a + expr_b | expr_c - - instead of: - - expr = pp.MatchFirst([pp.And([expr_a, expr_b]), expr_c]) - - Exception: if using a generator to create an expression: - - import keyword - python_keywords = keyword.kwlist - any_keyword = pp.MatchFirst(pp.Keyword(kw) - for kw in python_keywords)) - -- Learn [The Classic Blunders](https://github.com/pyparsing/pyparsing/wiki/The-Classic-Blunders) and - how to avoid them when developing new examples. +- New features should be accompanied by updates to unitTests.py and a bullet in the CHANGES file. -- New features should be accompanied with updates to unitTests.py and a bullet in the CHANGES file. +- Do not modify pyparsing_archive.py. This file is kept as a reference artifact from when pyparsing was distributed + as a single source file. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index a13fe7f0..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,8 +0,0 @@ -include pyparsing.py -include HowToUsePyparsing.html pyparsingClassDiagram.* -include README.md CODE_OF_CONDUCT.md CHANGES LICENSE -include examples/*.py examples/Setup.ini examples/*.dfm examples/*.ics examples/*.html examples/*.h -recursive-include docs * -prune docs/_build/* -recursive-include test * -include simple_unit_tests.py unitTests.py diff --git a/MANIFEST.in_bdist b/MANIFEST.in_bdist deleted file mode 100644 index 57ce301c..00000000 --- a/MANIFEST.in_bdist +++ /dev/null @@ -1,6 +0,0 @@ -include pyparsing.py -include HowToUsePyparsing.html pyparsingClassDiagram.* -include README CHANGES LICENSE -include examples/*.py examples/Setup.ini examples/*.dfm examples/*.ics examples/*.html -include docs/* -include robots.txt diff --git a/README.rst b/README.rst index 0d702d75..17e36aa8 100644 --- a/README.rst +++ b/README.rst @@ -1,56 +1,89 @@ -PyParsing – A Python Parsing Module -=================================== +PyParsing -- A Python Parsing Module +==================================== -|Build Status| +|Version| |Build Status| |Coverage| |License| |Python Versions| Introduction ============ The pyparsing module is an alternative approach to creating and -executing simple grammars, vs. the traditional lex/yacc approach, or the +executing simple grammars, vs. the traditional lex/yacc approach, or the use of regular expressions. 
The pyparsing module provides a library of classes that client code uses to construct the grammar directly in Python code. -Here is a program to parse "Hello, World!" (or any greeting of the form -"salutation, addressee!"): +*[Since first writing this description of pyparsing in late 2003, this +technique for developing parsers has become more widespread, under the +name Parsing Expression Grammars - PEGs. See more information on PEGs* +`here `__ +*.]* + +Here is a program to parse ``"Hello, World!"`` (or any greeting of the form +``"salutation, addressee!"``): .. code:: python from pyparsing import Word, alphas - greet = Word( alphas ) + "," + Word( alphas ) + "!" + greet = Word(alphas) + "," + Word(alphas) + "!" hello = "Hello, World!" - print(hello, "->", greet.parseString( hello )) + print(hello, "->", greet.parseString(hello)) The program outputs the following:: Hello, World! -> ['Hello', ',', 'World', '!'] The Python representation of the grammar is quite readable, owing to the -self-explanatory class names, and the use of '+', '\|' and '^' operator +self-explanatory class names, and the use of '+', '|' and '^' operator definitions. -The parsed results returned from parseString() can be accessed as a +The parsed results returned from ``parseString()`` are a collection of type +``ParseResults``, which can be accessed as a nested list, a dictionary, or an object with named attributes. The pyparsing module handles some of the problems that are typically -vexing when writing text parsers: - extra or missing whitespace (the -above program will also handle "Hello,World!", "Hello , World !", etc.) -- quoted strings - embedded comments +vexing when writing text parsers: + +- extra or missing whitespace (the above program will also handle ``"Hello,World!"``, ``"Hello , World !"``, etc.) +- quoted strings +- embedded comments The examples directory includes a simple SQL parser, simple CORBA IDL parser, a config file parser, a chemical formula parser, and a four- function algebraic notation parser, among many others. +Documentation +============= + +There are many examples in the online docstrings of the classes +and methods in pyparsing. You can find them compiled into `online docs `__. Additional +documentation resources and project info are listed in the online +`GitHub wiki `__. An +entire directory of examples can be found `here `__. License ======= - MIT License. See header of pyparsing.py +MIT License. See header of the `pyparsing __init__.py `__ file. History ======= - See CHANGES file. +See `CHANGES `__ file. + +.. |Build Status| image:: https://github.com/pyparsing/pyparsing/actions/workflows/ci.yml/badge.svg + :target: https://github.com/pyparsing/pyparsing/actions/workflows/ci.yml + +.. |Coverage| image:: https://codecov.io/gh/pyparsing/pyparsing/branch/master/graph/badge.svg + :target: https://codecov.io/gh/pyparsing/pyparsing + +.. |Version| image:: https://img.shields.io/pypi/v/pyparsing?style=flat-square + :target: https://pypi.org/project/pyparsing/ + :alt: Version + +.. |License| image:: https://img.shields.io/pypi/l/pyparsing.svg?style=flat-square + :target: https://pypi.org/project/pyparsing/ + :alt: License -.. |Build Status| image:: https://travis-ci.org/pyparsing/pyparsing.svg?branch=master - :target: https://travis-ci.org/pyparsing/pyparsing +.. 
|Python Versions| image:: https://img.shields.io/pypi/pyversions/pyparsing.svg?style=flat-square + :target: https://pypi.org/project/pyparsing/ + :alt: Python versions diff --git a/docs/HowToUsePyparsing.rst b/docs/HowToUsePyparsing.rst index 3e9e1f87..4f4e6a87 100644 --- a/docs/HowToUsePyparsing.rst +++ b/docs/HowToUsePyparsing.rst @@ -3,12 +3,12 @@ Using the pyparsing module ========================== :author: Paul McGuire -:address: ptmcg@users.sourceforge.net +:address: ptmcg.pm+pyparsing@gmail.com -:revision: 2.0.1a -:date: July, 2013 (minor update August, 2018) +:revision: 3.1.0 +:date: April, 2023 -:copyright: Copyright |copy| 2003-2013 Paul McGuire. +:copyright: Copyright |copy| 2003-2023 Paul McGuire. .. |copy| unicode:: 0xA9 @@ -24,8 +24,30 @@ Using the pyparsing module .. contents:: :depth: 4 Note: While this content is still valid, there are more detailed -descriptions and examples at the online doc server at -https://pythonhosted.org/pyparsing/pyparsing-module.html +descriptions and extensive examples at the `online doc server +`_, and +in the online help for the various pyparsing classes and methods (viewable +using the Python interpreter's built-in ``help()`` function). You will also +find many example scripts in the `examples `_ +directory of the pyparsing GitHub repo. + +----------- + +**Note**: *In pyparsing 3.0, many method and function names which were +originally written using camelCase have been converted to PEP8-compatible +snake_case. So ``parseString()`` has been renamed to ``parse_string()``, +``delimitedList`` to DelimitedList_, and so on. You may see the old +names in legacy parsers, and they will be supported for a time with +synonyms, but the synonyms will be removed in a future release.* + +*If you are using this documentation, but working with a 2.4.x version of pyparsing, +you'll need to convert methods and arguments from the documented snake_case +names to the legacy camelCase names. In pyparsing 3.0.x, both forms are +supported, but the legacy forms are deprecated; they will be dropped in a +future release.* + +----------- + Steps to follow =============== @@ -33,37 +55,47 @@ Steps to follow To parse an incoming data string, the client code must follow these steps: 1. First define the tokens and patterns to be matched, and assign - this to a program variable. Optional results names or parsing + this to a program variable. Optional results names or parse actions can also be defined at this time. -2. Call ``parseString()`` or ``scanString()`` on this variable, passing in +2. Call ``parse_string()`` or ``scan_string()`` on this variable, passing in the string to be parsed. During the matching process, whitespace between tokens is skipped by default (although this can be changed). When token matches occur, any defined parse action methods are called. -3. Process the parsed results, returned as a list of strings. - Matching results may also be accessed as named attributes of +3. Process the parsed results, returned as a ParseResults_ object. + The ParseResults_ object can be accessed as if it were a list of + strings. Matching results may also be accessed as named attributes of the returned results, if names are defined in the definition of - the token pattern, using ``setResultsName()``. + the token pattern, using ``set_results_name()``. Hello, World! 
------------- -The following complete Python program will parse the greeting "Hello, World!", +The following complete Python program will parse the greeting ``"Hello, World!"``, or any other greeting of the form "<salutation>, <addressee>!":: - from pyparsing import Word, alphas + import pyparsing as pp - greet = Word(alphas) + "," + Word(alphas) + "!" - greeting = greet.parseString("Hello, World!") - print greeting + greet = pp.Word(pp.alphas) + "," + pp.Word(pp.alphas) + "!" + for greeting_str in [ + "Hello, World!", + "Bonjour, Monde!", + "Hola, Mundo!", + "Hallo, Welt!", + ]: + greeting = greet.parse_string(greeting_str) + print(greeting) The parsed tokens are returned in the following form:: ['Hello', ',', 'World', '!'] + ['Bonjour', ',', 'Monde', '!'] + ['Hola', ',', 'Mundo', '!'] + ['Hallo', ',', 'Welt', '!'] Usage notes @@ -78,13 +110,13 @@ Usage notes - To keep up the readability of your code, use operators_ such as ``+``, ``|``, ``^``, and ``~`` to combine expressions. You can also combine - string literals with ParseExpressions - they will be - automatically converted to Literal objects. For example:: + string literals with ``ParseExpressions`` - they will be + automatically converted to Literal_ objects. For example:: integer = Word(nums) # simple unsigned integer - variable = Word(alphas, max=1) # single letter variable, such as x, z, m, etc. - arithOp = Word("+-*/", max=1) # arithmetic operators - equation = variable + "=" + integer + arithOp + integer # will match "x=2+2", etc. + variable = Char(alphas) # single letter variable, such as x, z, m, etc. + arith_op = one_of("+ - * /") # arithmetic operators + equation = variable + "=" + integer + arith_op + integer # will match "x=2+2", etc. In the definition of ``equation``, the string ``"="`` will get added as a ``Literal("=")``, but in a more readable way. @@ -102,19 +134,21 @@ Usage notes Of course, it is quite simple to extend this example to support more elaborate expressions, with nesting with parentheses, floating point numbers, scientific notation, and named constants - (such as ``e`` or ``pi``). See ``fourFn.py``, included in the examples directory. + (such as ``e`` or ``pi``). See `fourFn.py `_, + and `simpleArith.py `_ + included in the examples directory. - To modify pyparsing's default whitespace skipping, you can use one or more of the following methods: - - use the static method ``ParserElement.setDefaultWhitespaceChars`` - to override the normal set of whitespace chars (' \t\n'). For instance + - use the static method ``ParserElement.set_default_whitespace_chars`` + to override the normal set of whitespace chars (``' \t\n'``). For instance when defining a grammar in which newlines are significant, you should - call ``ParserElement.setDefaultWhitespaceChars(' \t')`` to remove + call ``ParserElement.set_default_whitespace_chars(' \t')`` to remove newline from the set of skippable whitespace characters. Calling this method will affect all pyparsing expressions defined afterward. 
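+
+    As a quick sketch of this in action (this example is illustrative, and is
+    not part of the original docs)::
+
+        import pyparsing as pp
+
+        # newlines are now significant for all expressions defined afterward
+        pp.ParserElement.set_default_whitespace_chars(" \t")
+
+        EOL = pp.LineEnd().suppress()
+        line = pp.Group(pp.Word(pp.alphas)[1, ...] + EOL)
+        print(line[1, ...].parse_string("ab cd\nef gh ij\n"))
+        # -> [['ab', 'cd'], ['ef', 'gh', 'ij']]
+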
- - call ``leaveWhitespace()`` on individual expressions, to suppress the + - call ``leave_whitespace()`` on individual expressions, to suppress the skipping of whitespace before trying to match the expression - use ``Combine`` to require that successive expressions must be @@ -140,7 +174,7 @@ Usage notes - ``expr*3`` is equivalent to ``expr + expr + expr`` - - ``expr[2, 3]`` is equivalent to ``expr + expr + Optional(expr)`` + - ``expr[2, 3]`` is equivalent to ``expr + expr + Opt(expr)`` - ``expr[n, ...]`` or ``expr[n,]`` is equivalent to ``expr*n + ZeroOrMore(expr)`` (read as "at least n instances of expr") @@ -148,9 +182,9 @@ Usage notes - ``expr[... ,n]`` is equivalent to ``expr*(0, n)`` (read as "0 to n instances of expr") - - ``expr[...]`` is equivalent to ``ZeroOrMore(expr)`` + - ``expr[...]``, ``expr[0, ...]`` and ``expr * ...`` are equivalent to ``ZeroOrMore(expr)`` - - ``expr[0, ...]`` is equivalent to ``ZeroOrMore(expr)`` + - ``expr[1, ...]`` is equivalent to ``OneOrMore(expr)`` Note that ``expr[..., n]`` does not raise an exception if more than n exprs exist in the input stream; that is, @@ -158,23 +192,28 @@ Usage notes occurrences. If this behavior is desired, then write ``expr[..., n] + ~expr``. -- ``MatchFirst`` expressions are matched left-to-right, and the first +- ``[]`` notation will also accept a stop expression using ':' slice + notation: + + - ``expr[...:end_expr]`` is equivalent to ``ZeroOrMore(expr, stop_on=end_expr)`` + +- MatchFirst_ expressions are matched left-to-right, and the first match found will skip all later expressions within, so be sure to define less-specific patterns after more-specific patterns. - If you are not sure which expressions are most specific, use Or + If you are not sure which expressions are most specific, use Or_ expressions (defined using the ``^`` operator) - they will always match the longest expression, although they are more compute-intensive. -- ``Or`` expressions will evaluate all of the specified subexpressions +- Or_ expressions will evaluate all of the specified subexpressions to determine which is the "best" match, that is, which matches the longest string in the input data. In case of a tie, the - left-most expression in the ``Or`` list will win. + left-most expression in the Or_ list will win. - If parsing the contents of an entire file, pass it to the - ``parseFile`` method using:: + ``parse_file`` method using:: - expr.parseFile( sourceFile ) + expr.parse_file(source_file) - ``ParseExceptions`` will report the location where an expected token or expression failed to match. For example, if we tried to use our @@ -195,7 +234,7 @@ Usage notes - Punctuation may be significant for matching, but is rarely of much interest in the parsed results. Use the ``suppress()`` method to keep these tokens from cluttering up your returned lists of - tokens. For example, ``delimitedList()`` matches a succession of + tokens. For example, DelimitedList_ matches a succession of one or more expressions, separated by delimiters (commas by default), but only returns a list of the actual expressions - the delimiters are used for parsing, but are suppressed from the @@ -208,45 +247,48 @@ Usage notes expressions. It is much easier to access a token using its field name than using a positional index, especially if the expression contains optional elements. 
You can also shortcut - the ``setResultsName`` call:: + the ``set_results_name`` call:: - stats = "AVE:" + realNum.setResultsName("average") + \ - "MIN:" + realNum.setResultsName("min") + \ - "MAX:" + realNum.setResultsName("max") + stats = ("AVE:" + real_num.set_results_name("average") + + "MIN:" + real_num.set_results_name("min") + + "MAX:" + real_num.set_results_name("max")) - can now be written as this:: + can more simply and cleanly be written as this:: - stats = "AVE:" + realNum("average") + \ - "MIN:" + realNum("min") + \ - "MAX:" + realNum("max") + stats = ("AVE:" + real_num("average") + + "MIN:" + real_num("min") + + "MAX:" + real_num("max")) - Be careful when defining parse actions that modify global variables or - data structures (as in ``fourFn.py``), especially for low level tokens - or expressions that may occur within an ``And`` expression; an early element - of an ``And`` may match, but the overall expression may fail. + data structures (as in fourFn.py_), especially for low level tokens + or expressions that may occur within an And_ expression; an early element + of an And_ may match, but the overall expression may fail. Classes ======= +All the pyparsing classes can be found in this +`UML class diagram <_static/pyparsingClassDiagram_3.0.9.jpg>`_. + Classes in the pyparsing module ------------------------------- ``ParserElement`` - abstract base class for all pyparsing classes; methods for code to use are: -- ``parseString( sourceString, parseAll=False )`` - only called once, on the overall +- ``parse_string(source_string, parse_all=False)`` - only called once, on the overall matching pattern; returns a ParseResults_ object that makes the matched tokens available as a list, and optionally as a dictionary, - or as an object with named attributes; if parseAll is set to True, then - parseString will raise a ParseException if the grammar does not process + or as an object with named attributes; if ``parse_all`` is set to True, then + ``parse_string`` will raise a ParseException_ if the grammar does not process the complete input string. -- ``parseFile( sourceFile )`` - a convenience function, that accepts an +- ``parse_file(source_file)`` - a convenience function, that accepts an input file object or filename. The file contents are passed as a - string to ``parseString()``. ``parseFile`` also supports the ``parseAll`` argument. + string to ``parse_string()``. ``parse_file`` also supports the ``parse_all`` argument. -- ``scanString( sourceString )`` - generator function, used to find and +- ``scan_string(source_string)`` - generator function, used to find and extract matching text in the given source string; for each matched text, returns a tuple of: @@ -256,40 +298,86 @@ methods for code to use are: - end location in the given source string - ``scanString`` allows you to scan through the input source string for + ``scan_string`` allows you to scan through the input source string for random matches, instead of exhaustively defining the grammar for the entire - source text (as would be required with ``parseString``). + source text (as would be required with ``parse_string``). -- ``transformString( sourceString )`` - convenience wrapper function for - ``scanString``, to process the input source string, and replace matching +- ``transform_string(source_string)`` - convenience wrapper function for + ``scan_string``, to process the input source string, and replace matching text with the tokens returned from parse actions defined in the grammar - (see setParseAction_). 
+ (see set_parse_action_). -- ``searchString( sourceString )`` - another convenience wrapper function for - ``scanString``, returns a list of the matching tokens returned from each - call to ``scanString``. +- ``search_string(source_string)`` - another convenience wrapper function for + ``scan_string``, returns a list of the matching tokens returned from each + call to ``scan_string``. -- ``setName( name )`` - associate a short descriptive name for this +- ``set_name(name)`` - associate a short descriptive name for this element, useful in displaying exceptions and trace information -- ``setResultsName( string, listAllMatches=False )`` - name to be given +- ``run_tests(tests_string)`` - useful development and testing method on + expressions, to pass a multiline string of sample strings to test against + the expression. Comment lines (beginning with ``#``) can be inserted + and they will be included in the test output:: + + digits = Word(nums).set_name("numeric digits") + real_num = Combine(digits + '.' + digits) + real_num.run_tests("""\ + # valid number + 3.14159 + + # no integer part + .00001 + + # no decimal + 101 + + # no decimal value + 101. + """) + + will print:: + + # valid number + 3.14159 + ['3.14159'] + + # no integer part + .00001 + ^ + FAIL: Expected numeric digits, found '.' (at char 0), (line:1, col:1) + + # no decimal + 101 + ^ + FAIL: Expected ".", found end of text (at char 3), (line:1, col:4) + + # no decimal value + 101. + ^ + FAIL: Expected numeric digits, found end of text (at char 4), (line:1, col:5) + +.. _set_results_name: + +- ``set_results_name(string, list_all_matches=False)`` - name to be given to tokens matching the element; if multiple tokens within - a repetition group (such as ``ZeroOrMore`` or ``delimitedList``) the - default is to return only the last matching token - if listAllMatches + a repetition group (such as ZeroOrMore_ or DelimitedList_) the + default is to return only the last matching token - if ``list_all_matches`` is set to True, then a list of all the matching tokens is returned. - (New in 1.5.6 - a results name with a trailing '*' character will be - interpreted as setting listAllMatches to True.) + + ``expr.set_results_name("key")`` can also be written ``expr("key")`` + (a results name with a trailing '*' character will be + interpreted as setting ``list_all_matches`` to ``True``). + Note: - ``setResultsName`` returns a *copy* of the element so that a single + ``set_results_name`` returns a *copy* of the element so that a single basic element can be referenced multiple times and given different names within a complex grammar. -.. _setParseAction: +.. _set_parse_action: -- ``setParseAction( *fn )`` - specify one or more functions to call after successful - matching of the element; each function is defined as ``fn( s, - loc, toks )``, where: +- ``set_parse_action(*fn)`` - specify one or more functions to call after successful + matching of the element; each function is defined as ``fn(s, loc, toks)``, where: - ``s`` is the original parse string @@ -297,128 +385,188 @@ methods for code to use are: - ``toks`` is the list of the matched tokens, packaged as a ParseResults_ object - Multiple functions can be attached to a ParserElement by specifying multiple - arguments to setParseAction, or by calling setParseAction multiple times. 
+ Parse actions can have any of the following signatures:: + + fn(s: str, loc: int, tokens: ParseResults) + fn(loc: int, tokens: ParseResults) + fn(tokens: ParseResults) + fn() + + Multiple functions can be attached to a ``ParserElement`` by specifying multiple + arguments to ``set_parse_action``, or by calling ``add_parse_action``. Calls to ``set_parse_action`` + will replace any previously defined parse actions. ``set_parse_action(None)`` will clear + all previously defined parse actions. Each parse action function can return a modified ``toks`` list, to perform conversion, or string modifications. For brevity, ``fn`` may also be a lambda - here is an example of using a parse action to convert matched integer tokens from strings to integers:: - intNumber = Word(nums).setParseAction( lambda s,l,t: [ int(t[0]) ] ) - - If ``fn`` does not modify the ``toks`` list, it does not need to return - anything at all. - -- ``setBreak( breakFlag=True )`` - if breakFlag is True, calls pdb.set_break() + int_number = Word(nums).set_parse_action(lambda s, l, t: [int(t[0])]) + + If ``fn`` modifies the ``toks`` list in-place, it does not need to return + anything, and pyparsing will use the modified ``toks`` list. + + If ``set_parse_action`` is called with an argument of ``None``, then this clears all parse actions + attached to that expression. + + A nice short-cut for calling ``set_parse_action`` is to use it as a decorator:: + + identifier = Word(alphas, alphanums+"_") + + @identifier.set_parse_action + def resolve_identifier(results: ParseResults): + return variable_values.get(results[0]) + + (Posted by @MisterMiyagi in this SO answer: https://stackoverflow.com/a/63031959/165216) + +- ``add_parse_action`` - similar to ``set_parse_action``, but instead of replacing any + previously defined parse actions, will append the given action or actions to the + existing defined parse actions. + +- ``add_condition`` - a simplified form of ``add_parse_action`` if the purpose + of the parse action is to simply do some validation, and raise an exception + if the validation fails. Takes a method that takes the same arguments, + but simply returns ``True`` or ``False``. If ``False`` is returned, an exception will be + raised. 
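+
+   For example, here is a small illustrative sketch (not from the original text)
+   using ``add_condition`` to validate a numeric range::
+
+      year = Word(nums).set_parse_action(lambda t: int(t[0]))
+      # a failing condition raises a ParseException with the given message
+      year.add_condition(lambda t: 1900 <= t[0] <= 2100, message="year out of range")
+
+      print(year.parse_string("1999"))    # -> [1999]
+      year.parse_string("3000")           # raises ParseException: year out of range
+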
+ +- ``set_break(break_flag=True)`` - if ``break_flag`` is ``True``, calls ``pdb.set_break()`` as this expression is about to be parsed -- ``copy()`` - returns a copy of a ParserElement; can be used to use the same +- ``copy()`` - returns a copy of a ``ParserElement``; can be used to use the same parse expression in different places in a grammar, with different parse actions - attached to each + attached to each; a short-form ``expr()`` is equivalent to ``expr.copy()`` -- ``leaveWhitespace()`` - change default behavior of skipping +- ``leave_whitespace()`` - change default behavior of skipping whitespace before starting matching (mostly used internally to the pyparsing module, rarely used by client code) -- ``setWhitespaceChars( chars )`` - define the set of chars to be ignored - as whitespace before trying to match a specific ParserElement, in place of the +- ``set_whitespace_chars(chars)`` - define the set of chars to be ignored + as whitespace before trying to match a specific ``ParserElement``, in place of the default set of whitespace (space, tab, newline, and return) -- ``setDefaultWhitespaceChars( chars )`` - class-level method to override +- ``set_default_whitespace_chars(chars)`` - class-level method to override the default set of whitespace chars for all subsequently created ParserElements (including copies); useful when defining grammars that treat one or more of the default whitespace characters as significant (such as a line-sensitive grammar, to omit newline from the list of ignorable whitespace) - ``suppress()`` - convenience function to suppress the output of the - given element, instead of wrapping it with a Suppress object. + given element, instead of wrapping it with a ``Suppress`` object. -- ``ignore( expr )`` - function to specify parse expression to be +- ``ignore(expr)`` - function to specify parse expression to be ignored while matching defined patterns; can be called repeatedly to specify multiple expressions; useful to specify patterns of comment syntax, for example -- ``setDebug( dbgFlag=True )`` - function to enable/disable tracing output +- ``set_debug(flag=True)`` - function to enable/disable tracing output when trying to match this element - ``validate()`` - function to verify that the defined grammar does not - contain infinitely recursive constructs + contain infinitely recursive constructs (``validate()`` is deprecated, and + will be removed in a future pyparsing release. Pyparsing now supports + left-recursive parsers, which this function attempted to catch.) -.. _parseWithTabs: +.. _parse_with_tabs: -- ``parseWithTabs()`` - function to override default behavior of converting +- ``parse_with_tabs()`` - function to override default behavior of converting tabs to spaces before parsing the input string; rarely used, except when specifying whitespace-significant grammars using the White_ class. -- ``enablePackrat()`` - a class-level static method to enable a memoizing +- ``enable_packrat()`` - a class-level static method to enable a memoizing performance enhancement, known as "packrat parsing". packrat parsing is disabled by default, since it may conflict with some user programs that use parse actions. To activate the packrat feature, your - program must call the class method ParserElement.enablePackrat(). For best - results, call enablePackrat() immediately after importing pyparsing. + program must call the class method ``ParserElement.enable_packrat()``. For best + results, call ``enable_packrat()`` immediately after importing pyparsing. 
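+
+   A minimal usage sketch (illustrative)::
+
+      import pyparsing as pp
+
+      # enable once, before defining the grammar, so that all
+      # expressions share the memoizing cache
+      pp.ParserElement.enable_packrat()
+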
+- ``enable_left_recursion()`` - a class-level static method to enable + pyparsing with left-recursive (LR) parsers. Similar to ``ParserElement.enable_packrat()``, + your program must call the class method ``ParserElement.enable_left_recursion()`` to + enable this feature. ``enable_left_recursion()`` uses a separate packrat cache, and so + is incompatible with ``enable_packrat()``. Basic ParserElement subclasses ------------------------------ +.. _Literal: + - ``Literal`` - construct with a string to be matched exactly +.. _CaselessLiteral: + - ``CaselessLiteral`` - construct with a string to be matched, but without case checking; results are always returned as the defining literal, NOT as they are found in the input string -- ``Keyword`` - similar to Literal, but must be immediately followed by +.. _Keyword: + +- ``Keyword`` - similar to Literal_, but must be immediately followed by whitespace, punctuation, or other non-keyword characters; prevents accidental matching of a non-keyword that happens to begin with a defined keyword -- ``CaselessKeyword`` - similar to Keyword, but with caseless matching - behavior +- ``CaselessKeyword`` - similar to Keyword_, but with caseless matching + behavior as described in CaselessLiteral_. .. _Word: - ``Word`` - one or more contiguous characters; construct with a string containing the set of allowed initial characters, and an optional second string of allowed body characters; for instance, - a common Word construct is to match a code identifier - in C, a + a common ``Word`` construct is to match a code identifier - in C, a valid identifier must start with an alphabetic character or an underscore ('_'), followed by a body that can also include numeric digits. That is, ``a``, ``i``, ``MAX_LENGTH``, ``_a1``, ``b_109_``, and ``plan9FromOuterSpace`` are all valid identifiers; ``9b7z``, ``$a``, ``.section``, and ``0debug`` are not. To - define an identifier using a Word, use either of the following:: + define an identifier using a ``Word``, use either of the following:: + + Word(alphas+"_", alphanums+"_") + Word(srange("[a-zA-Z_]"), srange("[a-zA-Z0-9_]")) + + Pyparsing also provides pre-defined strings ``identchars`` and + ``identbodychars`` so that you can also write:: - - Word( alphas+"_", alphanums+"_" ) - - Word( srange("[a-zA-Z_]"), srange("[a-zA-Z0-9_]") ) + Word(identchars, identbodychars) If only one string given, it specifies that the same character set defined for the initial character is used for the word body; for instance, to define an identifier that can only be composed of capital letters and - underscores, use:: + underscores, use one of:: - - Word( "ABCDEFGHIJKLMNOPQRSTUVWXYZ_" ) - - Word( srange("[A-Z_]") ) + ``Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ_")`` + ``Word(srange("[A-Z_]"))`` - A Word may + A ``Word`` may also be constructed with any of the following optional parameters: - ``min`` - indicating a minimum length of matching characters - ``max`` - indicating a maximum length of matching characters - - ``exact`` - indicating an exact length of matching characters + - ``exact`` - indicating an exact length of matching characters; + if ``exact`` is specified, it will override any values for ``min`` or ``max`` + + - ``as_keyword`` - indicating that preceding and following characters must + be whitespace or non-keyword characters + + - ``exclude_chars`` - a string of characters that should be excluded from + init_chars and body_chars - If ``exact`` is specified, it will override any values for ``min`` or ``max``. 
+ Sometimes you want to define a word using all + characters in a range except for one or two of them; you can do this + with the ``exclude_chars`` argument. This is helpful if you want to define + a word with all ``printables`` except for a single delimiter character, such + as '.'. Previously, you would have to create a custom string to pass to Word. + With this change, you can just create ``Word(printables, exclude_chars='.')``. - New in 1.5.6 - Sometimes you want to define a word using all - characters in a range except for one or two of them; you can do this - with the new ``excludeChars`` argument. This is helpful if you want to define - a word with all printables except for a single delimiter character, such - as '.'. Previously, you would have to create a custom string to pass to Word. - With this change, you can just create ``Word(printables, excludeChars='.')``. +- ``Char`` - a convenience form of ``Word`` that will match just a single character from + a string of matching characters:: + + single_digit = Char(nums) - ``CharsNotIn`` - similar to Word_, but matches characters not in the given constructor string (accepts only one string for both @@ -427,26 +575,28 @@ Basic ParserElement subclasses - ``Regex`` - a powerful construct, that accepts a regular expression to be matched at the current parse position; accepts an optional - ``flags`` parameter, corresponding to the flags parameter in the re.compile + ``flags`` parameter, corresponding to the flags parameter in the ``re.compile`` method; if the expression includes named sub-fields, they will be - represented in the returned ParseResults_ + represented in the returned ParseResults_. - ``QuotedString`` - supports the definition of custom quoted string - formats, in addition to pyparsing's built-in ``dblQuotedString`` and - ``sglQuotedString``. ``QuotedString`` allows you to specify the following + formats, in addition to pyparsing's built-in ``dbl_quoted_string`` and + ``sgl_quoted_string``. ``QuotedString`` allows you to specify the following parameters: - - ``quoteChar`` - string of one or more characters defining the quote delimiting string + - ``quote_char`` - string of one or more characters defining the quote delimiting string - - ``escChar`` - character to escape quotes, typically backslash (default=None) + - ``esc_char`` - character to escape quotes, typically backslash (default=None) - - ``escQuote`` - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) + - ``esc_quote`` - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) - ``multiline`` - boolean indicating whether quotes can span multiple lines (default=False) - - ``unquoteResults`` - boolean indicating whether the matched text should be unquoted (default=True) + - ``unquote_results`` - boolean indicating whether the matched text should be unquoted (default=True) + + - ``end_quote_char`` - string of one or more characters defining the end of the quote delimited string (default=None => same as ``quote_char``) - - ``endQuoteChar`` - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) +.. 
_SkipTo: - ``SkipTo`` - skips ahead in the input string, accepting any characters up to the specified pattern; may be constructed with @@ -458,8 +608,16 @@ Basic ParserElement subclasses - ``ignore`` - allows the user to specify patterns to not be matched, to prevent false matches - - ``failOn`` - if a literal string or expression is given for this argument, it defines an expression that - should cause the ``SkipTo`` expression to fail, and not skip over that expression + - ``fail_on`` - if a literal string or expression is given for this argument, it defines an expression that + should cause the SkipTo_ expression to fail, and not skip over that expression + + ``SkipTo`` can also be written using ``...``:: + + LBRACE, RBRACE = map(Literal, "{}") + + brace_expr = LBRACE + SkipTo(RBRACE) + RBRACE + # can also be written as + brace_expr = LBRACE + ... + RBRACE .. _White: @@ -468,72 +626,106 @@ Basic ParserElement subclasses ignored by pyparsing. However, some grammars are whitespace-sensitive, such as those that use leading tabs or spaces to indicating grouping or hierarchy. (If matching on tab characters, be sure to call - parseWithTabs_ on the top-level parse element.) + parse_with_tabs_ on the top-level parse element.) - ``Empty`` - a null expression, requiring no characters - will always match; useful for debugging and for specialized grammars -- ``NoMatch`` - opposite of Empty, will never match; useful for debugging +- ``NoMatch`` - opposite of ``Empty``, will never match; useful for debugging and for specialized grammars Expression subclasses --------------------- -- ``And`` - construct with a list of ParserElements, all of which must - match for And to match; can also be created using the '+' - operator; multiple expressions can be Anded together using the '*' +.. _And: + +- ``And`` - construct with a list of ``ParserElements``, all of which must + match for ``And`` to match; can also be created using the '+' + operator; multiple expressions can be ``Anded`` together using the '*' operator as in:: - ipAddress = Word(nums) + ('.'+Word(nums))*3 + ip_address = Word(nums) + ('.' + Word(nums)) * 3 A tuple can be used as the multiplier, indicating a min/max:: - usPhoneNumber = Word(nums) + ('-'+Word(nums))*(1,2) + us_phone_number = Word(nums) + ('-' + Word(nums)) * (1,2) A special form of ``And`` is created if the '-' operator is used - instead of the '+' operator. In the ipAddress example above, if - no trailing '.' and Word(nums) are found after matching the initial - Word(nums), then pyparsing will back up in the grammar and try other - alternatives to ipAddress. However, if ipAddress is defined as:: + instead of the '+' operator. In the ``ip_address`` example above, if + no trailing '.' and ``Word(nums)`` are found after matching the initial + ``Word(nums)``, then pyparsing will back up in the grammar and try other + alternatives to ``ip_address``. However, if ``ip_address`` is defined as:: - strictIpAddress = Word(nums) - ('.'+Word(nums))*3 + strict_ip_address = Word(nums) - ('.'+Word(nums))*3 - then no backing up is done. If the first Word(nums) of strictIpAddress - is matched, then any mismatch after that will raise a ParseSyntaxException, + then no backing up is done. If the first ``Word(nums)`` of ``strict_ip_address`` + is matched, then any mismatch after that will raise a ``ParseSyntaxException``, which will halt the parsing process immediately. 
By careful use of the '-' operator, grammars can provide meaningful error messages close to the location where the incoming text does not match the specified grammar. -- ``Or`` - construct with a list of ParserElements, any of which must - match for Or to match; if more than one expression matches, the +.. _Or: + +- ``Or`` - construct with a list of ``ParserElements``, any of which must + match for ``Or`` to match; if more than one expression matches, the expression that makes the longest match will be used; can also be created using the '^' operator -- ``MatchFirst`` - construct with a list of ParserElements, any of - which must match for MatchFirst to match; matching is done +.. _MatchFirst: + +- ``MatchFirst`` - construct with a list of ``ParserElements``, any of + which must match for ``MatchFirst`` to match; matching is done left-to-right, taking the first expression that matches; can also be created using the '|' operator -- ``Each`` - similar to And, in that all of the provided expressions - must match; however, Each permits matching to be done in any order; +.. _Each: + +- ``Each`` - similar to And_, in that all of the provided expressions + must match; however, ``Each`` permits matching to be done in any order; can also be created using the '&' operator -- ``Optional`` - construct with a ParserElement, but this element is +- ``Opt`` - construct with a ``ParserElement``, but this element is not required to match; can be constructed with an optional ``default`` argument, containing a default string or object to be supplied if the given optional parse element is not found in the input string; parse action will only - be called if a match is found, or if a default is specified + be called if a match is found, or if a default is specified. + + (``Opt`` was formerly named ``Optional``, but since the standard Python + library module ``typing`` now defines ``Optional``, the pyparsing class has + been renamed to ``Opt``. A compatibility synonym ``Optional`` is defined, + but will be removed in a future release.) + +.. _ZeroOrMore: -- ``ZeroOrMore`` - similar to Optional, but can be repeated +- ``ZeroOrMore`` - similar to ``Opt``, but can be repeated; ``ZeroOrMore(expr)`` + can also be written as ``expr[...]``. -- ``OneOrMore`` - similar to ZeroOrMore, but at least one match must - be present +.. _OneOrMore: + +- ``OneOrMore`` - similar to ZeroOrMore_, but at least one match must + be present; ``OneOrMore(expr)`` can also be written as ``expr[1, ...]``. + +.. _DelimitedList: + +- ``DelimitedList`` - used for + matching one or more occurrences of ``expr``, separated by ``delim``. + By default, the delimiters are suppressed, so the returned results contain + only the separate list elements. Can optionally specify ``combine=True``, + indicating that the expressions and delimiters should be returned as one + combined value (useful for scoped variables, such as ``"a.b.c"``, or + ``"a::b::c"``, or paths such as ``"a/b/c"``). Can also optionally specify ``min`` and ``max`` + restrictions on the length of the list, and + ``allow_trailing_delim`` to accept a trailing delimiter at the end of the list. + +.. _FollowedBy: - ``FollowedBy`` - a lookahead expression, requires matching of the given expressions, but does not advance the parsing position within the input string +.. 
_NotAny: + - ``NotAny`` - a negative lookahead expression, prevents matching of named expressions, does not advance the parsing position within the input string; can also be created using the unary '~' operator @@ -544,31 +736,38 @@ Expression subclasses Expression operators -------------------- -- ``~`` - creates NotAny using the expression after the operator +- ``+`` - creates And_ using the expressions before and after the operator -- ``+`` - creates And using the expressions before and after the operator +- ``|`` - creates MatchFirst_ (first left-to-right match) using the expressions before and after the operator -- ``|`` - creates MatchFirst (first left-to-right match) using the expressions before and after the operator +- ``^`` - creates Or_ (longest match) using the expressions before and after the operator -- ``^`` - creates Or (longest match) using the expressions before and after the operator +- ``&`` - creates Each_ using the expressions before and after the operator -- ``&`` - creates Each using the expressions before and after the operator - -- ``*`` - creates And by multiplying the expression by the integer operand; if - expression is multiplied by a 2-tuple, creates an And of (min,max) - expressions (similar to "{min,max}" form in regular expressions); if - min is None, intepret as (0,max); if max is None, interpret as - expr*min + ZeroOrMore(expr) +- ``*`` - creates And_ by multiplying the expression by the integer operand; if + expression is multiplied by a 2-tuple, creates an And_ of ``(min,max)`` + expressions (similar to ``{min,max}`` form in regular expressions); if + ``min`` is ``None``, interpret as ``(0,max)``; if ``max`` is ``None``, interpret as + ``expr*min + ZeroOrMore(expr)`` - ``-`` - like ``+`` but with no backup and retry of alternatives -- ``*`` - repetition of expression +- ``~`` - creates NotAny_ using the expression after the operator -- ``==`` - matching expression to string; returns True if the string matches the given expression +- ``==`` - matching expression to string; returns ``True`` if the string matches the given expression - ``<<=`` - inserts the expression following the operator as the body of the - Forward expression before the operator + ``Forward`` expression before the operator (``<<`` can also be used, but ``<<=`` is preferred + to avoid operator precedence misinterpretation of the pyparsing expression) + +- ``...`` - inserts a SkipTo_ expression leading to the next expression, as in + ``Keyword("start") + ... + Keyword("end")``. +- ``[min, max]`` - specifies repetition similar to ``*`` with ``min`` and ``max`` specified + as the minimum and maximum number of repetitions. ``...`` can be used in place of ``None``. + For example ``expr[...]`` is equivalent to ``ZeroOrMore(expr)``, ``expr[1, ...]`` is + equivalent to ``OneOrMore(expr)``, and ``expr[..., 3]`` is equivalent to "up to 3 instances + of ``expr``". 
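+
+Here is a short sketch (added for illustration; the names are invented) that
+combines several of these operators::
+
+    import pyparsing as pp
+
+    key = pp.Word(pp.alphas)                         # token
+    value = pp.Word(pp.nums) | pp.QuotedString('"')  # '|' creates MatchFirst
+    pair = key + pp.Suppress("=") + value            # '+' creates And
+    config = pair[1, ...]                            # '[1, ...]' same as OneOrMore(pair)
+
+    print(config.parse_string('a=1 b="two" c=3'))
+    # -> ['a', '1', 'b', 'two', 'c', '3']
+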
Positional subclasses @@ -592,7 +791,7 @@ Converter subclasses -------------------- - ``Combine`` - joins all matched tokens into a single string, using - specified joinString (default ``joinString=""``); expects + specified ``join_string`` (default ``join_string=""``); expects all matching tokens to be adjacent, with no intervening whitespace (can be overridden by specifying ``adjacent=False`` in constructor) @@ -605,21 +804,17 @@ Special subclasses ------------------ - ``Group`` - causes the matched tokens to be enclosed in a list; - useful in repeated elements like ``ZeroOrMore`` and ``OneOrMore`` to + useful in repeated elements like ZeroOrMore_ and OneOrMore_ to break up matched tokens into groups for each repeated pattern - ``Dict`` - like ``Group``, but also constructs a dictionary, using the - [0]'th elements of all enclosed token lists as the keys, and + ``[0]``'th elements of all enclosed token lists as the keys, and each token list as the value -- ``SkipTo`` - catch-all matching expression that accepts all characters - up until the given pattern is found to match; useful for specifying - incomplete grammars - - ``Forward`` - placeholder token used to define recursive token patterns; when defining the actual expression later in the - program, insert it into the ``Forward`` object using the ``<<`` - operator (see ``fourFn.py`` for an example). + program, insert it into the ``Forward`` object using the ``<<=`` + operator (see fourFn.py_ for an example). Other classes @@ -628,31 +823,47 @@ Other classes - ``ParseResults`` - class used to contain and manage the lists of tokens created from parsing the input using the user-defined parse - expression. ParseResults can be accessed in a number of ways: + expression. ``ParseResults`` can be accessed in a number of ways: - as a list - - total list of elements can be found using len() + - total list of elements can be found using ``len()`` - - individual elements can be found using [0], [1], [-1], etc. + - individual elements can be found using ``[0], [1], [-1],`` etc., + or retrieved using slices - elements can be deleted using ``del`` - - the -1th element can be extracted and removed in a single operation + - the ``-1``th element can be extracted and removed in a single operation using ``pop()``, or any element can be extracted and removed using ``pop(n)`` + - a nested ParseResults_ can be created by using the pyparsing ``Group`` class + around elements in an expression:: + + Word(alphas) + Group(Word(nums)[...]) + Word(alphas) + + will parse the string "abc 100 200 300 end" as:: + + ['abc', ['100', '200', '300'], 'end'] + + If the ``Group`` is constructed using ``aslist=True``, the resulting tokens + will be a Python list instead of a ParseResults_. In this case, the returned value will + no longer support the extended features or methods of a ParseResults_. + - as a dictionary - - if ``setResultsName()`` is used to name elements within the + - if ``set_results_name()`` is used to name elements within the overall parse expression, then these fields can be referenced as dictionary elements or as attributes - - the Dict class generates dictionary entries using the data of the - input text - in addition to ParseResults listed as ``[ [ a1, b1, c1, ...], [ a2, b2, c2, ...] ]`` + - the ``Dict`` class generates dictionary entries using the data of the + input text - in addition to ParseResults_ listed as ``[ [ a1, b1, c1, ...], [ a2, b2, c2, ...] ]`` it also acts as a dictionary with entries defined as ``{ a1 : [ b1, c1, ... 
] }, { a2 : [ b2, c2, ... ] }``; this is especially useful when processing
   tabular data where the first column contains a key
-  value for that line of data
+  value for that line of data; when constructed with ``asdict=True``, will
+  return an actual Python ``dict`` instead of a ParseResults_. In this case, the returned value will
+  no longer support the extended features or methods of a ParseResults_.

  - list elements that are deleted using ``del`` will still be accessible by their
    dictionary keys
@@ -660,11 +871,12 @@ Other classes
  - supports ``get()``, ``items()`` and ``keys()`` methods, similar to a dictionary

  - a keyed item can be extracted and removed using ``pop(key)``. Here
-   key must be non-numeric (such as a string), in order to use dict
+   ``key`` must be non-numeric (such as a string), in order to use dict
    extraction instead of list extraction.

  - new named elements can be added (in a parse action, for instance), using the same
-   syntax as adding an item to a dict (``parseResults["X"]="new item"``); named elements can be removed using ``del parseResults["X"]``
+   syntax as adding an item to a dict (``parse_results["X"] = "new item"``);
+   named elements can be removed using ``del parse_results["X"]``

- as a nested list
@@ -672,16 +884,50 @@ Other classes
    own list structure, so that the tokens can be handled as a hierarchical tree

- ParseResults can also be converted to an ordinary list of strings
- by calling ``asList()``. Note that this will strip the results of any
+- as an object
+
+  - named elements can be accessed as if they were attributes of an object:
+    if an element is referenced that does not exist, it will return ``""``.
+
+ ParseResults_ can also be converted to an ordinary list of strings
+ by calling ``as_list()``. Note that this will strip the results of any
  field names that have been defined for any embedded parse elements.
  (The ``pprint`` module is especially good at printing out the nested contents
- given by ``asList()``.)
+ given by ``as_list()``.)
+
+ If a ParseResults_ is built with expressions that use results names (see ``set_results_name()``) or
+ using the ``Dict`` class, then those names and values can be extracted as a Python
+ dict using ``as_dict()``.

- Finally, ParseResults can be viewed by calling ``dump()``. ``dump()` will first show
- the ``asList()`` output, followed by an indented structure listing parsed tokens that
+ Finally, ParseResults_ can be viewed by calling ``dump()``. ``dump()`` will first show
+ the ``as_list()`` output, followed by an indented structure listing parsed tokens that
  have been assigned results names.
+
+ Here is sample code illustrating some of these methods::
+
+     >>> number = Word(nums)
+     >>> name = Combine(Word(alphas)[...], adjacent=False, join_string=" ")
+     >>> parser = number("house_number") + name("street_name")
+     >>> result = parser.parse_string("123 Main St")
+     >>> print(result)
+     ['123', 'Main St']
+     >>> print(type(result))
+     <class 'pyparsing.results.ParseResults'>
+     >>> print(repr(result))
+     (['123', 'Main St'], {'house_number': ['123'], 'street_name': ['Main St']})
+     >>> result.house_number
+     '123'
+     >>> result["street_name"]
+     'Main St'
+     >>> result.as_list()
+     ['123', 'Main St']
+     >>> result.as_dict()
+     {'house_number': '123', 'street_name': 'Main St'}
+     >>> print(result.dump())
+     ['123', 'Main St']
+     - house_number: '123'
+     - street_name: 'Main St'
+

Exception classes and Troubleshooting
-------------------------------------
@@ -689,20 +935,25 @@ Exception classes and Troubleshooting
.. _ParseException:

- ``ParseException`` - exception returned when a grammar parse fails;
-  ParseExceptions have attributes loc, msg, line, lineno, and column; to view the
+  ``ParseExceptions`` have attributes ``loc``, ``msg``, ``line``, ``lineno``, and ``column``; to view the
   text line and location where the reported ParseException occurs, use::

-    except ParseException, err:
-        print err.line
-        print " "*(err.column-1) + "^"
-        print err
+    except ParseException as err:
+        print(err.line)
+        print(" " * (err.column - 1) + "^")
+        print(err)
+
+  ``ParseExceptions`` also have an ``explain()`` method that gives this same information::
+
+    except ParseException as err:
+        print(err.explain())

- ``RecursiveGrammarException`` - exception returned by ``validate()`` if the grammar contains a
  recursive infinite loop, such as::

-    badGrammar = Forward()
-    goodToken = Literal("A")
-    badGrammar <<= Optional(goodToken) + badGrammar
+    bad_grammar = Forward()
+    good_token = Literal("A")
+    bad_grammar <<= Opt(good_token) + bad_grammar

- ``ParseFatalException`` - exception that parse actions can raise to stop parsing
  immediately. Should be used when a semantic error is found in the input text, such
@@ -710,11 +961,87 @@ Exception classes and Troubleshooting

- ``ParseSyntaxException`` - subclass of ``ParseFatalException`` raised when a
  syntax error is found, based on the use of the '-' operator when defining
-  a sequence of expressions in an ``And`` expression.
+  a sequence of expressions in an And_ expression.
+
+- You can also get some insights into the parsing logic using diagnostic parse actions,
+  and ``set_debug()``, or test the matching of expression fragments by testing them using
+  ``search_string()`` or ``scan_string()``.
+
+- Use ``with_line_numbers`` from ``pyparsing_testing`` to display the input string
+  being parsed, with line and column numbers that correspond to the values reported
+  in ``set_debug()`` output::
+
+    import pyparsing as pp
+    ppt = pp.testing
+
+    data = """\
+       A
+          100"""
+
+    expr = pp.Word(pp.alphanums).set_name("word").set_debug()
+    print(ppt.with_line_numbers(data))
+    expr[...].parse_string(data)
+
+  prints::
+
+    .        1
+      1234567890
+    1:   A|
+    2:      100|
+
+    Match word at loc 3(1,4)
+       A
+       ^
+    Matched word -> ['A']
+    Match word at loc 11(2,7)
+          100
+          ^
+    Matched word -> ['100']
+
+  ``with_line_numbers`` has several options for displaying control characters, end-of-line
+  and space markers, Unicode symbols for control characters - these are documented in the
+  function's docstring.
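+
+- As a minimal end-to-end sketch of these troubleshooting aids (the grammar here is
+  arbitrary), a failed parse can be caught and reported with ``explain()``::
+
+      import pyparsing as pp
+
+      parser = pp.Word(pp.alphas) + pp.Word(pp.nums)
+
+      try:
+          parser.parse_string("abc def")   # fails: "def" is not numeric
+      except pp.ParseException as err:
+          print(err.explain())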
+
+- Diagnostics can be enabled using ``pyparsing.enable_diag`` and passing
+  one of the following enum values defined in ``pyparsing.Diagnostics``:
+
+  - ``warn_multiple_tokens_in_named_alternation`` - flag to enable warnings when a results
+    name is defined on a MatchFirst_ or Or_ expression with one or more And_ subexpressions
+
+  - ``warn_ungrouped_named_tokens_in_collection`` - flag to enable warnings when a results
+    name is defined on a containing expression with ungrouped subexpressions that also
+    have results names
+
+  - ``warn_name_set_on_empty_Forward`` - flag to enable warnings when a ``Forward`` is defined
+    with a results name, but has no contents defined
+
+  - ``warn_on_parse_using_empty_Forward`` - flag to enable warnings when a ``Forward`` is
+    defined in a grammar but has never had an expression attached to it
+
+  - ``warn_on_assignment_to_Forward`` - flag to enable warnings when a ``Forward`` is defined
+    but is overwritten by assigning using ``'='`` instead of ``'<<='`` or ``'<<'``
+
+  - ``warn_on_multiple_string_args_to_oneof`` - flag to enable warnings when ``one_of`` is
+    incorrectly called with multiple str arguments
+
+  - ``enable_debug_on_named_expressions`` - flag to auto-enable debug on all subsequent
+    calls to ``ParserElement.set_name``
+
+  All warnings can be enabled by calling ``pyparsing.enable_all_warnings()``.
+  Sample::
+
+      import pyparsing as pp
+      pp.enable_all_warnings()
+
+      fwd = pp.Forward().set_results_name("recursive_expr")
+
+      >>> UserWarning: warn_name_set_on_empty_Forward: setting results name 'recursive_expr'
+      on Forward expression that has no contained expression
-You can also get some insights into the parsing logic using diagnostic parse actions,
-and setDebug(), or test the matching of expression fragments by testing them using
-scanString().
+
+  Warnings can also be enabled using the Python ``-W`` switch (using ``-Wd`` or
+  ``-Wd:::pyparsing``) or setting a non-empty value to the environment variable
+  ``PYPARSINGENABLEALLWARNINGS``. (If using ``-Wd`` for testing, but wishing to
+  disable pyparsing warnings, add ``-Wi:::pyparsing``.)

Miscellaneous attributes and methods
@@ -723,80 +1050,109 @@ Miscellaneous attributes and methods
Helper methods
--------------

-- ``delimitedList( expr, delim=',')`` - convenience function for
-  matching one or more occurrences of expr, separated by delim.
-  By default, the delimiters are suppressed, so the returned results contain
-  only the separate list elements. Can optionally specify ``combine=True``,
-  indicating that the expressions and delimiters should be returned as one
-  combined value (useful for scoped variables, such as ``"a.b.c"``, or
-  ``"a::b::c"``, or paths such as ``"a/b/c"``).
-
-- ``countedArray( expr )`` - convenience function for a pattern where an list of
+- ``counted_array(expr)`` - convenience function for a pattern where a list of
   instances of the given expression are preceded by an integer giving the count of
   elements in the list. Returns an expression that parses the leading integer,
   reads exactly that many expressions, and returns the array of expressions in the
   parse results - the leading integer is suppressed from the results (although it
   is easily reconstructed by using len on the returned array).

-- ``oneOf( string, caseless=False )`` - convenience function for quickly declaring an
-  alternative set of ``Literal`` tokens, by splitting the given string on
-  whitespace boundaries. The tokens are sorted so that longer
-  matches are attempted first; this ensures that a short token does
-  not mask a longer one that starts with the same characters. If ``caseless=True``,
-  will create an alternative set of CaselessLiteral tokens.
+- ``one_of(choices, caseless=False, as_keyword=False)`` - convenience function for quickly declaring an
+  alternative set of Literal_ expressions. ``choices`` can be passed as a list of strings
+  or as a single string of values separated by spaces. The values are sorted so that longer
+  matches are attempted first; this ensures that a short value does
+  not mask a longer one that starts with the same characters. If ``caseless=True``,
+  will create an alternative set of CaselessLiteral_ tokens. If ``as_keyword=True``,
+  ``one_of`` will declare Keyword_ expressions instead of Literal_ expressions.

-- ``dictOf( key, value )`` - convenience function for quickly declaring a
-  dictionary pattern of ``Dict( ZeroOrMore( Group( key + value ) ) )``.
+- ``dict_of(key, value)`` - convenience function for quickly declaring a
+  dictionary pattern of ``Dict(ZeroOrMore(Group(key + value)))``.

-- ``makeHTMLTags( tagName )`` and ``makeXMLTags( tagName )`` - convenience
+- ``make_html_tags(tag_str)`` and ``make_xml_tags(tag_str)`` - convenience
   functions to create definitions of opening and closing tag expressions. Returns
-  a pair of expressions, for the corresponding ``<tagName>`` and ``</tagName>`` strings. Includes
-  support for attributes in the opening tag, such as ``<tag attr="value">`` - attributes
-  are returned as keyed tokens in the returned ParseResults. ``makeHTMLTags`` is less
-  restrictive than ``makeXMLTags``, especially with respect to case sensitivity.
-
-- ``infixNotation(baseOperand, operatorList)`` - (formerly named ``operatorPrecedence``) convenience function to define a
-  grammar for parsing infix notation
-  expressions with a hierarchical precedence of operators. To use the ``infixNotation``
+  a pair of expressions, for the corresponding ``<tag>`` and ``</tag>`` strings. Includes
+  support for attributes in the opening tag, such as ``<tag attr="value">`` - attributes
+  are returned as named results in the returned ParseResults_. ``make_html_tags`` is less
+  restrictive than ``make_xml_tags``, especially with respect to case sensitivity.
+
+- ``infix_notation(base_operand, operator_list)`` -
+  convenience function to define a grammar for parsing infix notation
+  expressions with a hierarchical precedence of operators. To use the ``infix_notation``
   helper:

   1. Define the base "atom" operand term of the grammar.
      For this simple grammar, the smallest operand is either
-     and integer or a variable. This will be the first argument
-     to the ``infixNotation`` method.
+     an integer or a variable. This will be the first argument
+     to the ``infix_notation`` method.

   2. Define a list of tuples for each level of operator
-     precendence. Each tuple is of the form
-     ``(opExpr, numTerms, rightLeftAssoc, parseAction)``, where:
+     precedence. Each tuple is of the form
+     ``(op_expr, num_operands, right_left_assoc, parse_action)``, where:

-     - ``opExpr`` - the pyparsing expression for the operator;
-       may also be a string, which will be converted to a Literal; if
-       None, indicates an empty operator, such as the implied
+     - ``op_expr`` - the pyparsing expression for the operator;
+       may also be a string, which will be converted to a Literal_; if
+       ``None``, indicates an empty operator, such as the implied
        multiplication operation between 'm' and 'x' in "y = mx + b".
-     - ``numTerms`` - the number of terms for this operator (must
-       be 1, 2, or 3)
+     - ``num_operands`` - the number of terms for this operator (must
+       be 1, 2, or 3)

-     - ``rightLeftAssoc`` is the indicator whether the operator is
+     - ``right_left_assoc`` is the indicator whether the operator is
        right or left associative, using the pyparsing-defined
-       constants ``opAssoc.RIGHT`` and ``opAssoc.LEFT``.
+       constants ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.

-     - ``parseAction`` is the parse action to be associated with
+     - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the
-       ``parseAction`` tuple member may be omitted)
+       ``parse_action`` tuple member may be omitted)

-  3. Call ``infixNotation`` passing the operand expression and
+  3. Call ``infix_notation`` passing the operand expression and
      the operator precedence list, and save the returned value
      as the generated pyparsing expression. You can then use this
     expression to parse input strings, or incorporate it into a
     larger, more complex grammar.

+  ``infix_notation`` also supports optional arguments ``lpar`` and ``rpar``, to
+  parse groups with symbols other than "(" and ")". They may be passed as strings
+  (in which case they will be converted to ``Suppress`` objects, and suppressed from
+  the parsed results), or passed as pyparsing expressions, in which case they will
+  be kept as-is, and grouped with their contents.
+
+  For instance, to use "<" and ">" for grouping symbols, you could write::
+
+      expr = infix_notation(int_expr,
+          [
+              (one_of("+ -"), 2, OpAssoc.LEFT),
+          ],
+          lpar="<",
+          rpar=">"
+      )
+      expr.parse_string("3 - <2 + 11>")
+
+  returning::
+
+      [3, '-', [2, '+', 11]]
+
+  If the grouping symbols are to be retained, then pass them as pyparsing ``Literals``::
+
+      expr = infix_notation(int_expr,
+          [
+              (one_of("+ -"), 2, OpAssoc.LEFT),
+          ],
+          lpar=Literal("<"),
+          rpar=Literal(">")
+      )
+      expr.parse_string("3 - <2 + 11>")
+
+  returning::
+
+      [3, '-', ['<', [2, '+', 11], '>']]

+- ``match_previous_literal`` and ``match_previous_expr`` - function to define an
   expression that matches the same content
   as was parsed in a previous parse expression. For instance::

       first = Word(nums)
-      matchExpr = first + ":" + matchPreviousLiteral(first)
+      match_expr = first + ":" + match_previous_literal(first)

   will match "1:1", but not "1:2". Since this matches at the literal
   level, this will also match the leading "1:1" in "1:10".
@@ -804,12 +1160,12 @@ Helper methods

   In contrast::

       first = Word(nums)
-      matchExpr = first + ":" + matchPreviousExpr(first)
+      match_expr = first + ":" + match_previous_expr(first)

   will *not* match the leading "1:1" in "1:10"; the expressions are
   evaluated first, and then compared, so "1" is compared with "10".

-- ``nestedExpr(opener, closer, content=None, ignoreExpr=quotedString)`` - method for defining nested
+- ``nested_expr(opener, closer, content=None, ignore_expr=quoted_string)`` - method for defining nested
   lists enclosed in opening and closing delimiters.
- ``opener`` - opening character for a nested list (default="("); can also be a pyparsing expression @@ -818,76 +1174,70 @@ Helper methods - ``content`` - expression for items within the nested lists (default=None) - - ``ignoreExpr`` - expression for ignoring opening and closing delimiters (default=quotedString) + - ``ignore_expr`` - expression for ignoring opening and closing delimiters (default=``quoted_string``) If an expression is not provided for the content argument, the nested expression will capture all whitespace-delimited content between delimiters as a list of separate values. - Use the ignoreExpr argument to define expressions that may contain + Use the ``ignore_expr`` argument to define expressions that may contain opening or closing characters that should not be treated as opening - or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an Or or MatchFirst. - The default is quotedString, but if no expressions are to be ignored, - then pass None for this argument. + or closing characters for nesting, such as ``quoted_string`` or a comment + expression. Specify multiple expressions using an Or_ or MatchFirst_. + The default is ``quoted_string``, but if no expressions are to be ignored, + then pass ``None`` for this argument. -- ``indentedBlock( statementExpr, indentationStackVar, indent=True)`` - +- ``IndentedBlock(statement_expr, recursive=False, grouped=True)`` - function to define an indented block of statements, similar to indentation-based blocking in Python source code: - - ``statementExpr`` - the expression defining a statement that - will be found in the indented block; a valid ``indentedBlock`` - must contain at least 1 matching ``statementExpr`` + - ``statement_expr`` - the expression defining a statement that + will be found in the indented block; a valid ``IndentedBlock`` + must contain at least 1 matching ``statement_expr`` - - ``indentationStackVar`` - a Python list variable; this variable - should be common to all ``indentedBlock`` expressions defined - within the same grammar, and should be reinitialized to [1] - each time the grammar is to be used + - ``recursive`` - flag indicating whether the IndentedBlock can + itself contain nested sub-blocks of the same type of expression + (default=False) - - ``indent`` - a boolean flag indicating whether the expressions - within the block must be indented from the current parse - location; if using ``indentedBlock`` to define the left-most - statements (all starting in column 1), set ``indent`` to False + - ``grouped`` - flag indicating whether the tokens returned from + parsing the IndentedBlock should be grouped (default=True) .. _originalTextFor: -- ``originalTextFor( expr )`` - helper function to preserve the originally parsed text, regardless of any +- ``original_text_for(expr)`` - helper function to preserve the originally parsed text, regardless of any token processing or conversion done by the contained expression. For instance, the following expression:: - fullName = Word(alphas) + Word(alphas) + full_name = Word(alphas) + Word(alphas) will return the parse of "John Smith" as ['John', 'Smith']. In some applications, the actual name as it - was given in the input string is what is desired. To do this, use ``originalTextFor``:: + was given in the input string is what is desired. 
To do this, use ``original_text_for``::

-      fullName = originalTextFor(Word(alphas) + Word(alphas))
+      full_name = original_text_for(Word(alphas) + Word(alphas))

-- ``ungroup( expr )`` - function to "ungroup" returned tokens; useful
-  to undo the default behavior of And to always group the returned tokens, even
-  if there is only one in the list. (New in 1.5.6)
+- ``ungroup(expr)`` - function to "ungroup" returned tokens; useful
+  to undo the default behavior of And_ to always group the returned tokens, even
+  if there is only one in the list.

-- ``lineno( loc, string )`` - function to give the line number of the
+- ``lineno(loc, string)`` - function to give the line number of the
   location within the string; the first line is line 1, newlines start new rows

-- ``col( loc, string )`` - function to give the column number of the
+- ``col(loc, string)`` - function to give the column number of the
   location within the string; the first column is column 1, newlines reset the column number to 1

-- ``line( loc, string )`` - function to retrieve the line of text
-  representing ``lineno( loc, string )``; useful when printing out diagnostic
+- ``line(loc, string)`` - function to retrieve the line of text
+  representing ``lineno(loc, string)``; useful when printing out diagnostic
   messages for exceptions

-- ``srange( rangeSpec )`` - function to define a string of characters,
+- ``srange(range_spec)`` - function to define a string of characters,
   given a string of the form used by regexp string ranges, such as ``"[0-9]"`` for
   all numeric digits, ``"[A-Z_]"`` for uppercase characters plus underscore, and
-  so on (note that rangeSpec does not include support for generic regular
+  so on (note that ``range_spec`` does not include support for generic regular
   expressions, just string range specs)

-- ``getTokensEndLoc()`` - function to call from within a parse action to get
-  the ending location for the matched tokens
-
-- ``traceParseAction(fn)`` - decorator function to debug parse actions. Lists
+- ``trace_parse_action(fn)`` - decorator function to debug parse actions. Lists
   each call, called arguments, and return value or exception
@@ -895,44 +1245,53 @@ Helper methods

Helper parse actions
--------------------

-- ``removeQuotes`` - removes the first and last characters of a quoted string;
+- ``remove_quotes`` - removes the first and last characters of a quoted string;
   useful to remove the delimiting quotes from quoted strings

-- ``replaceWith(replString)`` - returns a parse action that simply returns the
-  replString; useful when using transformString, or converting HTML entities, as in::
-
-      nbsp = Literal("&nbsp;").setParseAction( replaceWith("<BLANK>") )
+- ``replace_with(repl_string)`` - returns a parse action that simply returns the
+  ``repl_string``; useful when using ``transform_string``, or converting HTML entities, as in::
+
+      nbsp = Literal("&nbsp;").set_parse_action(replace_with("<BLANK>"))

-- ``keepOriginalText``- (deprecated, use originalTextFor_ instead) restores any internal whitespace or suppressed
+- ``original_text_for`` - restores any internal whitespace or suppressed
   text within the tokens for a matched parse expression. This is especially useful when defining expressions
-  for scanString or transformString applications.
+  for ``scan_string`` or ``transform_string`` applications.

-- ``withAttribute( *args, **kwargs )`` - helper to create a validating parse action to be used with start tags created
-  with ``makeXMLTags`` or ``makeHTMLTags``. Use ``withAttribute`` to qualify a starting tag
+- ``with_attribute(*args, **kwargs)`` - helper to create a validating parse action to be used with start tags created
+  with ``make_xml_tags`` or ``make_html_tags``. Use ``with_attribute`` to qualify a starting tag
+  with a required attribute value, to avoid false matches on common tags such as ``<TD>`` or ``<DIV>``.
``. - ``withAttribute`` can be called with: + ``with_attribute`` can be called with: - - keyword arguments, as in ``(class="Customer",align="right")``, or + - keyword arguments, as in ``(class="Customer", align="right")``, or - - a list of name-value tuples, as in ``( ("ns1:class", "Customer"), ("ns2:align","right") )`` + - a list of name-value tuples, as in ``(("ns1:class", "Customer"), ("ns2:align", "right"))`` An attribute can be specified to have the special value - ``withAttribute.ANY_VALUE``, which will match any value - use this to + ``with_attribute.ANY_VALUE``, which will match any value - use this to ensure that an attribute is present but any attribute value is acceptable. -- ``downcaseTokens`` - converts all matched tokens to lowercase - -- ``upcaseTokens`` - converts all matched tokens to uppercase - -- ``matchOnlyAtCol( columnNumber )`` - a parse action that verifies that +- ``match_only_at_col(column_number)`` - a parse action that verifies that an expression was matched at a particular column, raising a - ParseException if matching at a different column number; useful when parsing + ``ParseException`` if matching at a different column number; useful when parsing tabular data +- ``common.convert_to_integer()`` - converts all matched tokens to int + +- ``common.convert_to_float()`` - converts all matched tokens to float + +- ``common.convert_to_date()`` - converts matched token to a datetime.date + +- ``common.convert_to_datetime()`` - converts matched token to a datetime.datetime + +- ``common.strip_html_tags()`` - removes HTML tags from matched token + +- ``common.downcase_tokens()`` - converts all matched tokens to lowercase + +- ``common.upcase_tokens()`` - converts all matched tokens to uppercase Common string and token constants @@ -948,27 +1307,223 @@ Common string and token constants ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ +.. 
+
+- ``identchars`` - a string containing characters that are valid as initial identifier characters::
+
+    ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyzª
+    µºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ
+
+- ``identbodychars`` - a string containing characters that are valid as identifier body characters (those following a
+  valid leading identifier character as given in identchars_)::
+
+    0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyzª
+    µ·ºÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ
+
- ``printables`` - same as ``string.printable``, minus the space (``' '``) character

- ``empty`` - a global ``Empty()``; will always match

-- ``sglQuotedString`` - a string of characters enclosed in 's; may
+- ``sgl_quoted_string`` - a string of characters enclosed in 's; may
   include whitespace, but not newlines

-- ``dblQuotedString`` - a string of characters enclosed in "s; may
+- ``dbl_quoted_string`` - a string of characters enclosed in "s; may
   include whitespace, but not newlines

-- ``quotedString`` - ``sglQuotedString | dblQuotedString``
+- ``quoted_string`` - ``sgl_quoted_string | dbl_quoted_string``
+
+- ``python_quoted_string`` - ``quoted_string | multiline quoted string``

-- ``cStyleComment`` - a comment block delimited by ``'/*'`` and ``'*/'`` sequences; can span
+- ``c_style_comment`` - a comment block delimited by ``'/*'`` and ``'*/'`` sequences; can span
   multiple lines, but does not support nesting of comments

-- ``htmlComment`` - a comment block delimited by ``'<!--'`` and ``'-->'`` sequences; can span
+- ``html_comment`` - a comment block delimited by ``'<!--'`` and ``'-->'`` sequences; can span
   multiple lines, but does not support nesting of comments

-- ``commaSeparatedList`` - similar to ``delimitedList``, except that the
+- ``comma_separated_list`` - similar to DelimitedList_, except that the
   list expressions can be any text value, or a quoted string; quoted strings can
   safely include commas without incorrectly breaking the string into two tokens

-- ``restOfLine`` - all remaining printable characters up to but not including the next
+- ``rest_of_line`` - all remaining printable characters up to but not including the next
   newline
+
+- ``common.integer`` - an integer with no leading sign; parsed token is converted to int
+
+- ``common.hex_integer`` - a hexadecimal integer; parsed token is converted to int
+
+- ``common.signed_integer`` - an integer with optional leading sign; parsed token is converted to int
+
+- ``common.fraction`` - signed_integer '/' signed_integer; parsed tokens are converted to float
+
+- ``common.mixed_integer`` - signed_integer '-' fraction; parsed tokens are converted to float
+
+- ``common.real`` - real number; parsed tokens are converted to float
+
+- ``common.sci_real`` - real number with optional scientific notation; parsed tokens are converted to float
+
+- ``common.number`` - any numeric expression; parsed tokens are returned as converted by the matched expression
+
+- ``common.fnumber`` - any numeric expression; parsed tokens are converted to float
+
+- ``common.identifier`` - a programming identifier (follows Python's syntax convention of leading alpha or "_",
+  followed by 0 or more alpha, num, or "_")
+
+- ``common.ipv4_address`` - IPv4 address
+
+- ``common.ipv6_address`` - IPv6 address
+
+- ``common.mac_address`` - MAC address (with ":", "-", or "."
delimiters) + +- ``common.iso8601_date`` - date in ``YYYY-MM-DD`` format + +- ``common.iso8601_datetime`` - datetime in ``YYYY-MM-DDThh:mm:ss.s(Z|+-00:00)`` format; trailing seconds, + milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '`` + +- ``common.url`` - matches URL strings and returns a ParseResults with named fields like those returned + by ``urllib.parse.urlparse()`` + + +Unicode character sets for international parsing +------------------------------------------------ +Pyparsing includes the ``unicode`` namespace that contains definitions for ``alphas``, ``nums``, ``alphanums``, +``identchars``, ``identbodychars``, and ``printables`` for character ranges besides 7- or 8-bit ASCII. You can +access them using code like the following:: + + import pyparsing as pp + ppu = pp.unicode + + greek_word = pp.Word(ppu.Greek.alphas) + greek_word[...].parse_string("Καλημέρα κόσμε") + +The following language ranges are defined. + +========================== ================= ================================================ +Unicode set Alternate names Description +-------------------------- ----------------- ------------------------------------------------ +Arabic العربية +Chinese 中文 +CJK Union of Chinese, Japanese, and Korean sets +Cyrillic кириллица +Devanagari देवनागरी +Greek Ελληνικά +Hangul Korean, 한국어 +Hebrew עִברִית +Japanese 日本語 Union of Kanji, Katakana, and Hiragana sets +Japanese.Hiragana ひらがな +Japanese.Kanji 漢字 +Japanese.Katakana カタカナ +Latin1 All Unicode characters up to code point 255 +LatinA +LatinB +Thai ไทย +BasicMultilingualPlane BMP All Unicode characters up to code point 65535 +========================== ================= ================================================ + +The base ``unicode`` class also includes definitions based on all Unicode code points up to ``sys.maxunicode``. This +set will include emojis, wingdings, and many other specialized and typographical variant characters. + + +Generating Railroad Diagrams +============================ +Grammars are conventionally represented in what are called "railroad diagrams", which allow you to visually follow +the sequence of tokens in a grammar along lines which are a bit like train tracks. You might want to generate a +railroad diagram for your grammar in order to better understand it yourself, or maybe to communicate it to others. + +Usage +----- +To generate a railroad diagram in pyparsing, you first have to install pyparsing with the ``diagrams`` extra. +To do this, just run ``pip install pyparsing[diagrams]``, and make sure you add ``pyparsing[diagrams]`` to any +``setup.py`` or ``requirements.txt`` that specifies pyparsing as a dependency. + +Create your parser as you normally would. Then call ``create_diagram()``, passing the name of an output HTML file.:: + + street_address = Word(nums).set_name("house_number") + Word(alphas)[1, ...].set_name("street_name") + street_address.set_name("street_address") + street_address.create_diagram("street_address_diagram.html") + +This will result in the railroad diagram being written to ``street_address_diagram.html``. 
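+
+For example, the optional arguments described next can be used to tune the generated
+diagram (an illustrative sketch, reusing the ``street_address`` parser above)::
+
+    street_address.create_diagram(
+        "street_address_diagram.html",
+        show_results_names=True,  # annotate elements that define results names
+        vertical=5,               # stack alternatives vertically at 5 or more
+    )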
+
+``create_diagram`` takes the following arguments:
+
+- ``output_html`` (str or file-like object) - output target for generated diagram HTML
+
+- ``vertical`` (int) - threshold for formatting multiple alternatives vertically instead of horizontally (default=3)
+
+- ``show_results_names`` - bool flag whether diagram should show annotations for defined results names
+
+- ``show_groups`` - bool flag whether groups should be highlighted with an unlabeled surrounding box
+
+- ``embed`` - bool flag whether generated HTML should omit ``<HTML>``, ``<HEAD>``, and ``<BODY>`` tags to embed
+  the resulting HTML in an enclosing HTML source (such as PyScript HTML)
+
+- ``head`` - str containing additional HTML to insert into the ``<HEAD>`` section of the generated code;
+  can be used to insert custom CSS styling
+
+- ``body`` - str containing additional HTML to insert at the beginning of the ``<BODY>`` section of the
+  generated code
+
+
+Example
+-------
+You can view an example railroad diagram generated from `a pyparsing grammar for
+SQL SELECT statements <_static/sql_railroad.html>`_ (generated from
+`examples/select_parser.py <../examples/select_parser.py>`_).
+
+Naming tip
+----------
+Parser elements that are separately named will be broken out as their own sub-diagrams. As a short-cut alternative
+to going through and adding ``.set_name()`` calls on all your sub-expressions, you can use ``autoname_elements()`` after
+defining your complete grammar. For example::
+
+    a = pp.Literal("a")
+    b = pp.Literal("b").set_name("bbb")
+    pp.autoname_elements()
+
+``a`` will get named "a", while ``b`` will keep its name "bbb".
+
+Customization
+-------------
+You can customize the resulting diagram in a few ways.
+To do so, run ``pyparsing.diagram.to_railroad`` to convert your grammar into a form understood by the
+`railroad-diagrams <https://github.com/tabatkins/railroad-diagrams>`_ module, and
+then ``pyparsing.diagram.railroad_to_html`` to convert that into an HTML document. For example::
+
+    from pyparsing.diagram import to_railroad, railroad_to_html
+
+    with open('output.html', 'w') as fp:
+        railroad = to_railroad(my_grammar)
+        fp.write(railroad_to_html(railroad))
+
+This will result in the railroad diagram being written to ``output.html``.
+
+You can then pass in additional keyword arguments to ``pyparsing.diagram.to_railroad``, which will be passed
+into the ``Diagram()`` constructor of the underlying library,
+`as explained here `_.
+
+In addition, you can edit global options in the underlying library, by editing constants::
+
+    from pyparsing.diagram import to_railroad, railroad_to_html
+    import railroad
+
+    railroad.DIAGRAM_CLASS = "my-custom-class"
+    my_railroad = to_railroad(my_grammar)
+
+These options `are documented here `_.
+
+Finally, you can edit the HTML produced by ``pyparsing.diagram.railroad_to_html`` by passing in certain keyword
+arguments that will be used in the HTML template. Currently, these are:
+
+- ``head``: A string containing HTML to use in the ``<head>`` tag. This might be a stylesheet or other metadata
+
+- ``body``: A string containing HTML to use in the ``<body>`` tag, above the actual diagram. This might consist of a
+  heading, description, or JavaScript.
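+
+For example (an illustrative sketch, combining the hypothetical ``my_grammar`` from the
+examples above with the CSS classes listed next), a custom stylesheet can be supplied
+through the ``head`` keyword::
+
+    from pyparsing.diagram import to_railroad, railroad_to_html
+
+    custom_css = "<style>.railroad-heading { font-family: monospace; }</style>"
+
+    with open("output.html", "w") as fp:
+        railroad = to_railroad(my_grammar)
+        fp.write(railroad_to_html(railroad, head=custom_css))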
+
+If you want to provide a custom stylesheet using the ``head`` keyword, you can make use of the following CSS classes:
+
+- ``railroad-group``: A group containing everything relating to a given element group (i.e. something with a heading)
+
+- ``railroad-heading``: The title for each group
+
+- ``railroad-svg``: A div containing only the diagram SVG for each group
+
+- ``railroad-description``: A div containing the group description (unused)
diff --git a/docs/_static/pyparsingClassDiagram.png b/docs/_static/pyparsingClassDiagram.png
deleted file mode 100644
index f59baaf3..00000000
Binary files a/docs/_static/pyparsingClassDiagram.png and /dev/null differ
diff --git a/docs/_static/pyparsingClassDiagram.jpg b/docs/_static/pyparsingClassDiagram_1.5.2.jpg
similarity index 100%
rename from docs/_static/pyparsingClassDiagram.jpg
rename to docs/_static/pyparsingClassDiagram_1.5.2.jpg
diff --git a/docs/_static/pyparsingClassDiagram_3.0.9.jpg b/docs/_static/pyparsingClassDiagram_3.0.9.jpg
new file mode 100644
index 00000000..d92feed4
Binary files /dev/null and b/docs/_static/pyparsingClassDiagram_3.0.9.jpg differ
diff --git a/docs/_static/sql_railroad.html b/docs/_static/sql_railroad.html
new file mode 100644
index 00000000..03933491
--- /dev/null
+++ b/docs/_static/sql_railroad.html
@@ -0,0 +1,503 @@
[... 503 lines of generated railroad-diagram HTML/SVG markup omitted: page styling and scripts, plus railroad-group sections for the SQL SELECT grammar - "select statement" (Forward), "column name", "table name", "'or' term", "'and' term", "'not' term", and a literal-value group "Unnamed 2" (real numbers, signed integers, quoted strings, or column names) ...]
+ + + \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index eaa817f6..5f5bd8a0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # @@ -14,15 +13,16 @@ # import os import sys -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) from pyparsing import __version__ as pyparsing_version # -- Project information ----------------------------------------------------- -project = 'PyParsing' -copyright = '2018, Paul T. McGuire' -author = 'Paul T. McGuire' +project = "PyParsing" +copyright = "2018-2022, Paul T. McGuire" +author = "Paul T. McGuire" # The short X.Y version version = pyparsing_version @@ -40,20 +40,20 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', + "sphinx.ext.autodoc", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -68,7 +68,7 @@ exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # -- Options for HTML output ------------------------------------------------- @@ -76,7 +76,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -87,7 +87,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -103,7 +103,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'PyParsingdoc' +htmlhelp_basename = "PyParsingdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -112,15 +112,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -130,8 +127,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'PyParsing.tex', 'PyParsing Documentation', - 'Paul T. McGuire', 'manual'), + ( + master_doc, + "PyParsing.tex", + "PyParsing Documentation", + "Paul T. McGuire", + "manual", + ), ] @@ -139,10 +141,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, 'pyparsing', 'PyParsing Documentation', - [author], 1) -] +man_pages = [(master_doc, "pyparsing", "PyParsing Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -151,9 +150,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'PyParsing', 'PyParsing Documentation', - author, 'PyParsing', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "PyParsing", + "PyParsing Documentation", + author, + "PyParsing", + "Python PEG parsing library.", + "Miscellaneous", + ), ] @@ -175,7 +180,7 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index f39b282b..65f05571 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,6 +11,7 @@ Release v\ |version| :maxdepth: 2 :caption: Contents: + whats_new_in_3_0_0 HowToUsePyparsing modules CODE_OF_CONDUCT diff --git a/docs/make_sphinx_docs.bat b/docs/make_sphinx_docs.bat new file mode 100644 index 00000000..341fd671 --- /dev/null +++ b/docs/make_sphinx_docs.bat @@ -0,0 +1 @@ +sphinx-build.exe -M html . _build diff --git a/docs/pyparsing.rst b/docs/pyparsing.rst index 6d9d44ca..6d51a78d 100644 --- a/docs/pyparsing.rst +++ b/docs/pyparsing.rst @@ -3,5 +3,5 @@ pyparsing module .. automodule:: pyparsing :members: - :undoc-members: + :special-members: :show-inheritance: diff --git a/docs/pyparsing_class_diagram.puml b/docs/pyparsing_class_diagram.puml new file mode 100644 index 00000000..f90f99e2 --- /dev/null +++ b/docs/pyparsing_class_diagram.puml @@ -0,0 +1,352 @@ +@startuml +'https://plantuml.com/class-diagram + +top to bottom direction +hide circle +hide empty members +'hide empty methods +skinparam groupInheritance 3 + +note as N1 +Class Diagram +--- +pyparsing 3.0.9 +May, 2022 +end note + +N1 <-[hidden]- unicode + +package core { + +class globals { +quoted_string +sgl_quoted_string +dbl_quoted_string +counted_array() +match_previous_literal() +match_previous_expr() +one_of() +dict_of() +original_text_for() +ungroup() +nested_expr() +make_html_tags() +make_xml_tags() +common_html_entity +replace_html_entity() +class OpAssoc +infix_notation() +class IndentedBlock +c_style_comment +html_comment +rest_of_line +dbl_slash_comment +cpp_style_comment +java_style_comment +python_style_comment +match_only_at_col() +replace_with() +remove_quotes() +with_attribute() +with_class() +trace_parse_action() +condition_as_parse_action() +srange() +token_map() +autoname_elements() +} + +class ParseResults { +class List +{static}from_dict() +__getitem__() +__setitem__() +__contains__() +__len__() +__bool__() +__iter__() +__reversed__() +__getattr__() +__add__() +__getstate__() +__setstate__() +__getnewargs__() +__dir__() +as_dict() +as_list() +dump() +get_name() +items() +keys() +values() +haskeys() +pop() +get() +insert() +append() +extend() +clear() +copy() +get_name() +pprint() +} + +class ParseBaseException #ffffff { +{static} explain_exception() +explain() +mark_input_line() +line +lineno +column +parser_element +} +class ParseException +class ParseFatalException +class ParseSyntaxException + +ParseBaseException <|-- ParseException +ParseBaseException <|-- ParseFatalException +ParseFatalException <|-- ParseSyntaxException + +class ParserElement { +name: 
str +results_name: str +--- +{classifier} enable_packrat() +{classifier} enable_left_recursion() +{classifier} disable_memoization() +{classifier} set_default_whitespace_chars() +{classifier} inline_literals_using() +{classifier} reset_cache() + +{static} verbose_stacktrace +suppress_warning() + +operator + () -> And +operator - () -> And.ErrorStop +operator | () -> MatchFirst +operator ^ () -> Or +operator & () -> Each +operator ~ () -> NotAny +operator [] () -> _MultipleMatch +operator () () [set_results_name()] + +add_condition() +add_parse_action() +set_parse_action() +copy() +ignore(expr) +leave_whitespace() +parse_with_tabs() +suppress() +set_break() +set_debug() +set_debug_actions() +set_name() +set_results_name() +parse_string() +scan_string() +search_string() +transform_string() +split() +run_tests() +recurse() +create_diagram() +} +class Token #ffffff +class ParseExpression #ffffff { +exprs: list[ParserElement] +} +class ParseElementEnhance #ffffff { +expr: ParserElement +} +class _PositionToken #ffffff +class Char +class White +class Word { +'Word(init_chars: str, body_chars: str, min: int, \nmax: int, exact: int, as_keyword: bool, exclude_chars: str) +} +class Keyword { +{static} set_default_keyword_chars(chars: str) +} +class CaselessKeyword +class Empty +class Literal +class Regex +class NoMatch +class CharsNotIn +class QuotedString + +class And +class Or +class MatchFirst +class Each + +class OneOrMore +class ZeroOrMore +class DelimitedList +class SkipTo +class Group +class Forward { +operator <<= () +} + +class LineStart +class LineEnd +class StringStart +class StringEnd +class WordStart +class WordEnd +class _MultipleMatch #ffffff +class FollowedBy +class PrecededBy +class AtLineStart +class AtStringStart + +class TokenConverter #ffffff +class Located +class Opt + +class Combine +class Group +class Dict +class Suppress + +ParserElement <|-- Token +ParserElement <|----- ParseExpression +Token <|-- _PositionToken +ParserElement <|----- ParseElementEnhance + +'ParseElementEnhance ---> ParserElement +'ParseExpression ---> "*" ParserElement + + +Token <|-- Empty +Token <|-- CloseMatch +Token <|-- NoMatch +Token <|-- Literal +Token <|-- Word +Token <|---- Keyword +Token <|--- Regex +Token <|--- CharsNotIn +Token <|-- White +Token <|---- QuotedString +Word <|-- Char +Literal <|-- CaselessLiteral +Keyword <|-- CaselessKeyword + +ParseExpression <|-- And +ParseExpression <|-- Or +ParseExpression <|-- MatchFirst +ParseExpression <|-- Each + +ParseElementEnhance <|-- SkipTo +ParseElementEnhance <|--- Forward +ParseElementEnhance <|-- Located +ParseElementEnhance <|--- _MultipleMatch +_MultipleMatch <|-- OneOrMore +_MultipleMatch <|-- ZeroOrMore +ParseElementEnhance <|-- DelimitedList +ParseElementEnhance <|--- NotAny +ParseElementEnhance <|--- FollowedBy +ParseElementEnhance <|--- PrecededBy +ParseElementEnhance <|-- Opt +ParseElementEnhance <|--- TokenConverter +ParseElementEnhance <|-- AtStringStart +ParseElementEnhance <|-- AtLineStart +TokenConverter <|-- Group +TokenConverter <|-- Dict +TokenConverter <|-- Suppress +TokenConverter <|-- Combine + +_PositionToken <|-- LineStart +_PositionToken <|-- LineEnd +_PositionToken <|-- WordStart +_PositionToken <|-- WordEnd +_PositionToken <|-- StringStart +_PositionToken <|-- StringEnd + +} + +package common { +class " " { +comma_separated_list +convert_to_integer() +convert_to_float() +integer +hex_integer +signed_integer +fraction +mixed_integer +real +sci_real +number +fnumber +identifier +ipv4_address +ipv6_address +mac_address 
+convert_to_date() +convert_to_datetime() +iso8601_date +iso8601_datetime +uuid +strip_html_tags() +upcase_tokens() +downcase_tokens() +url +} + +} +package unicode { +class unicode_set { +printables: str +alphas: str +nums: str +alphanums: str +identchars: str +identbodychars: str +} +class Latin1 +class LatinA +class LatinB +class BasicMultilingualPlane +class Chinese +class Thai +class Japanese { +class Kanji +class Hiragana +class Katakana +} +class Greek +class Hangul +class Arabic +class Devanagari +class Hebrew +class Cyrillic + +unicode_set <|-- Latin1 +unicode_set <|--- LatinA +unicode_set <|-- LatinB +unicode_set <|---- BasicMultilingualPlane +unicode_set <|-- Greek +unicode_set <|--- Cyrillic +unicode_set <|--- Chinese +unicode_set <|--- Japanese +unicode_set <|--- Hangul +Chinese <|-- CJK +Japanese <|-- CJK +Hangul <|-- CJK +unicode_set <|-- Thai +unicode_set <|-- Arabic +unicode_set <|-- Hebrew +unicode_set <|--- Devanagari + +} + +ParserElement <-[hidden] ParseBaseException +'ParseBaseException <-[hidden] globals +'globals <-[hidden] ParserElement +CJK <-[hidden]-- common + +@enduml \ No newline at end of file diff --git a/docs/whats_new_in_3_0_0.rst b/docs/whats_new_in_3_0_0.rst new file mode 100644 index 00000000..2f4fe3de --- /dev/null +++ b/docs/whats_new_in_3_0_0.rst @@ -0,0 +1,816 @@ +============================= +What's New in Pyparsing 3.0.0 +============================= + +:author: Paul McGuire + +:date: May, 2022 + +:abstract: This document summarizes the changes made + in the 3.0.0 release of pyparsing. + (Updated to reflect changes up to 3.0.10) + +.. sectnum:: :depth: 4 + +.. contents:: :depth: 4 + + +New Features +============ + +PEP-8 naming +------------ +This release of pyparsing will (finally!) include PEP-8 compatible names and arguments. +Backward-compatibility is maintained by defining synonyms using the old camelCase names +pointing to the new snake_case names. + +This code written using non-PEP8 names:: + + wd = pp.Word(pp.printables, excludeChars="$") + wd_list = pp.delimitedList(wd, delim="$") + print(wd_list.parseString("dkls$134lkjk$lsd$$").asList()) + +can now be written as:: + + wd = pp.Word(pp.printables, exclude_chars="$") + wd_list = pp.delimited_list(wd, delim="$") + print(wd_list.parse_string("dkls$134lkjk$lsd$$").as_list()) + +Pyparsing 3.0 will run both versions of this example. + +New code should be written using the PEP-8 compatible names. The compatibility +synonyms will be removed in a future version of pyparsing. + + +Railroad diagramming +-------------------- +An excellent new enhancement is the new railroad diagram +generator for documenting pyparsing parsers.:: + + import pyparsing as pp + + # define a simple grammar for parsing street addresses such + # as "123 Main Street" + # number word... + number = pp.Word(pp.nums).set_name("number") + name = pp.Word(pp.alphas).set_name("word")[1, ...] 
+
+    parser = number("house_number") + name("street")
+    parser.set_name("street address")
+
+    # construct railroad track diagram for this parser and
+    # save as HTML
+    parser.create_diagram('parser_rr_diag.html')
+
+``create_diagram`` accepts these named arguments:
+
+- ``vertical`` (int) - threshold for formatting multiple alternatives vertically
+  instead of horizontally (default=3)
+- ``show_results_names`` - bool flag whether diagram should show annotations for
+  defined results names
+- ``show_groups`` - bool flag whether groups should be highlighted with an unlabeled surrounding box
+- ``embed`` - bool flag whether generated HTML should omit ``<HTML>``, ``<HEAD>``, and ``<BODY>`` tags to embed
+  the resulting HTML in an enclosing HTML source (new in 3.0.10)
+- ``head`` - str containing additional HTML to insert into the ``<HEAD>`` section of the
+  generated code; can be used to insert custom CSS styling
+- ``body`` - str containing additional HTML to insert at the beginning of the ``<BODY>`` section of the
+  generated code
+
+To use this new feature, install the supporting diagramming packages using::
+
+    pip install pyparsing[diagrams]
+
+See more in the examples directory: ``make_diagram.py`` and ``railroad_diagram_demo.py``.
+
+(Railroad diagram enhancement contributed by Michael Milton)
+
+Support for left-recursive parsers
+----------------------------------
+Another significant enhancement in 3.0 is support for left-recursive (LR)
+parsers. Previously, given a left-recursive parser, pyparsing would
+recurse repeatedly until hitting the Python recursion limit. Following
+the methods of the Python PEG parser, pyparsing uses a variation of
+packrat parsing to detect and handle left-recursion during parsing::
+
+    import pyparsing as pp
+    pp.ParserElement.enable_left_recursion()
+
+    # a common left-recursion definition
+    # define a list of items as 'list + item | item'
+    # BNF:
+    #   item_list := item_list item | item
+    #   item := word of alphas
+    item_list = pp.Forward()
+    item = pp.Word(pp.alphas)
+    item_list <<= item_list + item | item
+
+    item_list.run_tests("""\
+        To parse or not to parse that is the question
+        """)
+
+Prints::
+
+    ['To', 'parse', 'or', 'not', 'to', 'parse', 'that', 'is', 'the', 'question']
+
+See more examples in ``left_recursion.py`` in the pyparsing examples directory.
+
+(LR parsing support contributed by Max Fischer)
+
+Packrat/memoization enable and disable methods
+----------------------------------------------
+As part of the implementation of left-recursion support, new methods have been added
+to enable and disable packrat parsing.
+
+====================== =======================================================
+Name                   Description
+---------------------- -------------------------------------------------------
+enable_packrat         Enable packrat parsing (with specified cache size)
+enable_left_recursion  Enable left-recursion cache
+disable_memoization    Disable all internal parsing caches
+====================== =======================================================
+
+Type annotations on all public methods
+--------------------------------------
+Python 3.6 and upward compatible type annotations have been added to most of the
+public methods in pyparsing. This should facilitate developing pyparsing-based
+applications using IDEs for development-time type checking.
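+
+For example (illustrative only), annotations like the following can now be checked at
+development time with a tool such as mypy::
+
+    import pyparsing as pp
+
+    word: pp.Word = pp.Word(pp.alphas)
+    result: pp.ParseResults = word.parse_string("abc")
+    values: list = result.as_list()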
+ +New string constants ``identchars`` and ``identbodychars`` to help in defining identifier Word expressions +---------------------------------------------------------------------------------------------------------- +Two new module-level strings have been added to help when defining identifiers, +``identchars`` and ``identbodychars``. + +Instead of writing:: + + import pyparsing as pp + identifier = pp.Word(pp.alphas + "_", pp.alphanums + "_") + +you will be able to write:: + + identifier = pp.Word(pp.identchars, pp.identbodychars) + +Those constants have also been added to all the Unicode string classes:: + + import pyparsing as pp + ppu = pp.pyparsing_unicode + + cjk_identifier = pp.Word(ppu.CJK.identchars, ppu.CJK.identbodychars) + greek_identifier = pp.Word(ppu.Greek.identchars, ppu.Greek.identbodychars) + + +Refactored/added diagnostic flags +--------------------------------- +Expanded ``__diag__`` and ``__compat__`` to actual classes instead of +just namespaces, to add some helpful behavior: + +- ``pyparsing.enable_diag()`` and ``pyparsing.disable_diag()`` methods to give extra + help when setting or clearing flags (detects invalid + flag names, detects when trying to set a ``__compat__`` flag + that is no longer settable). Use these methods now to + set or clear flags, instead of directly setting to ``True`` or + ``False``:: + + import pyparsing as pp + pp.enable_diag(pp.Diagnostics.warn_multiple_tokens_in_named_alternation) + +- ``pyparsing.enable_all_warnings()`` is another helper that sets + all "warn*" diagnostics to ``True``:: + + pp.enable_all_warnings() + +- added support for calling ``enable_all_warnings()`` if warnings are enabled + using the Python ``-W`` switch, or setting a non-empty value to the environment + variable ``PYPARSINGENABLEALLWARNINGS``. (If using ``-Wd`` for testing, but + wishing to disable pyparsing warnings, add ``-Wi:::pyparsing``.) + +- added new warning, ``warn_on_match_first_with_lshift_operator`` to + warn when using ``'<<'`` with a ``'|'`` ``MatchFirst`` operator, + which will + create an unintended expression due to precedence of operations. + + Example: This statement will erroneously define the ``fwd`` expression + as just ``expr_a``, even though ``expr_a | expr_b`` was intended, + since ``'<<'`` operator has precedence over ``'|'``:: + + fwd << expr_a | expr_b + + To correct this, use the ``'<<='`` operator (preferred) or parentheses + to override operator precedence:: + + fwd <<= expr_a | expr_b + + or:: + + fwd << (expr_a | expr_b) + +- ``warn_on_parse_using_empty_Forward`` - warns that a ``Forward`` + has been included in a grammar, but no expression was + attached to it using ``'<<='`` or ``'<<'`` + +- ``warn_on_assignment_to_Forward`` - warns that a ``Forward`` has + been created, but was probably later overwritten by + erroneously using ``'='`` instead of ``'<<='`` (this is a common + mistake when using Forwards) + (**currently not working on PyPy**) + +Support for yielding native Python ``list`` and ``dict`` types in place of ``ParseResults`` +------------------------------------------------------------------------------------------- +To support parsers that are intended to generate native Python collection +types such as lists and dicts, the ``Group`` and ``Dict`` classes now accept an +additional boolean keyword argument ``aslist`` and ``asdict`` respectively. 
+See the ``jsonParser.py`` example in the ``pyparsing/examples`` source directory for
+how to return types as ``ParseResults`` and as Python collection types, and the
+distinctions in working with the different types.
+
+In addition, parse actions that must return a value of list type (which would
+normally be converted internally to a ``ParseResults``) can override this default
+behavior by returning their list wrapped in the new ``ParseResults.List`` class::
+
+    # this parse action tries to return a list, but pyparsing
+    # will convert to a ParseResults
+    def return_as_list_but_still_get_parse_results(tokens):
+        return tokens.asList()
+
+    # this parse action returns the tokens as a list, and pyparsing will
+    # maintain its list type in the final parsing results
+    def return_as_list(tokens):
+        return ParseResults.List(tokens.asList())
+
+This is the mechanism used internally by the ``Group`` class when defined
+using ``aslist=True``.
+
+New ``Located`` class to replace ``locatedExpr`` helper method
+--------------------------------------------------------------
+The new ``Located`` class will replace the current ``locatedExpr`` method for
+marking parsed results with the start and end locations of the parsed data in
+the input string. ``locatedExpr`` had several bugs, and returned its results
+in a hard-to-use format (location data and results names were mixed in with
+the located expression's parsed results, and wrapped in an unnecessary extra
+nesting level).
+
+For this code::
+
+    wd = Word(alphas)
+    for match in locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222"):
+        print(match)
+
+the docs for ``locatedExpr`` show this output::
+
+    [[0, 'ljsdf', 5]]
+    [[8, 'lksdjjf', 15]]
+    [[18, 'lkkjj', 23]]
+
+The parsed values and the start and end locations are merged into a single
+nested ``ParseResults`` (and any results names in the parsed values are also
+merged in with the start and end location names).
+
+Using ``Located``, the output is::
+
+    [0, ['ljsdf'], 5]
+    [8, ['lksdjjf'], 15]
+    [18, ['lkkjj'], 23]
+
+With ``Located``, the parsed expression values and results names are kept
+separate in the second parsed value, and there is no extra grouping level
+on the whole result.
+
+The existing ``locatedExpr`` is retained for backward-compatibility, but will be
+deprecated in a future release.
+
+New ``AtLineStart`` and ``AtStringStart`` classes
+-------------------------------------------------
+As part of fixing some matching behavior in ``LineStart`` and ``StringStart``, two new
+classes have been added: ``AtLineStart`` and ``AtStringStart``.
+
+``LineStart`` and ``StringStart`` can be treated as separate elements, including whitespace skipping.
+``AtLineStart`` and ``AtStringStart`` enforce that an expression starts exactly at column 1, with no
+leading whitespace::
+
+    (LineStart() + Word(alphas)).parseString("ABC")   # passes
+    (LineStart() + Word(alphas)).parseString(" ABC")  # passes
+    AtLineStart(Word(alphas)).parseString(" ABC")     # fails
+
+[This is a fix to behavior that was added in 3.0.0, but was actually a regression from 2.4.x.]
+
+New ``IndentedBlock`` class to replace ``indentedBlock`` helper method
+----------------------------------------------------------------------
+The new ``IndentedBlock`` class will replace the current ``indentedBlock`` method
+for defining indented blocks of text, similar to Python source code. Using
+``IndentedBlock``, the expression instance itself keeps track of the indent stack,
+so a separate external ``indentStack`` variable is no longer required.
+ +Here is a simple example of an expression containing an alphabetic key, followed +by an indented list of integers:: + + integer = pp.Word(pp.nums) + group = pp.Group(pp.Char(pp.alphas) + pp.IndentedBlock(integer)) + +parses:: + + A + 100 + 101 + B + 200 + 201 + +as:: + + [['A', [100, 101]], ['B', [200, 201]]] + +By default, the results returned from the ``IndentedBlock`` are grouped. + +``IndentedBlock`` may also be used to define a recursive indented block (containing nested +indented blocks). + +The existing ``indentedBlock`` is retained for backward-compatibility, but will be +deprecated in a future release. + +Shortened tracebacks +-------------------- +Cleaned up default tracebacks when getting a ``ParseException`` when calling +``parse_string``. Exception traces should now stop at the call in ``parse_string``, +and not include the internal pyparsing traceback frames. (If the full traceback +is desired, then set ``ParserElement.verbose_traceback`` to ``True``.) + +Improved debug logging +---------------------- +Debug logging has been improved by: + +- Including ``try/match/fail`` logging when getting results from the + packrat cache (previously cache hits did not show debug logging). + Values returned from the packrat cache are marked with an '*'. + +- Improved fail logging, showing the failed expression, text line, and marker where + the failure occurred. + +- Adding ``with_line_numbers`` to ``pyparsing_testing``. Use ``with_line_numbers`` + to visualize the data being parsed, with line and column numbers corresponding + to the values output when enabling ``set_debug()`` on an expression:: + + data = """\ + A + 100""" + expr = pp.Word(pp.alphanums).set_name("word").set_debug() + print(ppt.with_line_numbers(data)) + expr[...].parseString(data) + + prints:: + + . 1 + 1234567890 + 1: A + 2: 100 + Match word at loc 3(1,4) + A + ^ + Matched word -> ['A'] + Match word at loc 11(2,7) + 100 + ^ + Matched word -> ['100'] + +New / improved examples +----------------------- +- ``number_words.py`` includes a parser/evaluator to parse ``"forty-two"`` + and return ``42``. Also includes example code to generate a railroad + diagram for this parser. + +- ``BigQueryViewParser.py`` added to examples directory, submitted + by Michael Smedberg. + +- ``booleansearchparser.py`` added to examples directory, submitted + by xecgr. Builds on searchparser.py, adding support for '*' + wildcards and non-Western alphabets. + +- Improvements in ``select_parser.py``, to include new SQL syntax + from SQLite, submitted by Robert Coup. + +- Off-by-one bug found in the ``roman_numerals.py`` example, a bug + that has been there for about 14 years! Submitted by + Jay Pedersen. + +- A simplified Lua parser has been added to the examples + (``lua_parser.py``). + +- Demonstration of defining a custom Unicode set for cuneiform + symbols, as well as simple Cuneiform->Python conversion is included + in ``cuneiform_python.py``. + +- Fixed bug in ``delta_time.py`` example, when using a quantity + of seconds/minutes/hours/days > 999. + +Other new features +------------------ +- ``url`` expression added to ``pyparsing_common``, with named fields for + common fields in URLs. See the updated ``urlExtractorNew.py`` file in the + ``examples`` directory. Submitted by Wolfgang Fahl. + +- ``DelimitedList`` now supports an additional flag ``allow_trailing_delim``, + to optionally parse an additional delimiter at the end of the list. + Submitted by Kazantcev Andrey. 
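+
+  A minimal sketch, assuming the ``allow_trailing_delim`` keyword as named
+  above::
+
+      import pyparsing as pp
+
+      # allow_trailing_delim=True permits an optional delimiter at the end
+      items = pp.DelimitedList(pp.Word(pp.alphas), allow_trailing_delim=True)
+      print(items.parse_string("a, b, c,"))  # -> ['a', 'b', 'c']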
+
+- Added global method ``autoname_elements()`` to call ``set_name()`` on all locally
+  defined ``ParserElements`` that haven't been explicitly named using ``set_name()``, using
+  their local variable name. Useful for setting names on multiple elements when
+  creating a railroad diagram::
+
+      a = pp.Literal("a")
+      b = pp.Literal("b").set_name("bbb")
+      pp.autoname_elements()
+
+  ``a`` will get named "a", while ``b`` will keep its name "bbb".
+
+- Enhanced default strings created for ``Word`` expressions, now showing
+  string ranges if possible. ``Word(alphas)`` would formerly
+  print as ``W:(ABCD...)``, now prints as ``W:(A-Za-z)``.
+
+- Better exception messages to show the full word where an exception occurred::
+
+      Word(alphas)[...].parse_string("abc 123", parse_all=True)
+
+  Was::
+
+      pyparsing.ParseException: Expected end of text, found '1' (at char 4), (line:1, col:5)
+
+  Now::
+
+      pyparsing.exceptions.ParseException: Expected end of text, found '123' (at char 4), (line:1, col:5)
+
+- Using ``...`` for ``SkipTo`` can now be wrapped in ``Suppress`` to suppress
+  the skipped text from the returned parse results::
+
+      source = "lead in START relevant text END trailing text"
+      start_marker = Keyword("START")
+      end_marker = Keyword("END")
+      find_body = Suppress(...) + start_marker + ... + end_marker
+      print(find_body.parse_string(source).dump())
+
+  Prints::
+
+      ['START', 'relevant text ', 'END']
+      - _skipped: ['relevant text ']
+
+- Added ``ignore_whitespace(recurse: bool = True)`` and added a
+  ``recurse`` argument to ``leave_whitespace``, both to provide finer
+  control over pyparsing's whitespace skipping. Contributed by
+  Michael Milton.
+
+- Added ``ParserElement.recurse()`` method to make it simpler for
+  grammar utilities to navigate through the tree of expressions in
+  a pyparsing grammar.
+
+- The ``repr()`` string for ``ParseResults`` is now of the form::
+
+      ParseResults([tokens], {named_results})
+
+  The previous form omitted the leading ``ParseResults`` class name,
+  and was easily misinterpreted as a ``tuple`` containing a ``list`` and
+  a ``dict``.
+
+- Minor reformatting of output from ``run_tests`` to make embedded
+  comments more visible.
+
+- New ``pyparsing_test`` namespace, assert methods and classes added to support writing
+  unit tests.
+
+  - ``assertParseResultsEquals``
+  - ``assertParseAndCheckList``
+  - ``assertParseAndCheckDict``
+  - ``assertRunTestResults``
+  - ``assertRaisesParseException``
+  - ``reset_pyparsing_context`` context manager, to restore pyparsing
+    config settings
+
+- Enhanced error messages and error locations when parsing fails on
+  the ``Keyword`` or ``CaselessKeyword`` classes due to the presence of a
+  preceding or trailing keyword character.
+
+- Enhanced the ``Regex`` class to be compatible with regular expressions compiled
+  with the re-equivalent ``regex`` module. Individual expressions can be built
+  from pre-compiled regular expressions using::
+
+      import pyparsing as pp
+      import regex
+
+      # would use regex for this expression
+      integer_parser = pp.Regex(regex.compile(r'\d+'))
+
+- Fixed handling of ``ParseSyntaxExceptions`` raised as part of ``Each``
+  expressions, when sub-expressions contain ``'-'`` backtrack
+  suppression.
+
+- Potential performance enhancement when parsing ``Word``
+  expressions built from ``pyparsing_unicode`` character sets. ``Word`` now
+  internally converts ranges of consecutive characters to regex
+  character ranges (converting ``"0123456789"`` to ``"0-9"`` for instance).
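+
+  An illustrative sketch (the internal regex is described only in the
+  comment)::
+
+      import pyparsing as pp
+      ppu = pp.pyparsing_unicode
+
+      # built from a large contiguous Unicode character set; matched
+      # internally with a compact character-range regex rather than a
+      # long list of individual characters
+      greek_word = pp.Word(ppu.Greek.alphas)
+      print(greek_word.parse_string("αβγ δεζ"))  # -> ['αβγ']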
+
+- Added a ``caseless`` parameter to the ``CloseMatch`` class to allow for casing to be
+  ignored when checking for close matches. Contributed by Adrian Edwards.
+
+
+API Changes
+===========
+
+- [Note added in pyparsing 3.0.7, reflecting a change in 3.0.0]
+  Fixed a bug in the ``ParseResults`` class implementation of ``__bool__``, which
+  would formerly return ``False`` if the ``ParseResults`` item list was empty, even if it
+  contained named results. Now ``ParseResults`` will return ``True`` if either the item
+  list is not empty *or* if the named results list is not empty::
+
+      # generate an empty ParseResults by parsing a blank string with a ZeroOrMore
+      result = Word(alphas)[...].parse_string("")
+      print(result.as_list())
+      print(result.as_dict())
+      print(bool(result))
+
+      # add a results name to the result
+      result["name"] = "empty result"
+      print(result.as_list())
+      print(result.as_dict())
+      print(bool(result))
+
+  Prints::
+
+      []
+      {}
+      False
+
+      []
+      {'name': 'empty result'}
+      True
+
+  In previous versions, the second call to ``bool()`` would return ``False``.
+
+- [Note added in pyparsing 3.0.4, reflecting a change in 3.0.0]
+  The ``ParseResults`` class now uses ``__slots__`` to pre-define instance attributes. This
+  means that code written like this (which was allowed in pyparsing 2.4.7)::
+
+      result = Word(alphas).parseString("abc")
+      result.xyz = 100
+
+  now raises this Python exception::
+
+      AttributeError: 'ParseResults' object has no attribute 'xyz'
+
+  To add new attribute values to a ``ParseResults`` object in 3.0.0 and later, you must
+  assign them using indexed notation::
+
+      result["xyz"] = 100
+
+  You will still be able to access this new value as an attribute or as an
+  indexed item.
+
+- ``enable_diag()`` and ``disable_diag()`` methods to
+  enable specific diagnostic values (instead of setting them
+  to ``True`` or ``False``). ``enable_all_warnings()`` has
+  also been added.
+
+- ``counted_array`` formerly returned its list of items nested
+  within another list, so that accessing the items required
+  indexing the 0'th element to get the actual list. This
+  extra nesting has been removed. In addition, if there are
+  other metadata fields parsed between the count and the
+  list items, they can be preserved in the resulting list
+  if given results names.
+
+- ``ParseException.explain()`` is now an instance method of
+  ``ParseException``::
+
+      expr = pp.Word(pp.nums) * 3
+      try:
+          expr.parse_string("123 456 A789")
+      except pp.ParseException as pe:
+          print(pe.explain(depth=0))
+
+  prints::
+
+      123 456 A789
+              ^
+      ParseException: Expected W:(0-9), found 'A789' (at char 8), (line:1, col:9)
+
+  To run explain against other exceptions, use
+  ``ParseException.explain_exception()``.
+
+- Debug actions now take an added keyword argument ``cache_hit``.
+  Since debug actions are now also called for expressions matched in the
+  packrat parsing cache, they receive this extra flag, set to ``True``
+  for cache hits. Custom debug actions need to add support for this
+  new argument.
+
+- ``ZeroOrMore`` expressions that have results names will now
+  include empty lists for their name if no matches are found.
+  Previously, no named result would be present. Code that tested
+  for the presence of any expressions using ``"if name in results:"``
+  will now always return ``True``. This code will need to change to
+  ``"if name in results and results[name]:"`` or just
+  ``"if results[name]:"``.
+  Also, any parser unit tests that check the
+  ``as_dict()`` contents will now see additional entries for parsers
+  having named ``ZeroOrMore`` expressions, whose values will be ``[]``.
+
+- ``ParserElement.set_default_whitespace_chars`` will now update
+  whitespace characters on all built-in expressions defined
+  in the pyparsing module.
+
+- ``camelCase`` names have been converted to PEP-8 ``snake_case`` names.
+
+  Method names and arguments that were camel case (such as ``parseString``)
+  have been replaced with PEP-8 snake case versions (``parse_string``).
+
+  Backward-compatibility synonyms for all names and arguments have
+  been included, to allow parsers written using the old names to run
+  without change. The synonyms will be removed in a future release.
+  New parser code should be written using the new PEP-8 snake case names.
+
+============================== ================================
+Name                           Previous name
+------------------------------ --------------------------------
+ParserElement
+- parse_string                 parseString
+- scan_string                  scanString
+- search_string                searchString
+- transform_string             transformString
+- add_condition                addCondition
+- add_parse_action             addParseAction
+- can_parse_next               canParseNext
+- default_name                 defaultName
+- enable_left_recursion        enableLeftRecursion
+- enable_packrat               enablePackrat
+- ignore_whitespace            ignoreWhitespace
+- inline_literals_using        inlineLiteralsUsing
+- parse_file                   parseFile
+- leave_whitespace             leaveWhitespace
+- parse_with_tabs              parseWithTabs
+- reset_cache                  resetCache
+- run_tests                    runTests
+- set_break                    setBreak
+- set_debug                    setDebug
+- set_debug_actions            setDebugActions
+- set_default_whitespace_chars setDefaultWhitespaceChars
+- set_fail_action              setFailAction
+- set_name                     setName
+- set_parse_action             setParseAction
+- set_results_name             setResultsName
+- set_whitespace_chars         setWhitespaceChars
+- try_parse                    tryParse
+
+ParseResults
+- as_list                      asList
+- as_dict                      asDict
+- get_name                     getName
+
+ParseBaseException
+- parser_element               parserElement
+
+any_open_tag                   anyOpenTag
+any_close_tag                  anyCloseTag
+c_style_comment                cStyleComment
+common_html_entity             commonHTMLEntity
+condition_as_parse_action      conditionAsParseAction
+counted_array                  countedArray
+cpp_style_comment              cppStyleComment
+dbl_quoted_string              dblQuotedString
+dbl_slash_comment              dblSlashComment
+DelimitedList                  delimitedList
+DelimitedList                  delimited_list
+dict_of                        dictOf
+html_comment                   htmlComment
+infix_notation                 infixNotation
+java_style_comment             javaStyleComment
+line_end                       lineEnd
+line_start                     lineStart
+make_html_tags                 makeHTMLTags
+make_xml_tags                  makeXMLTags
+match_only_at_col              matchOnlyAtCol
+match_previous_expr            matchPreviousExpr
+match_previous_literal         matchPreviousLiteral
+nested_expr                    nestedExpr
+null_debug_action              nullDebugAction
+one_of                         oneOf
+OpAssoc                        opAssoc
+original_text_for              originalTextFor
+python_style_comment           pythonStyleComment
+quoted_string                  quotedString
+remove_quotes                  removeQuotes
+replace_html_entity            replaceHTMLEntity
+replace_with                   replaceWith
+rest_of_line                   restOfLine
+sgl_quoted_string              sglQuotedString
+string_end                     stringEnd
+string_start                   stringStart
+token_map                      tokenMap
+trace_parse_action             traceParseAction
+unicode_string                 unicodeString
+with_attribute                 withAttribute
+with_class                     withClass
+============================== ================================
+
+Discontinued Features
+=====================
+
+Python 2.x no longer supported
+------------------------------
+Removed Py2.x support
+and other deprecated features. Pyparsing
+now requires Python 3.6.8 or later. If you are using an earlier
+version of Python, you must use a Pyparsing 2.4.x version.
+
+Other discontinued features
+---------------------------
+- ``ParseResults.asXML()`` - if used for debugging, switch
+  to using ``ParseResults.dump()``; if used for data transfer,
+  use ``ParseResults.as_dict()`` to convert to a nested Python
+  dict, which can then be converted to XML or JSON or
+  other transfer format
+
+- ``operatorPrecedence`` synonym for ``infixNotation`` -
+  convert to calling ``infix_notation``
+
+- ``commaSeparatedList`` - convert to using
+  ``pyparsing_common.comma_separated_list``
+
+- ``upcaseTokens`` and ``downcaseTokens`` - convert to using
+  ``pyparsing_common.upcase_tokens`` and ``downcase_tokens``
+
+- ``__compat__.collect_all_And_tokens`` will not be settable to
+  ``False`` to revert to pre-2.3.1 results name behavior -
+  review use of names for ``MatchFirst`` and ``Or`` expressions
+  containing ``And`` expressions, as they will return the
+  complete list of parsed tokens, not just the first one.
+  Use ``pyparsing.enable_diag(pyparsing.Diagnostics.warn_multiple_tokens_in_named_alternation)``
+  to help identify those expressions in your parsers that
+  will have changed as a result.
+
+- Removed support for running ``python setup.py test``. The setuptools
+  maintainers consider the ``test`` command deprecated. To run the
+  Pyparsing tests, use the command ``tox``.
+
+
+Fixed Bugs
+==========
+
+- [Reverted in 3.0.2] Fixed issue when ``LineStart()`` expressions would match input text that was not
+  necessarily at the beginning of a line.
+
+  [The previous behavior was the correct behavior, since it represents the ``LineStart`` as its own
+  matching expression. ``ParserElements`` that must start in column 1 can be wrapped in the new
+  ``AtLineStart`` class.]
+
+- Fixed bug in regex definitions for ``real`` and ``sci_real`` expressions in
+  ``pyparsing_common``.
+
+- Fixed ``FutureWarning`` raised beginning in Python 3.7 for ``Regex`` expressions
+  containing '[' within a regex set.
+
+- Fixed bug in ``PrecededBy`` which caused infinite recursion.
+
+- Fixed bug in ``CloseMatch`` where end location was incorrectly
+  computed; and updated ``partial_gene_match.py`` example.
+
+- Fixed bug in ``indentedBlock`` with a parser using two different
+  types of nested indented blocks with different indent values,
+  but sharing the same indent stack.
+
+- Fixed bug in ``Each`` when using ``Regex``, when the ``Regex`` expression would
+  get parsed twice.
+
+- Fixed bugs in ``Each`` when passed ``OneOrMore`` or ``ZeroOrMore`` expressions:
+
+  - first expression match could be enclosed in an extra nesting level
+  - out-of-order expressions now handled correctly if mixed with required expressions
+  - results names are maintained correctly for these expressions
+
+- Fixed ``FutureWarning`` that sometimes is raised when ``'['`` is passed as a
+  character to ``Word``.
+
+- Fixed debug logging to show failure location after whitespace skipping.
+
+- Fixed ``ParseFatalExceptions`` failing to override normal exceptions or expression
+  matches in ``MatchFirst`` expressions.
+
+- Fixed bug in which ``ParseResults`` replaces a collection type value with an invalid
+  type annotation (as a result of changed behavior in Python 3.9).
+
+- Fixed bug in ``ParseResults`` when calling ``__getattr__`` for special double-underscored
+  methods.
Now raises ``AttributeError`` for non-existent results when accessing a + name starting with '__'. + +- Fixed bug in ``Located`` class when used with a results name. + +- Fixed bug in ``QuotedString`` class when the escaped quote string is not a + repeated character. + +Acknowledgments +=============== +And finally, many thanks to those who helped in the restructuring +of the pyparsing code base as part of this release. Pyparsing now +has more standard package structure, more standard unit tests, +and more standard code formatting (using black). Special thanks +to jdufresne, klahnakoski, mattcarmody, ckeygusuz, +tmiguelt, and toonarmycaptain to name just a few. + +Thanks also to Michael Milton and Max Fischer, who added some +significant new features to pyparsing. \ No newline at end of file diff --git a/examples/0README.html b/examples/0README.html index 617c16e5..ba5bab06 100644 --- a/examples/0README.html +++ b/examples/0README.html @@ -21,12 +21,12 @@

pyparsing Examples

-

  • holaMundo.py ~ submission by Marco Alfonso
    +
  • hola_mundo.py ~ submission by Marco Alfonso
    "Hello, World!" example translated to Spanish, from Marco Alfonso's blog.
  • -

  • chemicalFormulas.py
    +
  • chemical_formulas.py
    Simple example to demonstrate the use of ParseResults returned from parseString(). Parses a chemical formula (such as "H2O" or "C6H5OH"), and walks the returned list of tokens to calculate the molecular weight.
  • @@ -141,17 +141,6 @@

    pyparsing Examples

    -

  • sparser.py ~ submission by Tim Cera
    -A configurable parser module that can be configured with a list of tuples, giving a high-level definition for parsing common sets -of water table data files. Tim had to contend with several different styles of data file formats, each with slight variations of its own. -Tim created a configurable parser (or "SPECIFIED parser" - hence the name "sparser"), that simply works from a config variable listing -the field names and data types, and implicitly, their order in the source data file. -

    -See mayport_florida_8720220_data_def.txt for an -example configuration file. -

  • -

    -

  • romanNumerals.py
    A Roman numeral generator and parser example, showing the power of parse actions to compile Roman numerals into their integer values. @@ -256,26 +245,22 @@

    pyparsing Examples

  • builtin_parse_action_demo.py
    -New in version 1.5.7
    Demonstration of using builtins (min, max, sum, len, etc.) as parse actions.
  • antlr_grammar.py~ submission by Luca DellOlio
    -New in version 1.5.7
    Pyparsing example parsing ANTLR .a files and generating a working pyparsing parser.
  • shapes.py
    -New in version 1.5.7
    Parse actions example simple shape definition syntax, and returning the matched tokens as domain objects instead of just strings.
  • datetimeParseActions.py
    -New in version 1.5.7
    Parse actions example showing a parse action returning a datetime object instead of string tokens, and doing validation of the tokens, raising a ParseException if the given YYYY/MM/DD string does not represent a valid date. @@ -283,7 +268,6 @@

    pyparsing Examples

  • position.py
    -New in version 1.5.7
    Demonstration of a couple of different ways to capture the location a particular expression was found within the overall input string.
  • diff --git a/examples/AcManForm.dfm b/examples/AcManForm.dfm index db80f6a6..087aea14 100644 --- a/examples/AcManForm.dfm +++ b/examples/AcManForm.dfm @@ -511,7 +511,7 @@ object Form1: TForm1 object SearchFindFirst1: TSearchFindFirst Category = 'Search' Caption = 'F&ind First' - Hint = 'Find First|Finds the first occurance of specified text' + Hint = 'Find First|Finds the first occurrence of specified text' end object CustomizeActionBars1: TCustomizeActionBars Category = 'Tools' diff --git a/examples/LAparser.py b/examples/LAparser.py index 330b8f5c..31494b8b 100644 --- a/examples/LAparser.py +++ b/examples/LAparser.py @@ -57,41 +57,54 @@ """ -import re,sys -from pyparsing import Word, alphas, ParseException, Literal, CaselessLiteral \ -, Combine, Optional, nums, Forward, ZeroOrMore, \ - StringEnd, alphanums +import re, sys +from pyparsing import ( + Word, + alphas, + ParseException, + Literal, + CaselessLiteral, + Combine, + Optional, + nums, + Forward, + ZeroOrMore, + StringEnd, + alphanums, +) # Debugging flag can be set to either "debug_flag=True" or "debug_flag=False" -debug_flag=False +debug_flag = False -#---------------------------------------------------------------------------- +# ---------------------------------------------------------------------------- # Variables that hold intermediate parsing results and a couple of # helper functions. -exprStack = [] # Holds operators and operands parsed from input. -targetvar = None # Holds variable name to left of '=' sign in LA equation. +exprStack = [] # Holds operators and operands parsed from input. +targetvar = None # Holds variable name to left of '=' sign in LA equation. -def _pushFirst( str, loc, toks ): - if debug_flag: print("pushing ", toks[0], "str is ", str) - exprStack.append( toks[0] ) +def _pushFirst(str, loc, toks): + if debug_flag: + print("pushing ", toks[0], "str is ", str) + exprStack.append(toks[0]) -def _assignVar( str, loc, toks ): + +def _assignVar(str, loc, toks): global targetvar - targetvar = toks[0] + targetvar = toks[0] + -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # The following statements define the grammar for the parser. -point = Literal('.') -e = CaselessLiteral('E') -plusorminus = Literal('+') | Literal('-') +point = Literal(".") +e = CaselessLiteral("E") +plusorminus = Literal("+") | Literal("-") number = Word(nums) -integer = Combine( Optional(plusorminus) + number ) -floatnumber = Combine( integer + - Optional( point + Optional(number) ) + - Optional( e + integer ) - ) +integer = Combine(Optional(plusorminus) + number) +floatnumber = Combine( + integer + Optional(point + Optional(number)) + Optional(e + integer) +) lbracket = Literal("[") rbracket = Literal("]") @@ -100,57 +113,66 @@ def _assignVar( str, loc, toks ): ## can include references to array elements, rows and columns, e.g., a = b[i] + 5. ## Expressions within []'s are not presently supported, so a = b[i+1] will raise ## a ParseException. 
-ident = Combine(Word(alphas + '-',alphanums + '_') + \ - ZeroOrMore(lbracket + (Word(alphas + '-',alphanums + '_')|integer) + rbracket) \ - ) - -plus = Literal( "+" ) -minus = Literal( "-" ) -mult = Literal( "*" ) -div = Literal( "/" ) -outer = Literal( "@" ) -lpar = Literal( "(" ).suppress() -rpar = Literal( ")" ).suppress() -addop = plus | minus +ident = Combine( + Word(alphas + "-", alphanums + "_") + + ZeroOrMore(lbracket + (Word(alphas + "-", alphanums + "_") | integer) + rbracket) +) + +plus = Literal("+") +minus = Literal("-") +mult = Literal("*") +div = Literal("/") +outer = Literal("@") +lpar = Literal("(").suppress() +rpar = Literal(")").suppress() +addop = plus | minus multop = mult | div | outer -expop = Literal( "^" ) -assignop = Literal( "=" ) +expop = Literal("^") +assignop = Literal("=") expr = Forward() -atom = ( ( e | floatnumber | integer | ident ).setParseAction(_pushFirst) | - ( lpar + expr.suppress() + rpar ) - ) +atom = (e | floatnumber | integer | ident).setParseAction(_pushFirst) | ( + lpar + expr.suppress() + rpar +) factor = Forward() -factor << atom + ZeroOrMore( ( expop + factor ).setParseAction( _pushFirst ) ) +factor << atom + ZeroOrMore((expop + factor).setParseAction(_pushFirst)) -term = factor + ZeroOrMore( ( multop + factor ).setParseAction( _pushFirst ) ) -expr << term + ZeroOrMore( ( addop + term ).setParseAction( _pushFirst ) ) +term = factor + ZeroOrMore((multop + factor).setParseAction(_pushFirst)) +expr << term + ZeroOrMore((addop + term).setParseAction(_pushFirst)) equation = (ident + assignop).setParseAction(_assignVar) + expr + StringEnd() # End of grammar definition -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- ## The following are helper variables and functions used by the Binary Infix Operator ## Functions described below. -vprefix = 'V3_' +vprefix = "V3_" vplen = len(vprefix) -mprefix = 'M3_' +mprefix = "M3_" mplen = len(mprefix) ## We don't support unary negation for vectors and matrices -class UnaryUnsupportedError(Exception): pass +class UnaryUnsupportedError(Exception): + pass + def _isvec(ident): - if ident[0] == '-' and ident[1:vplen+1] == vprefix: - raise UnaryUnsupportedError - else: return ident[0:vplen] == vprefix + if ident[0] == "-" and ident[1 : vplen + 1] == vprefix: + raise UnaryUnsupportedError + else: + return ident[0:vplen] == vprefix + def _ismat(ident): - if ident[0] == '-' and ident[1:mplen+1] == mprefix: - raise UnaryUnsupportedError - else: return ident[0:mplen] == mprefix + if ident[0] == "-" and ident[1 : mplen + 1] == mprefix: + raise UnaryUnsupportedError + else: + return ident[0:mplen] == mprefix + + +def _isscalar(ident): + return not (_isvec(ident) or _ismat(ident)) -def _isscalar(ident): return not (_isvec(ident) or _ismat(ident)) ## Binary infix operator (BIO) functions. These are called when the stack evaluator ## pops a binary operator like '+' or '*". The stack evaluator pops the two operand, a and b, @@ -164,80 +186,121 @@ def _isscalar(ident): return not (_isvec(ident) or _ismat(ident)) ## the appropriate prefix is placed on the outer function for removal later as the stack evaluation ## recurses toward the final assignment statement. 
-def _addfunc(a,b): - if _isscalar(a) and _isscalar(b): return "(%s+%s)"%(a,b) - if _isvec(a) and _isvec(b): return "%svAdd(%s,%s)"%(vprefix,a[vplen:],b[vplen:]) - if _ismat(a) and _ismat(b): return "%smAdd(%s,%s)"%(mprefix,a[mplen:],b[mplen:]) - else: raise TypeError - -def _subfunc(a,b): - if _isscalar(a) and _isscalar(b): return "(%s-%s)"%(a,b) - if _isvec(a) and _isvec(b): return "%svSubtract(%s,%s)"%(vprefix,a[vplen:],b[vplen:]) - if _ismat(a) and _ismat(b): return "%smSubtract(%s,%s)"%(mprefix,a[mplen:],b[mplen:]) - else: raise TypeError - -def _mulfunc(a,b): - if _isscalar(a) and _isscalar(b): return "%s*%s"%(a,b) - if _isvec(a) and _isvec(b): return "vDot(%s,%s)"%(a[vplen:],b[vplen:]) - if _ismat(a) and _ismat(b): return "%smMultiply(%s,%s)"%(mprefix,a[mplen:],b[mplen:]) - if _ismat(a) and _isvec(b): return "%smvMultiply(%s,%s)"%(vprefix,a[mplen:],b[vplen:]) - if _ismat(a) and _isscalar(b): return "%smScale(%s,%s)"%(mprefix,a[mplen:],b) - if _isvec(a) and _isscalar(b): return "%svScale(%s,%s)"%(vprefix,a[mplen:],b) - else: raise TypeError - -def _outermulfunc(a,b): - ## The '@' operator is used for the vector outer product. - if _isvec(a) and _isvec(b): - return "%svOuterProduct(%s,%s)"%(mprefix,a[vplen:],b[vplen:]) - else: raise TypeError - -def _divfunc(a,b): - ## The '/' operator is used only for scalar division - if _isscalar(a) and _isscalar(b): return "%s/%s"%(a,b) - else: raise TypeError - -def _expfunc(a,b): - ## The '^' operator is used for exponentiation on scalars and - ## as a marker for unary operations on vectors and matrices. - if _isscalar(a) and _isscalar(b): return "pow(%s,%s)"%(str(a),str(b)) - if _ismat(a) and b=='-1': return "%smInverse(%s)"%(mprefix,a[mplen:]) - if _ismat(a) and b=='T': return "%smTranspose(%s)"%(mprefix,a[mplen:]) - if _ismat(a) and b=='Det': return "mDeterminant(%s)"%(a[mplen:]) - if _isvec(a) and b=='Mag': return "sqrt(vMagnitude2(%s))"%(a[vplen:]) - if _isvec(a) and b=='Mag2': return "vMagnitude2(%s)"%(a[vplen:]) - else: raise TypeError - -def _assignfunc(a,b): - ## The '=' operator is used for assignment - if _isscalar(a) and _isscalar(b): return "%s=%s"%(a,b) - if _isvec(a) and _isvec(b): return "vCopy(%s,%s)"%(a[vplen:],b[vplen:]) - if _ismat(a) and _ismat(b): return "mCopy(%s,%s)"%(a[mplen:],b[mplen:]) - else: raise TypeError + +def _addfunc(a, b): + if _isscalar(a) and _isscalar(b): + return "(%s+%s)" % (a, b) + if _isvec(a) and _isvec(b): + return "%svAdd(%s,%s)" % (vprefix, a[vplen:], b[vplen:]) + if _ismat(a) and _ismat(b): + return "%smAdd(%s,%s)" % (mprefix, a[mplen:], b[mplen:]) + else: + raise TypeError + + +def _subfunc(a, b): + if _isscalar(a) and _isscalar(b): + return "(%s-%s)" % (a, b) + if _isvec(a) and _isvec(b): + return "%svSubtract(%s,%s)" % (vprefix, a[vplen:], b[vplen:]) + if _ismat(a) and _ismat(b): + return "%smSubtract(%s,%s)" % (mprefix, a[mplen:], b[mplen:]) + else: + raise TypeError + + +def _mulfunc(a, b): + if _isscalar(a) and _isscalar(b): + return "%s*%s" % (a, b) + if _isvec(a) and _isvec(b): + return "vDot(%s,%s)" % (a[vplen:], b[vplen:]) + if _ismat(a) and _ismat(b): + return "%smMultiply(%s,%s)" % (mprefix, a[mplen:], b[mplen:]) + if _ismat(a) and _isvec(b): + return "%smvMultiply(%s,%s)" % (vprefix, a[mplen:], b[vplen:]) + if _ismat(a) and _isscalar(b): + return "%smScale(%s,%s)" % (mprefix, a[mplen:], b) + if _isvec(a) and _isscalar(b): + return "%svScale(%s,%s)" % (vprefix, a[mplen:], b) + else: + raise TypeError + + +def _outermulfunc(a, b): + ## The '@' operator is used for the vector outer 
product. + if _isvec(a) and _isvec(b): + return "%svOuterProduct(%s,%s)" % (mprefix, a[vplen:], b[vplen:]) + else: + raise TypeError + + +def _divfunc(a, b): + ## The '/' operator is used only for scalar division + if _isscalar(a) and _isscalar(b): + return "%s/%s" % (a, b) + else: + raise TypeError + + +def _expfunc(a, b): + ## The '^' operator is used for exponentiation on scalars and + ## as a marker for unary operations on vectors and matrices. + if _isscalar(a) and _isscalar(b): + return "pow(%s,%s)" % (str(a), str(b)) + if _ismat(a) and b == "-1": + return "%smInverse(%s)" % (mprefix, a[mplen:]) + if _ismat(a) and b == "T": + return "%smTranspose(%s)" % (mprefix, a[mplen:]) + if _ismat(a) and b == "Det": + return "mDeterminant(%s)" % (a[mplen:]) + if _isvec(a) and b == "Mag": + return "sqrt(vMagnitude2(%s))" % (a[vplen:]) + if _isvec(a) and b == "Mag2": + return "vMagnitude2(%s)" % (a[vplen:]) + else: + raise TypeError + + +def _assignfunc(a, b): + ## The '=' operator is used for assignment + if _isscalar(a) and _isscalar(b): + return "%s=%s" % (a, b) + if _isvec(a) and _isvec(b): + return "vCopy(%s,%s)" % (a[vplen:], b[vplen:]) + if _ismat(a) and _ismat(b): + return "mCopy(%s,%s)" % (a[mplen:], b[mplen:]) + else: + raise TypeError + ## End of BIO func definitions ##---------------------------------------------------------------------------- # Map operator symbols to corresponding BIO funcs -opn = { "+" : ( _addfunc ), - "-" : ( _subfunc ), - "*" : ( _mulfunc ), - "@" : ( _outermulfunc ), - "/" : ( _divfunc), - "^" : ( _expfunc ), } +opn = { + "+": (_addfunc), + "-": (_subfunc), + "*": (_mulfunc), + "@": (_outermulfunc), + "/": (_divfunc), + "^": (_expfunc), +} ##---------------------------------------------------------------------------- # Recursive function that evaluates the expression stack -def _evaluateStack( s ): - op = s.pop() - if op in "+-*/@^": - op2 = _evaluateStack( s ) - op1 = _evaluateStack( s ) - result = opn[op]( op1, op2 ) - if debug_flag: print(result) - return result - else: - return op +def _evaluateStack(s): + op = s.pop() + if op in "+-*/@^": + op2 = _evaluateStack(s) + op1 = _evaluateStack(s) + result = opn[op](op1, op2) + if debug_flag: + print(result) + return result + else: + return op + ##---------------------------------------------------------------------------- # The parse function that invokes all of the above. @@ -248,171 +311,221 @@ def parse(input_string): calls that implement the expression. """ - global exprStack + global exprStack global targetvar # Start with a blank exprStack and a blank targetvar exprStack = [] - targetvar=None - - if input_string != '': - # try parsing the input string - try: - L=equation.parseString( input_string ) - except ParseException as err: - print('Parse Failure', file=sys.stderr) - print(err.line, file=sys.stderr) - print(" "*(err.column-1) + "^", file=sys.stderr) - print(err, file=sys.stderr) - raise - - # show result of parsing the input string - if debug_flag: - print(input_string, "->", L) - print("exprStack=", exprStack) - - # Evaluate the stack of parsed operands, emitting C code. - try: - result=_evaluateStack(exprStack) - except TypeError: - print("Unsupported operation on right side of '%s'.\nCheck for missing or incorrect tags on non-scalar operands."%input_string, file=sys.stderr) - raise - except UnaryUnsupportedError: - print("Unary negation is not supported for vectors and matrices: '%s'"%input_string, file=sys.stderr) - raise - - # Create final assignment and print it. 
- if debug_flag: print("var=",targetvar) - if targetvar != None: - try: - result = _assignfunc(targetvar,result) - except TypeError: - print("Left side tag does not match right side of '%s'"%input_string, file=sys.stderr) - raise - except UnaryUnsupportedError: - print("Unary negation is not supported for vectors and matrices: '%s'"%input_string, file=sys.stderr) + targetvar = None + + if input_string != "": + # try parsing the input string + try: + L = equation.parseString(input_string) + except ParseException as err: + print("Parse Failure", file=sys.stderr) + print(err.line, file=sys.stderr) + print(" " * (err.column - 1) + "^", file=sys.stderr) + print(err, file=sys.stderr) raise - return result - else: - print("Empty left side in '%s'"%input_string, file=sys.stderr) - raise TypeError - -##----------------------------------------------------------------------------------- -def fprocess(infilep,outfilep): - """ - Scans an input file for LA equations between double square brackets, - e.g. [[ M3_mymatrix = M3_anothermatrix^-1 ]], and replaces the expression - with a comment containing the equation followed by nested function calls - that implement the equation as C code. A trailing semi-colon is appended. - The equation within [[ ]] should NOT end with a semicolon as that will raise - a ParseException. However, it is ok to have a semicolon after the right brackets. - - Other text in the file is unaltered. - - The arguments are file objects (NOT file names) opened for reading and - writing, respectively. - """ - pattern = r'\[\[\s*(.*?)\s*\]\]' - eqn = re.compile(pattern,re.DOTALL) - s = infilep.read() - def parser(mo): - ccode = parse(mo.group(1)) - return "/* %s */\n%s;\nLAParserBufferReset();\n"%(mo.group(1),ccode) - - content = eqn.sub(parser,s) - outfilep.write(content) - -##----------------------------------------------------------------------------------- -def test(): - """ - Tests the parsing of various supported expressions. Raises - an AssertError if the output is not what is expected. Prints the - input, expected output, and actual output for all tests. 
- """ - print("Testing LAParser") - testcases = [ - ("Scalar addition","a = b+c","a=(b+c)"), - ("Vector addition","V3_a = V3_b + V3_c","vCopy(a,vAdd(b,c))"), - ("Vector addition","V3_a=V3_b+V3_c","vCopy(a,vAdd(b,c))"), - ("Matrix addition","M3_a = M3_b + M3_c","mCopy(a,mAdd(b,c))"), - ("Matrix addition","M3_a=M3_b+M3_c","mCopy(a,mAdd(b,c))"), - ("Scalar subtraction","a = b-c","a=(b-c)"), - ("Vector subtraction","V3_a = V3_b - V3_c","vCopy(a,vSubtract(b,c))"), - ("Matrix subtraction","M3_a = M3_b - M3_c","mCopy(a,mSubtract(b,c))"), - ("Scalar multiplication","a = b*c","a=b*c"), - ("Scalar division","a = b/c","a=b/c"), - ("Vector multiplication (dot product)","a = V3_b * V3_c","a=vDot(b,c)"), - ("Vector multiplication (outer product)","M3_a = V3_b @ V3_c","mCopy(a,vOuterProduct(b,c))"), - ("Matrix multiplication","M3_a = M3_b * M3_c","mCopy(a,mMultiply(b,c))"), - ("Vector scaling","V3_a = V3_b * c","vCopy(a,vScale(b,c))"), - ("Matrix scaling","M3_a = M3_b * c","mCopy(a,mScale(b,c))"), - ("Matrix by vector multiplication","V3_a = M3_b * V3_c","vCopy(a,mvMultiply(b,c))"), - ("Scalar exponentiation","a = b^c","a=pow(b,c)"), - ("Matrix inversion","M3_a = M3_b^-1","mCopy(a,mInverse(b))"), - ("Matrix transpose","M3_a = M3_b^T","mCopy(a,mTranspose(b))"), - ("Matrix determinant","a = M3_b^Det","a=mDeterminant(b)"), - ("Vector magnitude squared","a = V3_b^Mag2","a=vMagnitude2(b)"), - ("Vector magnitude","a = V3_b^Mag","a=sqrt(vMagnitude2(b))"), - ("Complicated expression", "myscalar = (M3_amatrix * V3_bvector)^Mag + 5*(-xyz[i] + 2.03^2)","myscalar=(sqrt(vMagnitude2(mvMultiply(amatrix,bvector)))+5*(-xyz[i]+pow(2.03,2)))"), - ("Complicated Multiline", "myscalar = \n(M3_amatrix * V3_bvector)^Mag +\n 5*(xyz + 2.03^2)","myscalar=(sqrt(vMagnitude2(mvMultiply(amatrix,bvector)))+5*(xyz+pow(2.03,2)))") - - ] - - - all_passed = [True] - - def post_test(test, parsed): - - # copy exprStack to evaluate and clear before running next test - parsed_stack = exprStack[:] - exprStack.clear() - - name, testcase, expected = next(tc for tc in testcases if tc[1] == test) - - this_test_passed = False - try: - try: - result=_evaluateStack(parsed_stack) - except TypeError: - print("Unsupported operation on right side of '%s'.\nCheck for missing or incorrect tags on non-scalar operands."%input_string, file=sys.stderr) + # show result of parsing the input string + if debug_flag: + print(input_string, "->", L) + print("exprStack=", exprStack) + + # Evaluate the stack of parsed operands, emitting C code. + try: + result = _evaluateStack(exprStack) + except TypeError: + print( + "Unsupported operation on right side of '%s'.\nCheck for missing or incorrect tags on non-scalar operands." + % input_string, + file=sys.stderr, + ) raise - except UnaryUnsupportedError: - print("Unary negation is not supported for vectors and matrices: '%s'"%input_string, file=sys.stderr) + except UnaryUnsupportedError: + print( + "Unary negation is not supported for vectors and matrices: '%s'" + % input_string, + file=sys.stderr, + ) raise - # Create final assignment and print it. - if debug_flag: print("var=",targetvar) - if targetvar != None: - try: - result = _assignfunc(targetvar,result) - except TypeError: - print("Left side tag does not match right side of '%s'"%input_string, file=sys.stderr) + # Create final assignment and print it. 
+ if debug_flag: + print("var=", targetvar) + if targetvar != None: + try: + result = _assignfunc(targetvar, result) + except TypeError: + print( + "Left side tag does not match right side of '%s'" % input_string, + file=sys.stderr, + ) raise - except UnaryUnsupportedError: - print("Unary negation is not supported for vectors and matrices: '%s'"%input_string, file=sys.stderr) + except UnaryUnsupportedError: + print( + "Unary negation is not supported for vectors and matrices: '%s'" + % input_string, + file=sys.stderr, + ) raise - else: - print("Empty left side in '%s'"%input_string, file=sys.stderr) + return result + else: + print("Empty left side in '%s'" % input_string, file=sys.stderr) raise TypeError - parsed['result'] = result - parsed['passed'] = this_test_passed = result == expected - finally: - all_passed[0] = all_passed[0] and this_test_passed - print('\n' + name) +##----------------------------------------------------------------------------------- +def fprocess(infilep, outfilep): + """ + Scans an input file for LA equations between double square brackets, + e.g. [[ M3_mymatrix = M3_anothermatrix^-1 ]], and replaces the expression + with a comment containing the equation followed by nested function calls + that implement the equation as C code. A trailing semi-colon is appended. + The equation within [[ ]] should NOT end with a semicolon as that will raise + a ParseException. However, it is ok to have a semicolon after the right brackets. + + Other text in the file is unaltered. + + The arguments are file objects (NOT file names) opened for reading and + writing, respectively. + """ + pattern = r"\[\[\s*(.*?)\s*\]\]" + eqn = re.compile(pattern, re.DOTALL) + s = infilep.read() - equation.runTests((t[1] for t in testcases), postParse=post_test) + def parser(mo): + ccode = parse(mo.group(1)) + return "/* %s */\n%s;\nLAParserBufferReset();\n" % (mo.group(1), ccode) + content = eqn.sub(parser, s) + outfilep.write(content) - ##TODO: Write testcases with invalid expressions and test that the expected - ## exceptions are raised. - print("Tests completed!") - print("PASSED" if all_passed[0] else "FAILED") - assert all_passed[0] +##----------------------------------------------------------------------------------- +def test(): + """ + Tests the parsing of various supported expressions. Raises + an AssertError if the output is not what is expected. Prints the + input, expected output, and actual output for all tests. 
+ """ + print("Testing LAParser") + testcases = [ + ("Scalar addition", "a = b+c", "a=(b+c)"), + ("Vector addition", "V3_a = V3_b + V3_c", "vCopy(a,vAdd(b,c))"), + ("Vector addition", "V3_a=V3_b+V3_c", "vCopy(a,vAdd(b,c))"), + ("Matrix addition", "M3_a = M3_b + M3_c", "mCopy(a,mAdd(b,c))"), + ("Matrix addition", "M3_a=M3_b+M3_c", "mCopy(a,mAdd(b,c))"), + ("Scalar subtraction", "a = b-c", "a=(b-c)"), + ("Vector subtraction", "V3_a = V3_b - V3_c", "vCopy(a,vSubtract(b,c))"), + ("Matrix subtraction", "M3_a = M3_b - M3_c", "mCopy(a,mSubtract(b,c))"), + ("Scalar multiplication", "a = b*c", "a=b*c"), + ("Scalar division", "a = b/c", "a=b/c"), + ("Vector multiplication (dot product)", "a = V3_b * V3_c", "a=vDot(b,c)"), + ( + "Vector multiplication (outer product)", + "M3_a = V3_b @ V3_c", + "mCopy(a,vOuterProduct(b,c))", + ), + ("Matrix multiplication", "M3_a = M3_b * M3_c", "mCopy(a,mMultiply(b,c))"), + ("Vector scaling", "V3_a = V3_b * c", "vCopy(a,vScale(b,c))"), + ("Matrix scaling", "M3_a = M3_b * c", "mCopy(a,mScale(b,c))"), + ( + "Matrix by vector multiplication", + "V3_a = M3_b * V3_c", + "vCopy(a,mvMultiply(b,c))", + ), + ("Scalar exponentiation", "a = b^c", "a=pow(b,c)"), + ("Matrix inversion", "M3_a = M3_b^-1", "mCopy(a,mInverse(b))"), + ("Matrix transpose", "M3_a = M3_b^T", "mCopy(a,mTranspose(b))"), + ("Matrix determinant", "a = M3_b^Det", "a=mDeterminant(b)"), + ("Vector magnitude squared", "a = V3_b^Mag2", "a=vMagnitude2(b)"), + ("Vector magnitude", "a = V3_b^Mag", "a=sqrt(vMagnitude2(b))"), + ( + "Complicated expression", + "myscalar = (M3_amatrix * V3_bvector)^Mag + 5*(-xyz[i] + 2.03^2)", + "myscalar=(sqrt(vMagnitude2(mvMultiply(amatrix,bvector)))+5*(-xyz[i]+pow(2.03,2)))", + ), + ( + "Complicated Multiline", + "myscalar = \n(M3_amatrix * V3_bvector)^Mag +\n 5*(xyz + 2.03^2)", + "myscalar=(sqrt(vMagnitude2(mvMultiply(amatrix,bvector)))+5*(xyz+pow(2.03,2)))", + ), + ] + + all_passed = [True] + + def post_test(test, parsed): + + # copy exprStack to evaluate and clear before running next test + parsed_stack = exprStack[:] + exprStack.clear() + + name, testcase, expected = next(tc for tc in testcases if tc[1] == test) + + this_test_passed = False + try: + try: + result = _evaluateStack(parsed_stack) + except TypeError: + print( + "Unsupported operation on right side of '%s'.\nCheck for missing or incorrect tags on non-scalar operands." + % input_string, + file=sys.stderr, + ) + raise + except UnaryUnsupportedError: + print( + "Unary negation is not supported for vectors and matrices: '%s'" + % input_string, + file=sys.stderr, + ) + raise + + # Create final assignment and print it. + if debug_flag: + print("var=", targetvar) + if targetvar != None: + try: + result = _assignfunc(targetvar, result) + except TypeError: + print( + "Left side tag does not match right side of '%s'" + % input_string, + file=sys.stderr, + ) + raise + except UnaryUnsupportedError: + print( + "Unary negation is not supported for vectors and matrices: '%s'" + % input_string, + file=sys.stderr, + ) + raise + + else: + print("Empty left side in '%s'" % input_string, file=sys.stderr) + raise TypeError + + parsed["result"] = result + parsed["passed"] = this_test_passed = result == expected + + finally: + all_passed[0] = all_passed[0] and this_test_passed + print("\n" + name) + + equation.runTests((t[1] for t in testcases), postParse=post_test) + + ##TODO: Write testcases with invalid expressions and test that the expected + ## exceptions are raised. 
+ + print("Tests completed!") + print("PASSED" if all_passed[0] else "FAILED") + assert all_passed[0] + ##---------------------------------------------------------------------------- ## The following is executed only when this module is executed as @@ -420,43 +533,44 @@ def post_test(test, parsed): ## and then enters an interactive loop where you ## can enter expressions and see the resulting C code as output. -if __name__ == '__main__': +if __name__ == "__main__": - import sys - if not sys.flags.interactive: - # run testcases - test() - sys.exit(0) + import sys - # input_string - input_string='' + if not sys.flags.interactive: + # run testcases + test() + sys.exit(0) - # Display instructions on how to use the program interactively - interactiveusage = """ + # input_string + input_string = "" + + # Display instructions on how to use the program interactively + interactiveusage = """ Entering interactive mode: Type in an equation to be parsed or 'quit' to exit the program. Type 'debug on' to print parsing details as each string is processed. Type 'debug off' to stop printing parsing details """ - print(interactiveusage) - input_string = input("> ") - - while input_string != 'quit': - if input_string == "debug on": - debug_flag = True - elif input_string == "debug off": - debug_flag = False - else: - try: - print(parse(input_string)) - except Exception: - pass - - # obtain new input string + print(interactiveusage) input_string = input("> ") - # if user types 'quit' then say goodbye - print("Good bye!") - import os - os._exit(0) - + while input_string != "quit": + if input_string == "debug on": + debug_flag = True + elif input_string == "debug off": + debug_flag = False + else: + try: + print(parse(input_string)) + except Exception: + pass + + # obtain new input string + input_string = input("> ") + + # if user types 'quit' then say goodbye + print("Good bye!") + import os + + os._exit(0) diff --git a/examples/SimpleCalc.py b/examples/SimpleCalc.py index 15a18170..7ace9aea 100644 --- a/examples/SimpleCalc.py +++ b/examples/SimpleCalc.py @@ -1,117 +1,124 @@ -# SimpleCalc.py -# -# Demonstration of the parsing module, -# Sample usage -# -# $ python SimpleCalc.py -# Type in the string to be parse or 'quit' to exit the program -# > g=67.89 + 7/5 -# 69.29 -# > g -# 69.29 -# > h=(6*g+8.8)-g -# 355.25 -# > h + 1 -# 356.25 -# > 87.89 + 7/5 -# 89.29 -# > ans+10 -# 99.29 -# > quit -# Good bye! 
-# -# - - - -# Uncomment the line below for readline support on interactive terminal -# import readline -from pyparsing import ParseException, Word, alphas, alphanums -import math - -# Debugging flag can be set to either "debug_flag=True" or "debug_flag=False" -debug_flag=False - -variables = {} - -from fourFn import BNF, exprStack, fn, opn -def evaluateStack( s ): - op = s.pop() - if op == 'unary -': - return -evaluateStack( s ) - if op in "+-*/^": - op2 = evaluateStack( s ) - op1 = evaluateStack( s ) - return opn[op]( op1, op2 ) - elif op == "PI": - return math.pi # 3.1415926535 - elif op == "E": - return math.e # 2.718281828 - elif op in fn: - return fn[op]( evaluateStack( s ) ) - elif op[0].isalpha(): - if op in variables: - return variables[op] - raise Exception("invalid identifier '%s'" % op) - else: - return float( op ) - -arithExpr = BNF() -ident = Word(alphas, alphanums).setName("identifier") -assignment = ident("varname") + '=' + arithExpr -pattern = assignment | arithExpr - -if __name__ == '__main__': - # input_string - input_string='' - - # Display instructions on how to quit the program - print("Type in the string to be parsed or 'quit' to exit the program") - input_string = input("> ") - - while input_string.strip().lower() != 'quit': - if input_string.strip().lower() == 'debug': - debug_flag=True - input_string = input("> ") - continue - - # Reset to an empty exprStack - del exprStack[:] - - if input_string != '': - # try parsing the input string - try: - L=pattern.parseString(input_string, parseAll=True) - except ParseException as err: - L=['Parse Failure', input_string, (str(err), err.line, err.column)] - - # show result of parsing the input string - if debug_flag: print(input_string, "->", L) - if len(L)==0 or L[0] != 'Parse Failure': - if debug_flag: print("exprStack=", exprStack) - - # calculate result , store a copy in ans , display the result to user - try: - result=evaluateStack(exprStack) - except Exception as e: - print(str(e)) - else: - variables['ans']=result - print(result) - - # Assign result to a variable if required - if L.varname: - variables[L.varname] = result - if debug_flag: print("variables=", variables) - else: - print('Parse Failure') - err_str, err_line, err_col = L[-1] - print(err_line) - print(" "*(err_col-1) + "^") - print(err_str) - - # obtain new input string - input_string = input("> ") - - # if user type 'quit' then say goodbye - print("Good bye!") +# SimpleCalc.py +# +# Demonstration of the parsing module, +# Sample usage +# +# $ python SimpleCalc.py +# Type in the string to be parse or 'quit' to exit the program +# > g=67.89 + 7/5 +# 69.29 +# > g +# 69.29 +# > h=(6*g+8.8)-g +# 355.25 +# > h + 1 +# 356.25 +# > 87.89 + 7/5 +# 89.29 +# > ans+10 +# 99.29 +# > quit +# Good bye! 
+# +# + + +# Uncomment the line below for readline support on interactive terminal +# import readline +from pyparsing import ParseException, Word, alphas, alphanums + +# Debugging flag can be set to either "debug_flag=True" or "debug_flag=False" +debug_flag = False + +variables = {} + +from fourFn import BNF, exprStack, evaluate_stack + +# from fourFn import BNF, exprStack, fn, opn +# def evaluateStack( s ): +# op = s.pop() +# if op == 'unary -': +# return -evaluateStack( s ) +# if op in "+-*/^": +# op2 = evaluateStack( s ) +# op1 = evaluateStack( s ) +# return opn[op]( op1, op2 ) +# elif op == "PI": +# return math.pi # 3.1415926535 +# elif op == "E": +# return math.e # 2.718281828 +# elif op in fn: +# return fn[op]( evaluateStack( s ) ) +# elif op[0].isalpha(): +# if op in variables: +# return variables[op] +# raise Exception("invalid identifier '%s'" % op) +# else: +# return float( op ) + +arithExpr = BNF() +ident = Word(alphas, alphanums).setName("identifier") +assignment = ident("varname") + "=" + arithExpr +pattern = assignment | arithExpr + +if __name__ == "__main__": + # input_string + input_string = "" + + # Display instructions on how to quit the program + print("Type in the string to be parsed or 'quit' to exit the program") + input_string = input("> ") + + while input_string.strip().lower() != "quit": + if input_string.strip().lower() == "debug": + debug_flag = True + input_string = input("> ") + continue + + # Reset to an empty exprStack + del exprStack[:] + + if input_string != "": + # try parsing the input string + try: + L = pattern.parseString(input_string, parseAll=True) + except ParseException as err: + L = ["Parse Failure", input_string, (str(err), err.line, err.column)] + + # show result of parsing the input string + if debug_flag: + print(input_string, "->", L) + if len(L) == 0 or L[0] != "Parse Failure": + if debug_flag: + print("exprStack=", exprStack) + + for i, ob in enumerate(exprStack): + if isinstance(ob, str) and ob in variables: + exprStack[i] = str(variables[ob]) + + # calculate result , store a copy in ans , display the result to user + try: + result = evaluate_stack(exprStack) + except Exception as e: + print(str(e)) + else: + variables["ans"] = result + print(result) + + # Assign result to a variable if required + if L.varname: + variables[L.varname] = result + if debug_flag: + print("variables=", variables) + else: + print("Parse Failure") + err_str, err_line, err_col = L[-1] + print(err_line) + print(" " * (err_col - 1) + "^") + print(err_str) + + # obtain new input string + input_string = input("> ") + + # if user type 'quit' then say goodbye + print("Good bye!") diff --git a/examples/TAP.py b/examples/TAP.py index 18f57fdc..788a656a 100644 --- a/examples/TAP.py +++ b/examples/TAP.py @@ -22,11 +22,24 @@ # Copyright 2008, by Paul McGuire # -from pyparsing import ParserElement,LineEnd,Optional,Word,nums,Regex,\ - Literal,CaselessLiteral,Group,OneOrMore,Suppress,restOfLine,\ - FollowedBy,empty - -__all__ = ['tapOutputParser', 'TAPTest', 'TAPSummary'] +from pyparsing import ( + ParserElement, + LineEnd, + Optional, + Word, + nums, + Regex, + Literal, + CaselessLiteral, + Group, + OneOrMore, + Suppress, + restOfLine, + FollowedBy, + empty, +) + +__all__ = ["tapOutputParser", "TAPTest", "TAPSummary"] # newlines are significant whitespace, so set default skippable # whitespace to just spaces and tabs @@ -34,51 +47,58 @@ NL = LineEnd().suppress() integer = Word(nums) -plan = '1..' + integer("ubound") +plan = "1.." 
+ integer("ubound") -OK,NOT_OK = map(Literal,['ok','not ok']) -testStatus = (OK | NOT_OK) +OK, NOT_OK = map(Literal, ["ok", "not ok"]) +testStatus = OK | NOT_OK description = Regex("[^#\n]+") -description.setParseAction(lambda t:t[0].lstrip('- ')) - -TODO,SKIP = map(CaselessLiteral,'TODO SKIP'.split()) -directive = Group(Suppress('#') + (TODO + restOfLine | - FollowedBy(SKIP) + - restOfLine.copy().setParseAction(lambda t:['SKIP',t[0]]) )) +description.setParseAction(lambda t: t[0].lstrip("- ")) + +TODO, SKIP = map(CaselessLiteral, "TODO SKIP".split()) +directive = Group( + Suppress("#") + + ( + TODO + restOfLine + | FollowedBy(SKIP) + restOfLine.copy().setParseAction(lambda t: ["SKIP", t[0]]) + ) +) commentLine = Suppress("#") + empty + restOfLine testLine = Group( - Optional(OneOrMore(commentLine + NL))("comments") + - testStatus("passed") + - Optional(integer)("testNumber") + - Optional(description)("description") + - Optional(directive)("directive") - ) -bailLine = Group(Literal("Bail out!")("BAIL") + - empty + Optional(restOfLine)("reason")) + Optional(OneOrMore(commentLine + NL))("comments") + + testStatus("passed") + + Optional(integer)("testNumber") + + Optional(description)("description") + + Optional(directive)("directive") +) +bailLine = Group(Literal("Bail out!")("BAIL") + empty + Optional(restOfLine)("reason")) + +tapOutputParser = Optional(Group(plan)("plan") + NL) & Group( + OneOrMore((testLine | bailLine) + NL) +)("tests") -tapOutputParser = Optional(Group(plan)("plan") + NL) & \ - Group(OneOrMore((testLine|bailLine) + NL))("tests") -class TAPTest(object): - def __init__(self,results): +class TAPTest: + def __init__(self, results): self.num = results.testNumber - self.passed = (results.passed=="ok") + self.passed = results.passed == "ok" self.skipped = self.todo = False if results.directive: - self.skipped = (results.directive[0][0]=='SKIP') - self.todo = (results.directive[0][0]=='TODO') + self.skipped = results.directive[0][0] == "SKIP" + self.todo = results.directive[0][0] == "TODO" + @classmethod - def bailedTest(cls,num): + def bailedTest(cls, num): ret = TAPTest(empty.parseString("")) ret.num = num ret.skipped = True return ret -class TAPSummary(object): - def __init__(self,results): + +class TAPSummary: + def __init__(self, results): self.passedTests = [] self.failedTests = [] self.skippedTests = [] @@ -86,22 +106,22 @@ def __init__(self,results): self.bonusTests = [] self.bail = False if results.plan: - expected = list(range(1, int(results.plan.ubound)+1)) + expected = list(range(1, int(results.plan.ubound) + 1)) else: - expected = list(range(1,len(results.tests)+1)) + expected = list(range(1, len(results.tests) + 1)) - for i,res in enumerate(results.tests): + for i, res in enumerate(results.tests): # test for bail out if res.BAIL: - #~ print "Test suite aborted: " + res.reason - #~ self.failedTests += expected[i:] + # ~ print "Test suite aborted: " + res.reason + # ~ self.failedTests += expected[i:] self.bail = True - self.skippedTests += [ TAPTest.bailedTest(ii) for ii in expected[i:] ] + self.skippedTests += [TAPTest.bailedTest(ii) for ii in expected[i:]] self.bailReason = res.reason break - #~ print res.dump() - testnum = i+1 + # ~ print res.dump() + testnum = i + 1 if res.testNumber != "": if testnum != int(res.testNumber): print("ERROR! 
test %(testNumber)s out of sequence" % res) @@ -113,37 +133,43 @@ def __init__(self,results): self.passedTests.append(test) else: self.failedTests.append(test) - if test.skipped: self.skippedTests.append(test) - if test.todo: self.todoTests.append(test) - if test.todo and test.passed: self.bonusTests.append(test) + if test.skipped: + self.skippedTests.append(test) + if test.todo: + self.todoTests.append(test) + if test.todo and test.passed: + self.bonusTests.append(test) - self.passedSuite = not self.bail and (set(self.failedTests)-set(self.todoTests) == set()) + self.passedSuite = not self.bail and ( + set(self.failedTests) - set(self.todoTests) == set() + ) def summary(self, showPassed=False, showAll=False): - testListStr = lambda tl : "[" + ",".join(str(t.num) for t in tl) + "]" + testListStr = lambda tl: "[" + ",".join(str(t.num) for t in tl) + "]" summaryText = [] if showPassed or showAll: - summaryText.append( "PASSED: %s" % testListStr(self.passedTests) ) + summaryText.append("PASSED: %s" % testListStr(self.passedTests)) if self.failedTests or showAll: - summaryText.append( "FAILED: %s" % testListStr(self.failedTests) ) + summaryText.append("FAILED: %s" % testListStr(self.failedTests)) if self.skippedTests or showAll: - summaryText.append( "SKIPPED: %s" % testListStr(self.skippedTests) ) + summaryText.append("SKIPPED: %s" % testListStr(self.skippedTests)) if self.todoTests or showAll: - summaryText.append( "TODO: %s" % testListStr(self.todoTests) ) + summaryText.append("TODO: %s" % testListStr(self.todoTests)) if self.bonusTests or showAll: - summaryText.append( "BONUS: %s" % testListStr(self.bonusTests) ) + summaryText.append("BONUS: %s" % testListStr(self.bonusTests)) if self.passedSuite: - summaryText.append( "PASSED" ) + summaryText.append("PASSED") else: - summaryText.append( "FAILED" ) + summaryText.append("FAILED") return "\n".join(summaryText) + # create TAPSummary objects from tapOutput parsed results, by setting # class as parse action tapOutputParser.setParseAction(TAPSummary) -if __name__ == "__main__": +def main(): test1 = """\ 1..4 ok 1 - Input file opened @@ -210,8 +236,12 @@ def summary(self, showPassed=False, showAll=False): 1..7 """ - for test in (test1,test2,test3,test4,test5,test6): + for test in (test1, test2, test3, test4, test5, test6): print(test) tapResult = tapOutputParser.parseString(test)[0] print(tapResult.summary(showAll=True)) print() + + +if __name__ == "__main__": + main() diff --git a/examples/adventureEngine.py b/examples/adventureEngine.py index 8dee3918..4f27d793 100644 --- a/examples/adventureEngine.py +++ b/examples/adventureEngine.py @@ -2,86 +2,96 @@ # Copyright 2005-2006, Paul McGuire # # Updated 2012 - latest pyparsing API +# Updated 2023 - using PEP8 API names # -from pyparsing import * +import pyparsing as pp import random import string -def aOrAn( item ): - if item.desc[0] in "aeiou": + +def a_or_an(item): + if item.desc.startswith(tuple("aeiou")): return "an " + item.desc else: return "a " + item.desc -def enumerateItems(l): - if len(l) == 0: return "nothing" +def enumerate_items(items_list): + if not items_list: + return "nothing" + *all_but_last, last = items_list out = [] - if len(l) > 1: - out.append(', '.join(aOrAn(item) for item in l[:-1])) - out.append('and') - out.append(aOrAn(l[-1])) + if all_but_last: + out.append(", ".join(a_or_an(item) for item in all_but_last)) + if len(all_but_last) > 1: + out[-1] += ',' + out.append("and") + out.append(a_or_an(last)) return " ".join(out) -def enumerateDoors(l): - if len(l) == 0: return "" 
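One behavioral tweak in the rewrite above: enumerate_items (and enumerate_doors, below) now inserts a serial comma when three or more entries are listed. A quick illustrative check, assuming the new functions are in scope; Thing is a hypothetical stand-in for the game's Item class:

class Thing:
    def __init__(self, desc):
        self.desc = desc

print(enumerate_items([Thing("sword")]))                  # a sword
print(enumerate_items([Thing("sword"), Thing("apple")]))  # a sword and an apple
print(enumerate_items([Thing("sword"), Thing("apple"), Thing("egg")]))
# a sword, an apple, and an egg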
+def enumerate_doors(doors_list): + if not doors_list: + return "" + *all_but_last, last = doors_list out = [] - if len(l) > 1: - out.append(', '.join(l[:-1])) + if all_but_last: + out.append(", ".join(all_but_last)) + if len(all_but_last) > 1: + out[-1] += ',' out.append("and") - out.append(l[-1]) + out.append(last) return " ".join(out) -class Room(object): + +class Room: def __init__(self, desc): self.desc = desc self.inv = [] self.gameOver = False - self.doors = [None,None,None,None] + self.doors = [None, None, None, None] - def __getattr__(self,attr): - return \ - { - "n":self.doors[0], - "s":self.doors[1], - "e":self.doors[2], - "w":self.doors[3], - }[attr] + def __getattr__(self, attr): + return { + "n": self.doors[0], + "s": self.doors[1], + "e": self.doors[2], + "w": self.doors[3], + }[attr] - def enter(self,player): + def enter(self, player): if self.gameOver: player.gameOver = True - def addItem(self, it): + def add_item(self, it): self.inv.append(it) - def removeItem(self,it): + def remove_item(self, it): self.inv.remove(it) def describe(self): print(self.desc) - visibleItems = [ it for it in self.inv if it.isVisible ] + visibleItems = [it for it in self.inv if it.isVisible] if random.random() > 0.5: if len(visibleItems) > 1: is_form = "are" else: is_form = "is" - print("There {0} {1} here.".format(is_form, enumerateItems(visibleItems))) + print("There {} {} here.".format(is_form, enumerate_items(visibleItems))) else: - print("You see %s." % (enumerateItems(visibleItems))) + print("You see %s." % (enumerate_items(visibleItems))) class Exit(Room): def __init__(self): - super(Exit,self).__init__("") + super().__init__("") - def enter(self,player): + def enter(self, player): player.gameOver = True - -class Item(object): +class Item: items = {} + def __init__(self, desc): self.desc = desc self.isDeadly = False @@ -106,7 +116,7 @@ def breakItem(self): def isUsable(self, player, target): if self.usableConditionTest: - return self.usableConditionTest( player, target ) + return self.usableConditionTest(player, target) else: return False @@ -114,94 +124,98 @@ def useItem(self, player, target): if self.useAction: self.useAction(player, self, target) + class OpenableItem(Item): def __init__(self, desc, contents=None): - super(OpenableItem,self).__init__(desc) + super().__init__(desc) self.isOpenable = True self.isOpened = False if contents is not None: if isinstance(contents, Item): - self.contents = [contents,] + self.contents = [ + contents, + ] else: self.contents = contents else: self.contents = [] - def openItem(self, player): + def open_item(self, player): if not self.isOpened: self.isOpened = not self.isOpened if self.contents is not None: for item in self.contents: - player.room.addItem( item ) + player.room.add_item(item) self.contents = [] self.desc = "open " + self.desc - def closeItem(self, player): + def close_item(self, player): if self.isOpened: self.isOpened = not self.isOpened if self.desc.startswith("open "): self.desc = self.desc[5:] -class Command(object): +class Command: "Base class for commands" + def __init__(self, verb, verbProg): self.verb = verb self.verbProg = verbProg @staticmethod - def helpDescription(): + def help_description(): return "" - def _doCommand(self, player): + def _do_command(self, player): pass - def __call__(self, player ): - print(self.verbProg.capitalize()+"...") - self._doCommand(player) + def __call__(self, player): + print(self.verbProg.capitalize() + "...") + self._do_command(player) class MoveCommand(Command): def __init__(self, quals): - 
super(MoveCommand,self).__init__("MOVE", "moving") + super().__init__("MOVE", "moving") self.direction = quals.direction[0] @staticmethod - def helpDescription(): + def help_description(): return """MOVE or GO - go NORTH, SOUTH, EAST, or WEST (can abbreviate as 'GO N' and 'GO W', or even just 'E' and 'S')""" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room nextRoom = rm.doors[ { - "N":0, - "S":1, - "E":2, - "W":3, + "N": 0, + "S": 1, + "E": 2, + "W": 3, }[self.direction] - ] + ] if nextRoom: - player.moveTo( nextRoom ) + player.moveTo(nextRoom) else: print("Can't go that way.") class TakeCommand(Command): def __init__(self, quals): - super(TakeCommand,self).__init__("TAKE", "taking") + super().__init__("TAKE", "taking") self.subject = quals.item @staticmethod - def helpDescription(): + def help_description(): return "TAKE or PICKUP or PICK UP - pick up an object (but some are deadly)" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room subj = Item.items[self.subject] if subj in rm.inv and subj.isVisible: if subj.isTakeable: - rm.removeItem(subj) + rm.remove_item(subj) player.take(subj) else: print(subj.cantTakeMessage) @@ -211,55 +225,82 @@ def _doCommand(self, player): class DropCommand(Command): def __init__(self, quals): - super(DropCommand,self).__init__("DROP", "dropping") + super().__init__("DROP", "dropping") self.subject = quals.item @staticmethod - def helpDescription(): + def help_description(): return "DROP or LEAVE - drop an object (but fragile items may break)" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room subj = Item.items[self.subject] if subj in player.inv: - rm.addItem(subj) + rm.add_item(subj) player.drop(subj) else: - print("You don't have %s." % (aOrAn(subj))) + print("You don't have %s." % (a_or_an(subj))) + class InventoryCommand(Command): def __init__(self, quals): - super(InventoryCommand,self).__init__("INV", "taking inventory") + super().__init__("INV", "taking inventory") @staticmethod - def helpDescription(): + def help_description(): return "INVENTORY or INV or I - lists what items you have" - def _doCommand(self, player): - print("You have %s." % enumerateItems( player.inv )) + def _do_command(self, player): + print("You have %s." % enumerate_items(player.inv)) + class LookCommand(Command): def __init__(self, quals): - super(LookCommand,self).__init__("LOOK", "looking") + super().__init__("LOOK", "looking") @staticmethod - def helpDescription(): + def help_description(): return "LOOK or L - describes the current room and any objects in it" - def _doCommand(self, player): + def _do_command(self, player): player.room.describe() + +class ExamineCommand(Command): + def __init__(self, quals): + super().__init__("EXAMINE", "examining") + self.subject = Item.items[quals.item] + + @staticmethod + def help_description(): + return "EXAMINE or EX or X - look closely at an object" + + def _do_command(self, player): + msg = random.choice( + [ + "It's {}.", + "It's just {}.", + "It's a beautiful {1}.", + "It's a rare and beautiful {1}.", + "It's a rare {1}.", + "Just {}, nothing special...", + "{0}, just {0}." 
+ ] + ) + print(msg.format(a_or_an(self.subject), self.subject).capitalize()) + + class DoorsCommand(Command): def __init__(self, quals): - super(DoorsCommand,self).__init__("DOORS", "looking for doors") + super().__init__("DOORS", "looking for doors") @staticmethod - def helpDescription(): + def help_description(): return "DOORS - display what doors are visible from this room" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room - numDoors = sum([1 for r in rm.doors if r is not None]) + numDoors = sum(1 for r in rm.doors if r is not None) if numDoors == 0: reply = "There are no doors in any direction." else: @@ -267,16 +308,19 @@ def _doCommand(self, player): reply = "There is a door to the " else: reply = "There are doors to the " - doorNames = [ {0:"north", 1:"south", 2:"east", 3:"west"}[i] - for i,d in enumerate(rm.doors) if d is not None ] - #~ print doorNames - reply += enumerateDoors( doorNames ) + doorNames = [ + {0: "north", 1: "south", 2: "east", 3: "west"}[i] + for i, d in enumerate(rm.doors) + if d is not None + ] + reply += enumerate_doors(doorNames) reply += "." print(reply) + class UseCommand(Command): def __init__(self, quals): - super(UseCommand,self).__init__("USE", "using") + super().__init__("USE", "using") self.subject = Item.items[quals.usedObj] if quals.targetObj: self.target = Item.items[quals.targetObj] @@ -284,36 +328,37 @@ def __init__(self, quals): self.target = None @staticmethod - def helpDescription(): + def help_description(): return "USE or U - use an object, optionally IN or ON another object" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room availItems = rm.inv + player.inv if self.subject in availItems: - if self.subject.isUsable( player, self.target ): - self.subject.useItem( player, self.target ) + if self.subject.isUsable(player, self.target): + self.subject.useItem(player, self.target) else: print("You can't use that here.") else: print("There is no %s here to use." % self.subject) + class OpenCommand(Command): def __init__(self, quals): - super(OpenCommand,self).__init__("OPEN", "opening") + super().__init__("OPEN", "opening") self.subject = Item.items[quals.item] @staticmethod - def helpDescription(): + def help_description(): return "OPEN or O - open an object" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room - availItems = rm.inv+player.inv + availItems = rm.inv + player.inv if self.subject in availItems: if self.subject.isOpenable: if not self.subject.isOpened: - self.subject.openItem( player ) + self.subject.open_item(player) else: print("It's already open.") else: @@ -321,22 +366,23 @@ def _doCommand(self, player): else: print("There is no %s here to open." % self.subject) + class CloseCommand(Command): def __init__(self, quals): - super(CloseCommand,self).__init__("CLOSE", "closing") + super().__init__("CLOSE", "closing") self.subject = Item.items[quals.item] @staticmethod - def helpDescription(): + def help_description(): return "CLOSE or CL - close an object" - def _doCommand(self, player): + def _do_command(self, player): rm = player.room - availItems = rm.inv+player.inv + availItems = rm.inv + player.inv if self.subject in availItems: if self.subject.isOpenable: if self.subject.isOpened: - self.subject.closeItem( player ) + self.subject.close_item(player) else: print("You can't close that, it's not open.") else: @@ -344,27 +390,29 @@ def _doCommand(self, player): else: print("There is no %s here to close." 
% self.subject) + class QuitCommand(Command): def __init__(self, quals): - super(QuitCommand,self).__init__("QUIT", "quitting") + super().__init__("QUIT", "quitting") @staticmethod - def helpDescription(): + def help_description(): return "QUIT or Q - ends the game" - def _doCommand(self, player): + def _do_command(self, player): print("Ok....") player.gameOver = True + class HelpCommand(Command): def __init__(self, quals): - super(HelpCommand,self).__init__("HELP", "helping") + super().__init__("HELP", "helping") @staticmethod - def helpDescription(): + def help_description(): return "HELP or H or ? - displays this help message" - def _doCommand(self, player): + def _do_command(self, player): print("Enter any of the following commands (not case sensitive):") for cmd in [ InventoryCommand, @@ -375,101 +423,122 @@ def _doCommand(self, player): CloseCommand, MoveCommand, LookCommand, + ExamineCommand, DoorsCommand, QuitCommand, HelpCommand, - ]: - print(" - %s" % cmd.helpDescription()) + ]: + print(" - %s" % cmd.help_description()) print() -class AppParseException(ParseException): + +class AppParseException(pp.ParseException): pass -class Parser(object): + +class Parser: def __init__(self): - self.bnf = self.makeBNF() - - def makeBNF(self): - invVerb = oneOf("INV INVENTORY I", caseless=True) - dropVerb = oneOf("DROP LEAVE", caseless=True) - takeVerb = oneOf("TAKE PICKUP", caseless=True) | \ - (CaselessLiteral("PICK") + CaselessLiteral("UP") ) - moveVerb = oneOf("MOVE GO", caseless=True) | empty - useVerb = oneOf("USE U", caseless=True) - openVerb = oneOf("OPEN O", caseless=True) - closeVerb = oneOf("CLOSE CL", caseless=True) - quitVerb = oneOf("QUIT Q", caseless=True) - lookVerb = oneOf("LOOK L", caseless=True) - doorsVerb = CaselessLiteral("DOORS") - helpVerb = oneOf("H HELP ?",caseless=True) - - itemRef = OneOrMore(Word(alphas)).setParseAction( self.validateItemName ) - nDir = oneOf("N NORTH",caseless=True).setParseAction(replaceWith("N")) - sDir = oneOf("S SOUTH",caseless=True).setParseAction(replaceWith("S")) - eDir = oneOf("E EAST",caseless=True).setParseAction(replaceWith("E")) - wDir = oneOf("W WEST",caseless=True).setParseAction(replaceWith("W")) + self.bnf = self.make_bnf() + + def make_bnf(self): + invVerb = pp.one_of("INV INVENTORY I", caseless=True) + dropVerb = pp.one_of("DROP LEAVE", caseless=True) + takeVerb = pp.one_of("TAKE PICKUP", caseless=True) | ( + pp.CaselessLiteral("PICK") + pp.CaselessLiteral("UP") + ) + moveVerb = pp.one_of("MOVE GO", caseless=True) | pp.Empty() + useVerb = pp.one_of("USE U", caseless=True) + openVerb = pp.one_of("OPEN O", caseless=True) + closeVerb = pp.one_of("CLOSE CL", caseless=True) + quitVerb = pp.one_of("QUIT Q", caseless=True) + lookVerb = pp.one_of("LOOK L", caseless=True) + doorsVerb = pp.CaselessLiteral("DOORS") + helpVerb = pp.one_of("H HELP ?", caseless=True) + + itemRef = pp.OneOrMore(pp.Word(pp.alphas)).set_parse_action(self.validate_item_name).setName("item_ref") + nDir = pp.one_of("N NORTH", caseless=True).set_parse_action(pp.replace_with("N")) + sDir = pp.one_of("S SOUTH", caseless=True).set_parse_action(pp.replace_with("S")) + eDir = pp.one_of("E EAST", caseless=True).set_parse_action(pp.replace_with("E")) + wDir = pp.one_of("W WEST", caseless=True).set_parse_action(pp.replace_with("W")) moveDirection = nDir | sDir | eDir | wDir invCommand = invVerb dropCommand = dropVerb + itemRef("item") takeCommand = takeVerb + itemRef("item") - useCommand = useVerb + itemRef("usedObj") + \ - Optional(oneOf("IN ON",caseless=True)) + \ - 
Optional(itemRef,default=None)("targetObj") + useCommand = ( + useVerb + + itemRef("usedObj") + + pp.Opt(pp.one_of("IN ON", caseless=True)) + + pp.Opt(itemRef, default=None)("targetObj") + ) openCommand = openVerb + itemRef("item") closeCommand = closeVerb + itemRef("item") - moveCommand = moveVerb + moveDirection("direction") + moveCommand = (moveVerb | "") + moveDirection("direction") quitCommand = quitVerb lookCommand = lookVerb - doorsCommand = doorsVerb + examineCommand = pp.one_of("EXAMINE EX X", caseless=True) + itemRef("item") + doorsCommand = doorsVerb.setName("DOORS") helpCommand = helpVerb # attach command classes to expressions - invCommand.setParseAction(InventoryCommand) - dropCommand.setParseAction(DropCommand) - takeCommand.setParseAction(TakeCommand) - useCommand.setParseAction(UseCommand) - openCommand.setParseAction(OpenCommand) - closeCommand.setParseAction(CloseCommand) - moveCommand.setParseAction(MoveCommand) - quitCommand.setParseAction(QuitCommand) - lookCommand.setParseAction(LookCommand) - doorsCommand.setParseAction(DoorsCommand) - helpCommand.setParseAction(HelpCommand) + invCommand.set_parse_action(InventoryCommand) + dropCommand.set_parse_action(DropCommand) + takeCommand.set_parse_action(TakeCommand) + useCommand.set_parse_action(UseCommand) + openCommand.set_parse_action(OpenCommand) + closeCommand.set_parse_action(CloseCommand) + moveCommand.set_parse_action(MoveCommand) + quitCommand.set_parse_action(QuitCommand) + lookCommand.set_parse_action(LookCommand) + examineCommand.set_parse_action(ExamineCommand) + doorsCommand.set_parse_action(DoorsCommand) + helpCommand.set_parse_action(HelpCommand) # define parser using all command expressions - return ( invCommand | - useCommand | - openCommand | - closeCommand | - dropCommand | - takeCommand | - moveCommand | - lookCommand | - doorsCommand | - helpCommand | - quitCommand )("command") + LineEnd() - - def validateItemName(self,s,l,t): + parser = pp.ungroup( + invCommand + | useCommand + | openCommand + | closeCommand + | dropCommand + | takeCommand + | moveCommand + | lookCommand + | examineCommand + | doorsCommand + | helpCommand + | quitCommand + )("command") + + return parser + + def validate_item_name(self, s, l, t): iname = " ".join(t) if iname not in Item.items: - raise AppParseException(s,l,"No such item '%s'." % iname) + raise AppParseException(s, l, "No such item '%s'." % iname) return iname - def parseCmd(self, cmdstr): + def parse_cmd(self, cmdstr): try: - ret = self.bnf.parseString(cmdstr) + ret = self.bnf.parse_string(cmdstr) return ret except AppParseException as pe: print(pe.msg) - except ParseException as pe: - print(random.choice([ "Sorry, I don't understand that.", - "Huh?", - "Excuse me?", - "???", - "What?" ] )) - -class Player(object): + except pp.ParseException as pe: + print( + random.choice( + [ + "Sorry, I don't understand that.", + "Huh?", + "Excuse me?", + "???", + "What?", + ] + ) + ) + + +class Player: def __init__(self, name): self.name = name self.gameOver = False @@ -485,20 +554,20 @@ def moveTo(self, rm): else: rm.describe() - def take(self,it): + def take(self, it): if it.isDeadly: print("Aaaagh!...., the %s killed me!" 
% it) self.gameOver = True else: self.inv.append(it) - def drop(self,it): + def drop(self, it): self.inv.remove(it) if it.isFragile: it.breakItem() -def createRooms( rm ): +def createRooms(rm): """ create rooms, using multiline string showing map layout string contains symbols for the following: @@ -521,8 +590,8 @@ def createRooms( rm ): # scan through input string looking for connections between rooms rows = rm.split("\n") - for row,line in enumerate(rows): - for col,c in enumerate(line): + for row, line in enumerate(rows): + for col, c in enumerate(line): if c in string.ascii_letters: room = ret[c] n = None @@ -533,116 +602,134 @@ def createRooms( rm ): # look in neighboring cells for connection symbols (must take # care to guard that neighboring cells exist before testing # contents) - if col > 0 and line[col-1] in "<-": - other = line[col-2] + if col > 0 and line[col - 1] in "<-": + other = line[col - 2] w = ret[other] - if col < len(line)-1 and line[col+1] in "->": - other = line[col+2] + if col < len(line) - 1 and line[col + 1] in "->": + other = line[col + 2] e = ret[other] - if row > 1 and col < len(rows[row-1]) and rows[row-1][col] in '|^': - other = rows[row-2][col] + if row > 1 and col < len(rows[row - 1]) and rows[row - 1][col] in "|^": + other = rows[row - 2][col] n = ret[other] - if row < len(rows)-1 and col < len(rows[row+1]) and rows[row+1][col] in '|.': - other = rows[row+2][col] + if ( + row < len(rows) - 1 + and col < len(rows[row + 1]) + and rows[row + 1][col] in "|." + ): + other = rows[row + 2][col] s = ret[other] # set connections to neighboring rooms - room.doors=[n,s,e,w] + room.doors = [n, s, e, w] return ret + # put items in rooms -def putItemInRoom(i,r): - if isinstance(r,str): +def putItemInRoom(i, r): + if isinstance(r, str): r = rooms[r] - r.addItem( Item.items[i] ) + r.add_item(Item.items[i]) -def playGame(p,startRoom): + +def playGame(p, startRoom): # create parser parser = Parser() - p.moveTo( startRoom ) + p.moveTo(startRoom) while not p.gameOver: cmdstr = input(">> ") - cmd = parser.parseCmd(cmdstr) + cmd = parser.parse_cmd(cmdstr) if cmd is not None: - cmd.command( p ) + cmd.command(p) print() print("You ended the game with:") for i in p.inv: - print(" -", aOrAn(i)) - - -#==================== -# start game definition -roomMap = """ - d-Z - | - f-c-e - . 
| - q'+"'")) | ('u' + Word(hexnums, exact=4)) | SGL_PRINTABLE) +ESC = BSLASH + ( + oneOf(list(r"nrtbf\">" + "'")) | ("u" + Word(hexnums, exact=4)) | SGL_PRINTABLE +) LITERAL_CHAR = ESC | ~(APOS | BSLASH) + SGL_PRINTABLE CHAR_LITERAL = APOS + LITERAL_CHAR + APOS STRING_LITERAL = APOS + Combine(OneOrMore(LITERAL_CHAR)) + APOS DOUBLE_QUOTE_STRING_LITERAL = '"' + ZeroOrMore(LITERAL_CHAR) + '"' -DOUBLE_ANGLE_STRING_LITERAL = '<<' + ZeroOrMore(SGL_PRINTABLE) + '>>' -TOKEN_REF = Word(alphas.upper(), alphanums+'_') -RULE_REF = Word(alphas.lower(), alphanums+'_') -ACTION_ESC = (BSLASH.suppress() + APOS - | BSLASH.suppress() - | BSLASH.suppress() + (~(APOS | QUOTE) + SGL_PRINTABLE) - ) -ACTION_CHAR_LITERAL = (APOS + (ACTION_ESC | ~(BSLASH | APOS) + SGL_PRINTABLE) + APOS) -ACTION_STRING_LITERAL = (QUOTE + ZeroOrMore(ACTION_ESC | ~(BSLASH | QUOTE) + SGL_PRINTABLE) + QUOTE) +DOUBLE_ANGLE_STRING_LITERAL = "<<" + ZeroOrMore(SGL_PRINTABLE) + ">>" +TOKEN_REF = Word(alphas.upper(), alphanums + "_") +RULE_REF = Word(alphas.lower(), alphanums + "_") +ACTION_ESC = ( + BSLASH.suppress() + APOS + | BSLASH.suppress() + | BSLASH.suppress() + (~(APOS | QUOTE) + SGL_PRINTABLE) +) +ACTION_CHAR_LITERAL = APOS + (ACTION_ESC | ~(BSLASH | APOS) + SGL_PRINTABLE) + APOS +ACTION_STRING_LITERAL = ( + QUOTE + ZeroOrMore(ACTION_ESC | ~(BSLASH | QUOTE) + SGL_PRINTABLE) + QUOTE +) SRC = SRC_.suppress() + ACTION_STRING_LITERAL("file") + INT("line") id = TOKEN_REF | RULE_REF -SL_COMMENT = Suppress('//') + Suppress('$ANTLR') + SRC | ZeroOrMore(~EOL + Word(printables)) + EOL +SL_COMMENT = ( + Suppress("//") + Suppress("$ANTLR") + SRC + | ZeroOrMore(~EOL + Word(printables)) + EOL +) ML_COMMENT = cStyleComment -WS = OneOrMore(Suppress(' ') | Suppress('\t') | (Optional(Suppress('\r')) + Literal('\n'))) +WS = OneOrMore( + Suppress(" ") | Suppress("\t") | (Optional(Suppress("\r")) + Literal("\n")) +) WS_LOOP = ZeroOrMore(SL_COMMENT | ML_COMMENT) NESTED_ARG_ACTION = Forward() -NESTED_ARG_ACTION << (LBRACK - + ZeroOrMore(NESTED_ARG_ACTION - | ACTION_STRING_LITERAL - | ACTION_CHAR_LITERAL) - + RBRACK) +NESTED_ARG_ACTION << ( + LBRACK + + ZeroOrMore(NESTED_ARG_ACTION | ACTION_STRING_LITERAL | ACTION_CHAR_LITERAL) + + RBRACK +) ARG_ACTION = NESTED_ARG_ACTION NESTED_ACTION = Forward() -NESTED_ACTION << (LBRACE - + ZeroOrMore(NESTED_ACTION - | SL_COMMENT - | ML_COMMENT - | ACTION_STRING_LITERAL - | ACTION_CHAR_LITERAL) - + RBRACE) -ACTION = NESTED_ACTION + Optional('?') +NESTED_ACTION << ( + LBRACE + + ZeroOrMore( + NESTED_ACTION + | SL_COMMENT + | ML_COMMENT + | ACTION_STRING_LITERAL + | ACTION_CHAR_LITERAL + ) + + RBRACE +) +ACTION = NESTED_ACTION + Optional("?") SCOPE = SCOPE_.suppress() -OPTIONS = OPTIONS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') -TOKENS = TOKENS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') +OPTIONS = OPTIONS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') +TOKENS = TOKENS_.suppress() + LBRACE # + WS_LOOP + Suppress('{') TREE_BEGIN = ROOT + LPAR -RANGE = Suppress('..') -REWRITE = Suppress('->') +RANGE = Suppress("..") +REWRITE = Suppress("->") # General Parser Definitions # Grammar heading -optionValue = id | STRING_LITERAL | CHAR_LITERAL | INT | Literal('*').setName("s") +optionValue = id | STRING_LITERAL | CHAR_LITERAL | INT | Literal("*").setName("s") option = Group(id("id") + EQ + optionValue("value"))("option") optionsSpec = OPTIONS + Group(OneOrMore(option + SEMI))("options") + RBRACE -tokenSpec = Group(TOKEN_REF("token_ref") - + (EQ + (STRING_LITERAL | CHAR_LITERAL)("lit")))("token") + SEMI +tokenSpec = ( 
+ Group(TOKEN_REF("token_ref") + (EQ + (STRING_LITERAL | CHAR_LITERAL)("lit")))( + "token" + ) + + SEMI +) tokensSpec = TOKENS + Group(OneOrMore(tokenSpec))("tokens") + RBRACE attrScope = SCOPE_.suppress() + id + ACTION grammarType = LEXER + PARSER + TREE actionScopeName = id | LEXER("l") | PARSER("p") -action = AT + Optional(actionScopeName + Suppress('::')) + id + ACTION - -grammarHeading = (Optional(ML_COMMENT("ML_COMMENT")) - + Optional(grammarType) - + GRAMMAR - + id("grammarName") + SEMI - + Optional(optionsSpec) - + Optional(tokensSpec) - + ZeroOrMore(attrScope) - + ZeroOrMore(action)) +action = AT + Optional(actionScopeName + Suppress("::")) + id + ACTION + +grammarHeading = ( + Optional(ML_COMMENT("ML_COMMENT")) + + Optional(grammarType) + + GRAMMAR + + id("grammarName") + + SEMI + + Optional(optionsSpec) + + Optional(tokensSpec) + + ZeroOrMore(attrScope) + + ZeroOrMore(action) +) modifier = PROTECTED | PUBLIC | PRIVATE | FRAGMENT ruleAction = AT + id + ACTION throwsSpec = THROWS.suppress() + delimitedList(id) -ruleScopeSpec = ((SCOPE_.suppress() + ACTION) - | (SCOPE_.suppress() + delimitedList(id) + SEMI) - | (SCOPE_.suppress() + ACTION + SCOPE_.suppress() + delimitedList(id) + SEMI)) +ruleScopeSpec = ( + (SCOPE_.suppress() + ACTION) + | (SCOPE_.suppress() + delimitedList(id) + SEMI) + | (SCOPE_.suppress() + ACTION + SCOPE_.suppress() + delimitedList(id) + SEMI) +) unary_op = oneOf("^ !") notTerminal = CHAR_LITERAL | TOKEN_REF | STRING_LITERAL -terminal = (CHAR_LITERAL | TOKEN_REF + Optional(ARG_ACTION) | STRING_LITERAL | '.') + Optional(unary_op) +terminal = ( + CHAR_LITERAL | TOKEN_REF + Optional(ARG_ACTION) | STRING_LITERAL | "." +) + Optional(unary_op) block = Forward() notSet = TIL + (notTerminal | block) rangeNotPython = CHAR_LITERAL("c1") + RANGE + CHAR_LITERAL("c2") -atom = Group((rangeNotPython + Optional(unary_op)("op")) - | terminal - | (notSet + Optional(unary_op)("op")) - | (RULE_REF + Optional(ARG_ACTION("arg")) + Optional(unary_op)("op")) - ) +atom = Group( + (rangeNotPython + Optional(unary_op)("op")) + | terminal + | (notSet + Optional(unary_op)("op")) + | (RULE_REF + Optional(ARG_ACTION("arg")) + Optional(unary_op)("op")) +) element = Forward() -treeSpec = ROOT + LPAR + element*(2,) + RPAR +treeSpec = ROOT + LPAR + element * (2,) + RPAR ebnfSuffix = oneOf("? * +") -ebnf = block + Optional(ebnfSuffix("op") | '=>') -elementNoOptionSpec = ((id("result_name") + oneOf('= +=')("labelOp") + atom("atom") + Optional(ebnfSuffix)) - | (id("result_name") + oneOf('= +=')("labelOp") + block + Optional(ebnfSuffix)) - | atom("atom") + Optional(ebnfSuffix) - | ebnf - | ACTION - | (treeSpec + Optional(ebnfSuffix)) - ) # | SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED ) +ebnf = block + Optional(ebnfSuffix("op") | "=>") +elementNoOptionSpec = ( + (id("result_name") + oneOf("= +=")("labelOp") + atom("atom") + Optional(ebnfSuffix)) + | (id("result_name") + oneOf("= +=")("labelOp") + block + Optional(ebnfSuffix)) + | atom("atom") + Optional(ebnfSuffix) + | ebnf + | ACTION + | (treeSpec + Optional(ebnfSuffix)) +) # | SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED ) element <<= Group(elementNoOptionSpec)("element") # Do not ask me why group is needed twice... seems like the xml that you see is not always the real structure? 
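Nearly every recursive production in this grammar (NESTED_ARG_ACTION, NESTED_ACTION, block, element) relies on the same pyparsing idiom: declare a Forward, reference it inside other expressions, then fill in its definition afterwards with << or <<=. A self-contained sketch of the idiom, with names invented for illustration:

import pyparsing as pp

expr = pp.Forward()
atom = pp.Word(pp.nums) | pp.Suppress("(") + expr + pp.Suppress(")")
expr <<= atom + (pp.one_of("+ -") + atom)[...]  # fill in the Forward after the fact

print(expr.parse_string("(1+2)-3", parse_all=True))  # ['1', '+', '2', '-', '3']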
alternative = Group(Group(OneOrMore(element))("elements")) -rewrite = Optional(Literal('TODO REWRITE RULES TODO')) -block <<= (LPAR - + Optional(Optional(optionsSpec("opts")) + COLON) - + Group(alternative('a1') - + rewrite - + Group(ZeroOrMore(VERT - + alternative('a2') - + rewrite))("alternatives"))("block") - + RPAR) -altList = alternative('a1') + rewrite + Group(ZeroOrMore(VERT + alternative('a2') + rewrite))("alternatives") +rewrite = Optional(Literal("TODO REWRITE RULES TODO")) +block <<= ( + LPAR + + Optional(Optional(optionsSpec("opts")) + COLON) + + Group( + alternative("a1") + + rewrite + + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives") + )("block") + + RPAR +) +altList = ( + alternative("a1") + + rewrite + + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives") +) exceptionHandler = CATCH.suppress() + ARG_ACTION + ACTION finallyClause = FINALLY.suppress() + ACTION exceptionGroup = (OneOrMore(exceptionHandler) + Optional(finallyClause)) | finallyClause -ruleHeading = (Optional(ML_COMMENT)("ruleComment") - + Optional(modifier)("modifier") - + id("ruleName") - + Optional("!") - + Optional(ARG_ACTION("arg")) - + Optional(Suppress('returns') + ARG_ACTION("rt")) - + Optional(throwsSpec) - + Optional(optionsSpec) - + Optional(ruleScopeSpec) - + ZeroOrMore(ruleAction)) +ruleHeading = ( + Optional(ML_COMMENT)("ruleComment") + + Optional(modifier)("modifier") + + id("ruleName") + + Optional("!") + + Optional(ARG_ACTION("arg")) + + Optional(Suppress("returns") + ARG_ACTION("rt")) + + Optional(throwsSpec) + + Optional(optionsSpec) + + Optional(ruleScopeSpec) + + ZeroOrMore(ruleAction) +) rule = Group(ruleHeading + COLON + altList + SEMI + Optional(exceptionGroup))("rule") grammarDef = grammarHeading + Group(OneOrMore(rule))("rules") + def grammar(): return grammarDef + def __antlrAlternativesConverter(pyparsingRules, antlrBlock): rule = None - if hasattr(antlrBlock, 'alternatives') and antlrBlock.alternatives != '' and len(antlrBlock.alternatives) > 0: + if ( + hasattr(antlrBlock, "alternatives") + and antlrBlock.alternatives != "" + and len(antlrBlock.alternatives) > 0 + ): alternatives = [] alternatives.append(__antlrAlternativeConverter(pyparsingRules, antlrBlock.a1)) for alternative in antlrBlock.alternatives: - alternatives.append(__antlrAlternativeConverter(pyparsingRules, alternative)) + alternatives.append( + __antlrAlternativeConverter(pyparsingRules, alternative) + ) rule = MatchFirst(alternatives)("anonymous_or") - elif hasattr(antlrBlock, 'a1') and antlrBlock.a1 != '': + elif hasattr(antlrBlock, "a1") and antlrBlock.a1 != "": rule = __antlrAlternativeConverter(pyparsingRules, antlrBlock.a1) else: - raise Exception('Not yet implemented') + raise Exception("Not yet implemented") assert rule != None return rule + def __antlrAlternativeConverter(pyparsingRules, antlrAlternative): elementList = [] for element in antlrAlternative.elements: rule = None - if hasattr(element.atom, 'c1') and element.atom.c1 != '': - regex = r'['+str(element.atom.c1[0])+'-'+str(element.atom.c2[0]+']') + if hasattr(element.atom, "c1") and element.atom.c1 != "": + regex = r"[" + str(element.atom.c1[0]) + "-" + str(element.atom.c2[0] + "]") rule = Regex(regex)("anonymous_regex") - elif hasattr(element, 'block') and element.block != '': + elif hasattr(element, "block") and element.block != "": rule = __antlrAlternativesConverter(pyparsingRules, element.block) else: ruleRef = element.atom[0] assert ruleRef in pyparsingRules rule = pyparsingRules[ruleRef](ruleRef) - if 
hasattr(element, 'op') and element.op != '': - if element.op == '+': + if hasattr(element, "op") and element.op != "": + if element.op == "+": rule = Group(OneOrMore(rule))("anonymous_one_or_more") - elif element.op == '*': + elif element.op == "*": rule = Group(ZeroOrMore(rule))("anonymous_zero_or_more") - elif element.op == '?': + elif element.op == "?": rule = Optional(rule) else: - raise Exception('rule operator not yet implemented : ' + element.op) + raise Exception("rule operator not yet implemented : " + element.op) rule = rule elementList.append(rule) if len(elementList) > 1: @@ -207,6 +310,7 @@ def __antlrAlternativeConverter(pyparsingRules, antlrAlternative): assert rule is not None return rule + def __antlrRuleConverter(pyparsingRules, antlrRule): rule = None rule = __antlrAlternativesConverter(pyparsingRules, antlrRule) @@ -214,6 +318,7 @@ def __antlrRuleConverter(pyparsingRules, antlrRule): rule(antlrRule.ruleName) return rule + def antlrConverter(antlrGrammarTree): pyparsingRules = {} @@ -226,7 +331,7 @@ def antlrConverter(antlrGrammarTree): antlrRules = {} for antlrRule in antlrGrammarTree.rules: antlrRules[antlrRule.ruleName] = antlrRule - pyparsingRules[antlrRule.ruleName] = Forward() # antlr is a top down grammar + pyparsingRules[antlrRule.ruleName] = Forward() # antlr is a top down grammar for antlrRuleName, antlrRule in list(antlrRules.items()): pyparsingRule = __antlrRuleConverter(pyparsingRules, antlrRule) assert pyparsingRule != None @@ -234,6 +339,7 @@ def antlrConverter(antlrGrammarTree): return pyparsingRules + if __name__ == "__main__": text = """\ diff --git a/examples/antlr_grammar_tests.py b/examples/antlr_grammar_tests.py index 57d6cb61..17d8fa02 100644 --- a/examples/antlr_grammar_tests.py +++ b/examples/antlr_grammar_tests.py @@ -1,21 +1,20 @@ -''' +""" Created on 4 sept. 2010 @author: luca Submitted by Luca DallOlio, September, 2010 -''' +""" import unittest from . 
import antlr_grammar -class Test(unittest.TestCase): - +class Test(unittest.TestCase): def testOptionsSpec(self): text = """options { language = Python; }""" - antlr_grammar.optionsSpec.parseString(text) #@UndefinedVariable + antlr_grammar.optionsSpec.parseString(text) # @UndefinedVariable def testTokensSpec(self): text = """tokens { @@ -24,23 +23,23 @@ def testTokensSpec(self): MULT = '*' ; DIV = '/' ; }""" - antlr_grammar.tokensSpec.parseString(text) #@UndefinedVariable + antlr_grammar.tokensSpec.parseString(text) # @UndefinedVariable def testBlock(self): text = """( PLUS | MINUS )""" - antlr_grammar.block.parseString(text) #@UndefinedVariable + antlr_grammar.block.parseString(text) # @UndefinedVariable def testRule(self): text = """expr : term ( ( PLUS | MINUS ) term )* ;""" - antlr_grammar.rule.parseString(text) #@UndefinedVariable + antlr_grammar.rule.parseString(text) # @UndefinedVariable def testLexerRule(self): text = """fragment DIGIT : '0'..'9' ;""" - antlr_grammar.rule.parseString(text) #@UndefinedVariable + antlr_grammar.rule.parseString(text) # @UndefinedVariable def testLexerRule2(self): text = """WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+ { $channel = HIDDEN; } ;""" - #antlr_grammar.rule.parseString(text) #@UndefinedVariable + # antlr_grammar.rule.parseString(text) #@UndefinedVariable def testGrammar(self): text = """grammar SimpleCalc; @@ -76,16 +75,28 @@ def testGrammar(self): /* WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+ { $channel = HIDDEN; } ; */ fragment DIGIT : '0'..'9' ;""" - antlrGrammarTree = antlr_grammar.grammarDef.parseString(text) #@UndefinedVariable + antlrGrammarTree = antlr_grammar.grammarDef.parseString( + text + ) # @UndefinedVariable pyparsingRules = antlr_grammar.antlrConverter(antlrGrammarTree) pyparsingRule = pyparsingRules["expr"] pyparsingTree = pyparsingRule.parseString("2 - 5 * 42 + 7 / 25") pyparsingTreeList = pyparsingTree.asList() print(pyparsingTreeList) - self.assertEqual(pyparsingTreeList, - [[[['2'], []], [['-', [['5'], [['*', ['4', '2']]]]], ['+', [['7'], [['/', ['2', '5']]]]]]]] - ) + self.assertEqual( + pyparsingTreeList, + [ + [ + [["2"], []], + [ + ["-", [["5"], [["*", ["4", "2"]]]]], + ["+", [["7"], [["/", ["2", "5"]]]]], + ], + ] + ], + ) + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testOptionsSpec'] + # import sys;sys.argv = ['', 'Test.testOptionsSpec'] unittest.main() diff --git a/examples/apicheck.py b/examples/apicheck.py index cd35a9a7..366ad066 100644 --- a/examples/apicheck.py +++ b/examples/apicheck.py @@ -9,25 +9,28 @@ from pyparsing import * # define punctuation and simple tokens for locating API calls -LBRACK,RBRACK,LBRACE,RBRACE = map(Suppress,"[]{}") -ident = Word(alphas,alphanums+"_") | QuotedString("{",endQuoteChar="}") +LBRACK, RBRACK, LBRACE, RBRACE = map(Suppress, "[]{}") +ident = Word(alphas, alphanums + "_") | QuotedString("{", endQuoteChar="}") arg = "$" + ident # define an API call with a specific number of arguments - using '-' # will ensure that after matching procname, an incorrect number of args will # raise a ParseSyntaxException, which will interrupt the scanString def apiProc(name, numargs): - return LBRACK + Keyword(name)("procname") - arg*numargs + RBRACK + return LBRACK + Keyword(name)("procname") - arg * numargs + RBRACK + # create an apiReference, listing all API functions to be scanned for, and # their respective number of arguments. 
Beginning the overall expression
# with FollowedBy allows us to quickly rule out non-api calls while scanning,
# since all of the api calls begin with a "["
-apiRef = FollowedBy("[") + MatchFirst([
-    apiProc("procname1", 2),
-    apiProc("procname2", 1),
-    apiProc("procname3", 2),
-    ])
+apiRef = FollowedBy("[") + MatchFirst(
+    [
+        apiProc("procname1", 2),
+        apiProc("procname2", 1),
+        apiProc("procname3", 2),
+    ]
+)

 test = """[ procname1 $par1 $par2 ]
             other code here
@@ -45,13 +48,13 @@ def apiProc(name, numargs):
 api_scanner = apiRef.scanString(test)
 while 1:
     try:
-        t,s,e = next(api_scanner)
-        print("found %s on line %d" % (t.procname, lineno(s,test)))
+        t, s, e = next(api_scanner)
+        print("found %s on line %d" % (t.procname, lineno(s, test)))
     except ParseSyntaxException as pe:
         print("invalid arg count on line", pe.lineno)
-        print(pe.lineno,':',pe.line)
+        print(pe.lineno, ":", pe.line)
         # reset api scanner to start after this exception location
-        test = "\n"*(pe.lineno-1)+test[pe.loc+1:]
+        test = "\n" * (pe.lineno - 1) + test[pe.loc + 1 :]
         api_scanner = apiRef.scanString(test)
     except StopIteration:
         break
diff --git a/examples/bf.py b/examples/bf.py
new file mode 100644
index 00000000..76144295
--- /dev/null
+++ b/examples/bf.py
@@ -0,0 +1,159 @@
+# bf.py
+#
+# Brainf*ck interpreter demo
+#
+# BF instructions (symbols):
+# + - increment value at the current pointer
+# - - decrement value at the current pointer
+# > - increment pointer
+# < - decrement pointer
+# , - input new byte value, store at the current pointer
+# . - output the byte at the current pointer
+# [] - evaluate value at current pointer, if nonzero, execute all statements in []'s and repeat
+#
+import pyparsing as pp
+
+# define the basic parser
+
+# define Literals for each symbol in the BF language
+# (bind LT to "<" and GT to ">", in the same order as the symbol string,
+# so that the parse actions attached below act on the correct symbols)
+PLUS, MINUS, LT, GT, INP, OUT, LBRACK, RBRACK = pp.Literal.using_each("+-<>,.[]")
+
+# use a pyparsing Forward for the recursive definition of an instruction that can
+# itself contain instructions
+instruction_expr = pp.Forward().set_name("instruction")
+
+# define a LOOP expression for the instructions enclosed in brackets; use a
+# pyparsing Group to wrap the instructions in a sub-list
+LOOP = pp.Group(LBRACK + instruction_expr[...] + RBRACK)
+
+# use '<<=' operator to insert expression definition into existing Forward
+instruction_expr <<= PLUS | MINUS | GT | LT | INP | OUT | LOOP
+
+program_expr = instruction_expr[...].set_name("program")
+
+# ignore everything that is not a BF symbol
+ignore_chars = pp.Word(pp.printables, exclude_chars="+-<>,.[]")
+program_expr.ignore(ignore_chars)
+
+
+class BFEngine:
+    """
+    Brainf*ck execution environment, with a memory array and pointer.
+ """ + def __init__(self, memory_size: int = 1024): + self._ptr = 0 + self._memory_size = memory_size + self._memory = [0] * self._memory_size + + @property + def ptr(self): + return self._ptr + + @ptr.setter + def ptr(self, value): + self._ptr = value % self._memory_size + + @property + def at_ptr(self): + return self._memory[self._ptr] + + @at_ptr.setter + def at_ptr(self, value): + self._memory[self._ptr] = value % 256 + + def output_value_at_ptr(self): + print(chr(self.at_ptr), end="") + + def input_value(self): + input_char = input() or "\0" + self.at_ptr = ord(input_char[0]) + + def reset(self): + self._ptr = 0 + self._memory[:] = [0] * self._memory_size + + def dump_state(self): + for i in range(30): + print(f"{self._memory[i]:3d} ", end="") + print() + + if self.ptr < 30: + print(f" {' ' * self.ptr}^") + + +# define executable classes for each instruction + +class Instruction: + """Abstract class for all instruction classes to implement.""" + def __init__(self, tokens): + self.tokens = tokens + + def execute(self, bf_engine: BFEngine): + raise NotImplementedError() + + +class IncrPtr(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.ptr += 1 + + +class DecrPtr(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.ptr -= 1 + + +class IncrPtrValue(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.at_ptr += 1 + + +class DecrPtrValue(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.at_ptr -= 1 + + +class OutputPtrValue(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.output_value_at_ptr() + + +class InputPtrValue(Instruction): + def execute(self, bf_engine: BFEngine): + bf_engine.input_value() + + +class RunInstructionLoop(Instruction): + def __init__(self, tokens): + super().__init__(tokens) + self.instructions = self.tokens[0][1:-1] + + def execute(self, bf_engine: BFEngine): + while bf_engine.at_ptr: + for i in self.instructions: + i.execute(bf_engine) + + +# add parse actions to all BF instruction expressions +PLUS.add_parse_action(IncrPtrValue) +MINUS.add_parse_action(DecrPtrValue) +GT.add_parse_action(IncrPtr) +LT.add_parse_action(DecrPtr) +OUT.add_parse_action(OutputPtrValue) +INP.add_parse_action(InputPtrValue) +LOOP.add_parse_action(RunInstructionLoop) + + +@program_expr.add_parse_action +def run_program(tokens): + bf = BFEngine() + for t in tokens: + t.execute(bf) + print() + + +# generate railroad diagram +program_expr.create_diagram("bf.html") + +# execute an example BF program +hw = "+[-->-[>>+>-----<<]<--<---]>-.>>>+.>>..+++[.>]<<<<.+++.------.<<-.>>>>+." +program_expr.parse_string(hw) diff --git a/examples/bigquery_view_parser.py b/examples/bigquery_view_parser.py new file mode 100644 index 00000000..9215225e --- /dev/null +++ b/examples/bigquery_view_parser.py @@ -0,0 +1,1790 @@ +# bigquery_view_parser.py +# +# A parser to extract table names from BigQuery view definitions. 
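Before the grammar proper, a hedged sketch of how the class defined below is meant to be driven once fully defined (the view SQL and names here are invented; get_table_names returns a set of table-identifier tuples, which may be padded with None for unqualified names):

parser = BigQueryViewParser()
tables = parser.get_table_names(
    "SELECT a.x FROM myproject.mydataset.table_a AS a"
)
print(tables)  # e.g. {('myproject', 'mydataset', 'table_a')}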
+# This is based on the `select_parser.py` sample in pyparsing: +# https://github.com/pyparsing/pyparsing/blob/master/examples/select_parser.py +# +# Michael Smedberg +# +import sys +import textwrap + +from pyparsing import ParserElement, Suppress, Forward, CaselessKeyword +from pyparsing import MatchFirst, alphas, alphanums, Combine, Word +from pyparsing import QuotedString, CharsNotIn, Optional, Group +from pyparsing import oneOf, delimitedList, restOfLine, cStyleComment +from pyparsing import infixNotation, opAssoc, Regex, nums + +sys.setrecursionlimit(3000) + +ParserElement.enablePackrat() + + +class BigQueryViewParser: + """Parser to extract table info from BigQuery view definitions + + Based on the BNF and examples posted at + https://cloud.google.com/bigquery/docs/reference/legacy-sql + """ + + _parser = None + _table_identifiers = set() + _with_aliases = set() + + def get_table_names(self, sql_stmt): + table_identifiers, with_aliases = self._parse(sql_stmt) + + # Table names and alias names might differ by case, but that's not + # relevant- aliases are not case sensitive + lower_aliases = BigQueryViewParser.lowercase_set_of_tuples(with_aliases) + tables = { + x + for x in table_identifiers + if not BigQueryViewParser.lowercase_of_tuple(x) in lower_aliases + } + + # Table names ARE case sensitive as described at + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity + return tables + + def _parse(self, sql_stmt): + BigQueryViewParser._table_identifiers.clear() + BigQueryViewParser._with_aliases.clear() + BigQueryViewParser._get_parser().parseString(sql_stmt, parseAll=True) + + return BigQueryViewParser._table_identifiers, BigQueryViewParser._with_aliases + + @classmethod + def lowercase_of_tuple(cls, tuple_to_lowercase): + return tuple(x.lower() if x else None for x in tuple_to_lowercase) + + @classmethod + def lowercase_set_of_tuples(cls, set_of_tuples): + return {BigQueryViewParser.lowercase_of_tuple(x) for x in set_of_tuples} + + @classmethod + def _get_parser(cls): + if cls._parser is not None: + return cls._parser + + ParserElement.enablePackrat() + + LPAR, RPAR, COMMA, LBRACKET, RBRACKET, LT, GT = map(Suppress, "(),[]<>") + QUOT, APOS, ACC, DOT, SEMI = map(Suppress, "\"'`.;") + ungrouped_select_stmt = Forward().setName("select statement") + + QUOTED_QUOT = QuotedString('"') + QUOTED_APOS = QuotedString("'") + QUOTED_ACC = QuotedString("`") + QUOTED_BRACKETS = QuotedString("[", endQuoteChar="]") + + # fmt: off + # keywords + ( + UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, NATURAL, + INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, NOT, SELECT, + DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, LIMIT, OFFSET, OR, + CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, END, CASE, WHEN, THEN, + EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, MATCH, ESCAPE, CURRENT_TIME, + CURRENT_DATE, CURRENT_TIMESTAMP, WITH, EXTRACT, PARTITION, ROWS, RANGE, + UNBOUNDED, PRECEDING, CURRENT, ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, + DATE_SUB, ADDDATE, SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, + LAST_VALUE, NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, + DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, DATETIME, + TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, GEOGRAPHY, ARRAY, + STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, ARRAY_CONCAT_AGG, AVG, BIT_AND, + BIT_OR, BIT_XOR, COUNT, COUNTIF, LOGICAL_AND, LOGICAL_OR, MAX, MIN, + STRING_AGG, SUM, CORR, COVAR_POP, COVAR_SAMP, STDDEV_POP, 
STDDEV_SAMP, + STDDEV, VAR_POP, VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, + GENERATE_ARRAY, GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, + SYSTEM_TIME, OF, WINDOW, RESPECT, IGNORE, NULLS, IF, CONTAINS, + ) = map( + CaselessKeyword, + """ + UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, NATURAL, + INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, NOT, SELECT, + DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, LIMIT, OFFSET, OR, + CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, END, CASE, WHEN, THEN, + EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, MATCH, ESCAPE, CURRENT_TIME, + CURRENT_DATE, CURRENT_TIMESTAMP, WITH, EXTRACT, PARTITION, ROWS, RANGE, + UNBOUNDED, PRECEDING, CURRENT, ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, + DATE_SUB, ADDDATE, SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, + LAST_VALUE, NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, + DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, DATETIME, + TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, GEOGRAPHY, ARRAY, + STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, ARRAY_CONCAT_AGG, AVG, BIT_AND, + BIT_OR, BIT_XOR, COUNT, COUNTIF, LOGICAL_AND, LOGICAL_OR, MAX, MIN, + STRING_AGG, SUM, CORR, COVAR_POP, COVAR_SAMP, STDDEV_POP, STDDEV_SAMP, + STDDEV, VAR_POP, VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, + GENERATE_ARRAY, GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, + SYSTEM_TIME, OF, WINDOW, RESPECT, IGNORE, NULLS, IF, CONTAINS, + """.replace(",", "").split(), + ) + + keyword_nonfunctions = MatchFirst( + (UNION, ALL, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, + NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, + NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, + LIMIT, OFFSET, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, END, + CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, MATCH, + STRUCT, WINDOW, SYSTEM_TIME, IF, FOR, + ) + ) + + keyword = keyword_nonfunctions | MatchFirst( + (ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, DATE_ADD, + DATE_SUB, ADDDATE, SUBDATE, INTERVAL, STRING_AGG, REGEXP_EXTRACT, + SPLIT, ORDINAL, UNNEST, SAFE_CAST, PARTITION, TIMESTAMP_ADD, + TIMESTAMP_SUB, ARRAY, GENERATE_ARRAY, GENERATE_DATE_ARRAY, + GENERATE_TIMESTAMP_ARRAY, SYSTEM_TIME, CONTAINS, + ) + ) + + # fmt: on + + identifier_word = Word(alphas + "_@#", alphanums + "@$#_") + identifier = ~keyword + identifier_word.copy() + collation_name = identifier.copy() + # NOTE: Column names can be keywords. Doc says they cannot, but in practice it seems to work. + column_name = identifier_word.copy() + qualified_column_name = Combine( + column_name + ("." + column_name)[..., 6], adjacent=False + ) + # NOTE: As with column names, column aliases can be keywords, e.g. functions like `current_time`. Other + # keywords, e.g. `from` make parsing pretty difficult (e.g. "SELECT a from from b" is confusing.) + column_alias = ~keyword_nonfunctions + column_name.copy() + table_name = identifier.copy() + table_alias = identifier.copy() + index_name = identifier.copy() + function_name = identifier.copy() + parameter_name = identifier.copy() + # NOTE: The expression in a CASE statement can be an integer. E.g. 
this is valid SQL: + # select CASE 1 WHEN 1 THEN -1 ELSE -2 END from test_table + unquoted_case_identifier = ~keyword + Word(alphanums + "$_") + quoted_case_identifier = QUOTED_QUOT | QUOTED_ACC + case_identifier = quoted_case_identifier | unquoted_case_identifier + case_expr = ( + Optional(case_identifier + DOT) + + Optional(case_identifier + DOT) + + case_identifier + ) + + # expression + expr = Forward().setName("expression") + + integer = Regex(r"[+-]?\d+") + numeric_literal = Regex(r"[+-]?\d*\.?\d+([eE][+-]?\d+)?") + string_literal = QUOTED_APOS | QUOTED_QUOT | QUOTED_ACC + regex_literal = "r" + string_literal + blob_literal = Regex(r"[xX]'[0-9A-Fa-f]+'") + date_or_time_literal = (DATE | TIME | DATETIME | TIMESTAMP) + string_literal + literal_value = ( + numeric_literal + | string_literal + | regex_literal + | blob_literal + | date_or_time_literal + | NULL + | CURRENT_TIME + Optional(LPAR + Optional(string_literal) + RPAR) + | CURRENT_DATE + Optional(LPAR + Optional(string_literal) + RPAR) + | CURRENT_TIMESTAMP + Optional(LPAR + Optional(string_literal) + RPAR) + ) + bind_parameter = Word("?", nums) | Combine(oneOf(": @ $") + parameter_name) + type_name = oneOf( + """TEXT REAL INTEGER BLOB NULL TIMESTAMP STRING DATE + INT64 NUMERIC FLOAT64 BOOL BYTES DATETIME GEOGRAPHY TIME ARRAY + STRUCT""", + caseless=True, + ) + date_part = oneOf( + """DAY DAY_HOUR DAY_MICROSECOND DAY_MINUTE DAY_SECOND + HOUR HOUR_MICROSECOND HOUR_MINUTE HOUR_SECOND MICROSECOND MINUTE + MINUTE_MICROSECOND MINUTE_SECOND MONTH QUARTER SECOND + SECOND_MICROSECOND WEEK YEAR YEAR_MONTH""", + caseless=True, + as_keyword=True, + ) + datetime_operators = ( + DATE_ADD | DATE_SUB | ADDDATE | SUBDATE | TIMESTAMP_ADD | TIMESTAMP_SUB + ) + + grouping_term = expr.copy() + ordering_term = Group( + expr("order_key") + + Optional(COLLATE + collation_name("collate")) + + Optional(ASC | DESC)("direction") + )("ordering_term") + + function_arg = expr.copy()("function_arg") + function_args = Optional( + "*" + | Optional(DISTINCT) + + delimitedList(function_arg) + + Optional((RESPECT | IGNORE) + NULLS) + )("function_args") + function_call = ( + (function_name | keyword)("function_name") + + LPAR + + Group(function_args)("function_args_group") + + RPAR + ) + + navigation_function_name = ( + FIRST_VALUE + | LAST_VALUE + | NTH_VALUE + | LEAD + | LAG + | PERCENTILE_CONT + | PRECENTILE_DISC + ) + aggregate_function_name = ( + ANY_VALUE + | ARRAY_AGG + | ARRAY_CONCAT_AGG + | AVG + | BIT_AND + | BIT_OR + | BIT_XOR + | COUNT + | COUNTIF + | LOGICAL_AND + | LOGICAL_OR + | MAX + | MIN + | STRING_AGG + | SUM + ) + statistical_aggregate_function_name = ( + CORR + | COVAR_POP + | COVAR_SAMP + | STDDEV_POP + | STDDEV_SAMP + | STDDEV + | VAR_POP + | VAR_SAMP + | VARIANCE + ) + numbering_function_name = ( + RANK | DENSE_RANK | PERCENT_RANK | CUME_DIST | NTILE | ROW_NUMBER + ) + analytic_function_name = ( + navigation_function_name + | aggregate_function_name + | statistical_aggregate_function_name + | numbering_function_name + )("analytic_function_name") + partition_expression_list = delimitedList(grouping_term)( + "partition_expression_list" + ) + window_frame_boundary_start = ( + UNBOUNDED + PRECEDING + | numeric_literal + (PRECEDING | FOLLOWING) + | CURRENT + ROW + ) + window_frame_boundary_end = ( + UNBOUNDED + FOLLOWING + | numeric_literal + (PRECEDING | FOLLOWING) + | CURRENT + ROW + ) + window_frame_clause = (ROWS | RANGE) + ( + ((UNBOUNDED + PRECEDING) | (numeric_literal + PRECEDING) | (CURRENT + ROW)) + | (BETWEEN + window_frame_boundary_start 
+ AND + window_frame_boundary_end) + ) + window_name = identifier.copy()("window_name") + window_specification = ( + Optional(window_name) + + Optional(PARTITION + BY + partition_expression_list) + + Optional(ORDER + BY + delimitedList(ordering_term)) + + Optional(window_frame_clause)("window_specification") + ) + analytic_function = ( + analytic_function_name + + LPAR + + function_args + + RPAR + + OVER + + (window_name | LPAR + Optional(window_specification) + RPAR) + )("analytic_function") + + string_agg_term = ( + STRING_AGG + + LPAR + + Optional(DISTINCT) + + expr + + Optional(COMMA + string_literal) + + Optional( + ORDER + BY + expr + Optional(ASC | DESC) + Optional(LIMIT + integer) + ) + + RPAR + )("string_agg") + array_literal = ( + Optional(ARRAY + Optional(LT + delimitedList(type_name) + GT)) + + LBRACKET + + delimitedList(expr) + + RBRACKET + ) + interval = INTERVAL + expr + date_part + array_generator = ( + GENERATE_ARRAY + + LPAR + + numeric_literal + + COMMA + + numeric_literal + + COMMA + + numeric_literal + + RPAR + ) + date_array_generator = ( + (GENERATE_DATE_ARRAY | GENERATE_TIMESTAMP_ARRAY) + + LPAR + + expr("start_date") + + COMMA + + expr("end_date") + + Optional(COMMA + interval) + + RPAR + ) + + explicit_struct = ( + STRUCT + + Optional(LT + delimitedList(type_name) + GT) + + LPAR + + Optional(delimitedList(expr + Optional(AS + identifier))) + + RPAR + ) + + case_when = WHEN + expr.copy()("when") + case_then = THEN + expr.copy()("then") + case_clauses = Group((case_when + case_then)[...]) + case_else = ELSE + expr.copy()("else") + case_stmt = ( + CASE + + Optional(case_expr.copy()) + + case_clauses("case_clauses") + + Optional(case_else) + + END + )("case") + + expr_term = ( + (analytic_function)("analytic_function") + | (CAST + LPAR + expr + AS + type_name + RPAR)("cast") + | (SAFE_CAST + LPAR + expr + AS + type_name + RPAR)("safe_cast") + | (Optional(EXISTS) + LPAR + ungrouped_select_stmt + RPAR)("subselect") + | (literal_value)("literal") + | (bind_parameter)("bind_parameter") + | (EXTRACT + LPAR + expr + FROM + expr + RPAR)("extract") + | case_stmt + | (datetime_operators + LPAR + expr + COMMA + interval + RPAR)( + "date_operation" + ) + | string_agg_term("string_agg_term") + | array_literal("array_literal") + | array_generator("array_generator") + | date_array_generator("date_array_generator") + | explicit_struct("explicit_struct") + | function_call("function_call") + | qualified_column_name("column") + ) + Optional(LBRACKET + (OFFSET | ORDINAL) + LPAR + expr + RPAR + RBRACKET)( + "offset_ordinal" + ) + + struct_term = LPAR + delimitedList(expr_term) + RPAR + + UNARY, BINARY, TERNARY = 1, 2, 3 + expr <<= infixNotation( + (expr_term | struct_term), + [ + (oneOf("- + ~") | NOT, UNARY, opAssoc.RIGHT), + (ISNULL | NOTNULL | NOT + NULL, UNARY, opAssoc.LEFT), + ("||", BINARY, opAssoc.LEFT), + (oneOf("* / %"), BINARY, opAssoc.LEFT), + (oneOf("+ -"), BINARY, opAssoc.LEFT), + (oneOf("<< >> & |"), BINARY, opAssoc.LEFT), + (oneOf("= > < >= <= <> != !< !> =="), BINARY, opAssoc.LEFT), + ( + IS + Optional(NOT) + | Optional(NOT) + IN + | Optional(NOT) + LIKE + | GLOB + | MATCH + | REGEXP + | CONTAINS, + BINARY, + opAssoc.LEFT, + ), + ((BETWEEN, AND), TERNARY, opAssoc.LEFT), + ( + Optional(NOT) + + IN + + LPAR + + Group(ungrouped_select_stmt | delimitedList(expr)) + + RPAR, + UNARY, + opAssoc.LEFT, + ), + (AND, BINARY, opAssoc.LEFT), + (OR, BINARY, opAssoc.LEFT), + ], + ) + quoted_expr = ( + expr | QUOT + expr + QUOT | APOS + expr + APOS | ACC + expr + ACC + )("quoted_expr") + 
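+        # For illustration (a note added here, derived from the precedence
+        # table above): infixNotation applies the operator tiers from
+        # tightest-binding to loosest, so "a + b * c OR x AND y" groups as
+        # (a + (b * c)) OR (x AND y) -- "*" binds tighter than "+", and
+        # AND binds tighter than OR.
+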
+ compound_operator = ( + UNION + Optional(ALL | DISTINCT) + | INTERSECT + DISTINCT + | EXCEPT + DISTINCT + | INTERSECT + | EXCEPT + )("compound_operator") + + join_constraint = Group( + Optional( + ON + expr + | USING + LPAR + Group(delimitedList(qualified_column_name)) + RPAR + ) + )("join_constraint") + + join_op = ( + COMMA + | Group( + Optional(NATURAL) + + Optional( + INNER + | CROSS + | LEFT + OUTER + | LEFT + | RIGHT + OUTER + | RIGHT + | FULL + OUTER + | OUTER + | FULL + ) + + JOIN + ) + )("join_op") + + join_source = Forward() + + # We support three kinds of table identifiers. + # + # First, dot delimited info like project.dataset.table, where + # each component follows the rules described in the BigQuery + # docs, namely: + # Contain letters (upper or lower case), numbers, and underscores + # + # Second, a dot delimited quoted string. Since it's quoted, we'll be + # liberal w.r.t. what characters we allow. E.g.: + # `project.dataset.name-with-dashes` + # + # Third, a series of quoted strings, delimited by dots, e.g.: + # `project`.`dataset`.`name-with-dashes` + # + # We also support combinations, like: + # project.dataset.`name-with-dashes` + # `project`.`dataset.name-with-dashes` + + def record_table_identifier(t): + identifier_list = t.asList() + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._table_identifiers.add(tuple(padded_list)) + + standard_table_part = ~keyword + Word(alphanums + "_") + quoted_project_part = QUOTED_QUOT | QUOTED_APOS | QUOTED_ACC + quoted_table_part = ( + QUOT + CharsNotIn('".') + QUOT + | APOS + CharsNotIn("'.") + APOS + | ACC + CharsNotIn("`.") + ACC + ) + quoted_table_parts_identifier = ( + Optional( + (quoted_project_part("project") | standard_table_part("project")) + DOT + ) + + Optional( + (quoted_table_part("dataset") | standard_table_part("dataset")) + DOT + ) + + (quoted_table_part("table") | standard_table_part("table")) + ).setParseAction(record_table_identifier) + + def record_quoted_table_identifier(t): + identifier_list = t[0].split(".") + *first, second, third = identifier_list + first = ".".join(first) or None + identifier_list = [first, second, third] + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._table_identifiers.add(tuple(padded_list)) + + quotable_table_parts_identifier = ( + QUOTED_QUOT | QUOTED_APOS | QUOTED_ACC | QUOTED_BRACKETS + ).setParseAction(record_quoted_table_identifier) + + table_identifier = ( + quoted_table_parts_identifier | quotable_table_parts_identifier + ).setName("table_identifier") + single_source = ( + ( + table_identifier + + Optional(Optional(AS) + table_alias("table_alias*")) + + Optional(FOR - SYSTEM_TIME + AS + OF + expr) + + Optional(INDEXED + BY + index_name("name") | NOT + INDEXED) + )("index") + | (LPAR + ungrouped_select_stmt + RPAR) + | (LPAR + join_source + RPAR) + | (UNNEST + LPAR + expr + RPAR) + ) + Optional(Optional(AS) + table_alias) + + join_source <<= single_source + (join_op + single_source + join_constraint)[...] 
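+
+        # For illustration, using the identifier rules above (mirroring the
+        # test cases at the bottom of this file): parsing
+        #   SELECT * FROM project.dataset.`name-with-dashes`
+        # records ("project", "dataset", "name-with-dashes") in
+        # cls._table_identifiers, while FROM `b.c` records (None, "b", "c").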
+ + over_partition = (PARTITION + BY + delimitedList(partition_expression_list))( + "over_partition" + ) + over_order = ORDER + BY + delimitedList(ordering_term) + over_unsigned_value_specification = expr + over_window_frame_preceding = ( + UNBOUNDED + PRECEDING + | over_unsigned_value_specification + PRECEDING + | CURRENT + ROW + ) + over_window_frame_following = ( + UNBOUNDED + FOLLOWING + | over_unsigned_value_specification + FOLLOWING + | CURRENT + ROW + ) + over_window_frame_bound = ( + over_window_frame_preceding | over_window_frame_following + ) + over_window_frame_between = ( + BETWEEN + over_window_frame_bound + AND + over_window_frame_bound + ) + over_window_frame_extent = ( + over_window_frame_preceding | over_window_frame_between + ) + over_row_or_range = (ROWS | RANGE) + over_window_frame_extent + over = ( + OVER + + LPAR + + Optional(over_partition) + + Optional(over_order) + + Optional(over_row_or_range) + + RPAR + )("over") + if_term = IF - LPAR + expr + COMMA + expr + COMMA + expr + RPAR + + result_column = Optional(table_name + ".") + "*" + Optional( + EXCEPT + LPAR + delimitedList(column_name) + RPAR + ) | Group(quoted_expr + Optional(over)) + + window_select_clause = ( + WINDOW + identifier + AS + LPAR + window_specification + RPAR + ) + + with_stmt = Forward().setName("with statement") + ungrouped_select_no_with = ( + SELECT + + Optional(DISTINCT | ALL) + + Group( + delimitedList( + (~FROM + ~IF + result_column | if_term) + + Optional(Optional(AS) + column_alias), + allow_trailing_delim=True, + ) + )("columns") + + Optional(FROM + join_source("from*")) + + Optional(WHERE + expr) + + Optional( + GROUP + BY + Group(delimitedList(grouping_term))("group_by_terms") + ) + + Optional(HAVING + expr("having_expr")) + + Optional( + ORDER + BY + Group(delimitedList(ordering_term))("order_by_terms") + ) + + Optional(delimitedList(window_select_clause)) + ) + select_no_with = ungrouped_select_no_with | ( + LPAR + ungrouped_select_no_with + RPAR + ) + select_core = Optional(with_stmt) + select_no_with + grouped_select_core = select_core | (LPAR + select_core + RPAR) + + ungrouped_select_stmt <<= ( + grouped_select_core + + (compound_operator + grouped_select_core)[...] 
+ + Optional( + LIMIT + + (Group(expr + OFFSET + expr) | Group(expr + COMMA + expr) | expr)( + "limit" + ) + ) + )("select") + select_stmt = ( + ungrouped_select_stmt | (LPAR + ungrouped_select_stmt + RPAR) + ) + Optional(SEMI) + + # define comment format, and ignore them + sql_comment = oneOf("-- #") + restOfLine | cStyleComment + select_stmt.ignore(sql_comment) + + def record_with_alias(t): + identifier_list = t.asList() + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._with_aliases.add(tuple(padded_list)) + + with_clause = Group( + identifier.setParseAction(record_with_alias) + + AS + + LPAR + + select_stmt + + RPAR + ) + with_stmt <<= WITH + delimitedList(with_clause) + with_stmt.ignore(sql_comment) + + cls._parser = select_stmt + return cls._parser + + def test(self, sql_stmt, expected_tables, verbose=False): + def print_(*args): + if verbose: + print(*args) + + print_(textwrap.dedent(sql_stmt.strip())) + found_tables = self.get_table_names(sql_stmt) + print_(found_tables) + expected_tables_set = set(expected_tables) + + if expected_tables_set != found_tables: + raise Exception( + f"Test {test_index} failed- expected {expected_tables_set} but got {found_tables}" + ) + print_() + + +if __name__ == "__main__": + # fmt: off + TEST_CASES = [ + [ + """\ + SELECT x FROM y.a, b + """, + [ + (None, "y", "a"), + (None, None, "b"), + ], + ], + [ + """\ + SELECT x FROM y.a JOIN b + """, + [ + (None, "y", "a"), + (None, None, "b"), + ], + ], + [ + """\ + select * from xyzzy where z > 100 + """, + [ + (None, None, "xyzzy"), + ], + ], + [ + """\ + select * from xyzzy where z > 100 order by zz + """, + [ + (None, None, "xyzzy"), + ], + ], + [ + """\ + select * from xyzzy + """, + [ + (None, None, "xyzzy"), + ], + ], + [ + """\ + select z.* from xyzzy + """, + [ + (None, None, "xyzzy"), + ], + ], + [ + """\ + select a, b from test_table where 1=1 and b='yes' + """, + [ + (None, None, "test_table"), + ], + ], + [ + """\ + select a, b from test_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ], + ], + [ + """\ + select z.a, b from test_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ], + ], + [ + """\ + select z.a, b from test_table where 1=1 and b in (select bb from foo) order by b,c desc,d + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ], + ], + [ + """\ + select z.a, b from test_table left join test2_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "test2_table"), + (None, None, "foo"), + ], + ], + [ + """\ + select a, db.table.b as BBB from db.table where 1=1 and BBB='yes' + """, + [ + (None, "db", "table"), + ], + ], + [ + """\ + select a, db.table.b as BBB from test_table,db.table where 1=1 and BBB='yes' + """, + [ + (None, None, "test_table"), + (None, "db", "table"), + ], + ], + [ + """\ + select a, db.table.b as BBB from test_table,db.table where 1=1 and BBB='yes' limit 50 + """, + [ + (None, None, "test_table"), + (None, "db", "table"), + ], + ], + [ + """\ + select a, b from test_table where (1=1 or 2=3) and b='yes' group by zx having b=2 order by 1 + """, + [ + (None, None, "test_table"), + ], + ], + [ + """\ + select + a, + b + # this is a comment + from + test_table + # another comment + where (1=1 or 2=3) and b='yes' + #yup, a comment + group by zx having b=2 order by 1 + """, + [ + (None, None, "test_table"), + ], + ], + [ + """\ + SELECT COUNT(DISTINCT foo) FROM 
bar JOIN baz ON bar.baz_id = baz.id + """, + [ + (None, None, "bar"), + (None, None, "baz"), + ], + ], + [ + """\ + SELECT COUNT(DISTINCT foo) FROM bar, baz WHERE bar.baz_id = baz.id + """, + [ + (None, None, "bar"), + (None, None, "baz"), + ], + ], + [ + """\ + WITH one AS (SELECT id FROM foo) SELECT one.id + """, + [ + (None, None, "foo"), + ], + ], + [ + """\ + WITH one AS (SELECT id FROM foo), two AS (select id FROM bar) SELECT one.id, two.id + """, + [ + (None, None, "foo"), + (None, None, "bar"), + ], + ], + [ + """\ + SELECT x, + RANK() OVER (ORDER BY x ASC) AS rank, + DENSE_RANK() OVER (ORDER BY x ASC) AS dense_rank, + ROW_NUMBER() OVER (PARTITION BY x ORDER BY y) AS row_num + FROM a + """, + [ + (None, None, "a"), + ], + ], + [ + """\ + SELECT x, COUNT(*) OVER ( ORDER BY x + RANGE BETWEEN 2 PRECEDING AND 2 FOLLOWING ) AS count_x + FROM T + """, + [ + (None, None, "T"), + ], + ], + [ + """\ + SELECT firstname, department, startdate, + RANK() OVER ( PARTITION BY department ORDER BY startdate ) AS rank + FROM Employees + """, + [ + (None, None, "Employees"), + ], + ], + # A fragment from https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34' + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', fastest_time) AS fastest_time, + TIMESTAMP_DIFF(finish_time, fastest_time, SECOND) AS delta_in_seconds + FROM ( + SELECT name, + finish_time, + division, + FIRST_VALUE(finish_time) + OVER (PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS fastest_time + FROM finishers) + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP 
'2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', slowest_time) AS slowest_time, + TIMESTAMP_DIFF(slowest_time, finish_time, SECOND) AS delta_in_seconds + FROM ( + SELECT name, + finish_time, + division, + LAST_VALUE(finish_time) + OVER (PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS slowest_time + FROM finishers) + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', fastest_time) AS fastest_time, + FORMAT_TIMESTAMP('%X', second_fastest) AS second_fastest + FROM ( + SELECT name, + finish_time, + division,finishers, + FIRST_VALUE(finish_time) + OVER w1 AS fastest_time, + NTH_VALUE(finish_time, 2) + OVER w1 as second_fastest + FROM finishers + WINDOW w1 AS ( + PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)) + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 
3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LEAD(name) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS followed_by + FROM finishers + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LEAD(name, 2) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS two_runners_back + FROM finishers + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LAG(name) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS preceding_runner + FROM finishers + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + SELECT + PERCENTILE_CONT(x, 0) OVER() AS min, + PERCENTILE_CONT(x, 0.01) OVER() AS percentile1, + PERCENTILE_CONT(x, 0.5) OVER() AS median, + PERCENTILE_CONT(x, 0.9) OVER() AS percentile90, + PERCENTILE_CONT(x, 1) OVER() AS max + FROM UNNEST([0, 3, NULL, 1, 2]) AS x LIMIT 1 + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """\ + SELECT + x, + PERCENTILE_DISC(x, 0) OVER() AS min, + PERCENTILE_DISC(x, 0.5) OVER() AS median, + PERCENTILE_DISC(x, 1) OVER() AS max + FROM UNNEST(['c', NULL, 'b', 'a']) AS x + """, + [], + ], + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions + [ + """\ + SELECT + TIMESTAMP "2008-12-25 15:30:00 UTC" as original, + TIMESTAMP_ADD(TIMESTAMP "2008-12-25 15:30:00 UTC", INTERVAL 10 MINUTE) AS later + """, + [], + ], + # Previously hosted on https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions, but + # appears to no longer be there + [ + """\ + WITH date_hour_slots AS ( + SELECT + [ + STRUCT( + " 00:00:00 UTC" as hrs, + 
GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 01:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 02:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 03:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 04:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 05:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 06:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 07:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 08:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY ) as dt_range), + STRUCT( + " 09:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 10:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 11:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 12:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 13:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 14:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 15:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 16:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 17:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 18:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 19:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 20:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 21:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 22:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 23:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range) + ] + AS full_timestamps) + SELECT + dt AS dates, hrs, CAST(CONCAT( CAST(dt as STRING), CAST(hrs as STRING)) as TIMESTAMP) as timestamp_value + FROM `date_hour_slots`, date_hour_slots.full_timestamps LEFT JOIN full_timestamps.dt_range as dt + """, + [ + (None, "date_hour_slots", "full_timestamps"), + (None, "full_timestamps", "dt_range"), + ], + ], + [ + """\ + SELECT + [foo], + ARRAY[foo], + ARRAY[foo, bar], + STRUCT(1, 3), + STRUCT(2, 'foo'), + current_date(), + GENERATE_ARRAY(5, NULL, 1), + GENERATE_DATE_ARRAY('2016-10-05', '2016-10-01', INTERVAL 1 DAY), + GENERATE_DATE_ARRAY('2016-10-05', NULL), + GENERATE_DATE_ARRAY('2016-01-01', '2016-12-31', INTERVAL 2 MONTH), + GENERATE_DATE_ARRAY('2000-02-01',current_date(), INTERVAL 1 DAY), + 
GENERATE_TIMESTAMP_ARRAY('2016-10-05 00:00:00', '2016-10-05 00:00:02', INTERVAL 1 SECOND) + FROM + bar + """, + [ + (None, None, "bar"), + ], + ], + [ + """\ + SELECT GENERATE_ARRAY(start, 5) AS example_array + FROM UNNEST([3, 4, 5]) AS start + """, + [], + ], + [ + """\ + WITH StartsAndEnds AS ( + SELECT DATE '2016-01-01' AS date_start, DATE '2016-01-31' AS date_end + UNION ALL SELECT DATE "2016-04-01", DATE "2016-04-30" + UNION ALL SELECT DATE "2016-07-01", DATE "2016-07-31" + UNION ALL SELECT DATE "2016-10-01", DATE "2016-10-31" + ) + SELECT GENERATE_DATE_ARRAY(date_start, date_end, INTERVAL 1 WEEK) AS date_range + FROM StartsAndEnds + """, + [], + ], + [ + """\ + SELECT GENERATE_TIMESTAMP_ARRAY(start_timestamp, end_timestamp, INTERVAL 1 HOUR) + AS timestamp_array + FROM + (SELECT + TIMESTAMP '2016-10-05 00:00:00' AS start_timestamp, + TIMESTAMP '2016-10-05 02:00:00' AS end_timestamp + UNION ALL + SELECT + TIMESTAMP '2016-10-05 12:00:00' AS start_timestamp, + TIMESTAMP '2016-10-05 14:00:00' AS end_timestamp + UNION ALL + SELECT + TIMESTAMP '2016-10-05 23:59:00' AS start_timestamp, + TIMESTAMP '2016-10-06 01:59:00' AS end_timestamp) + """, + [], + ], + [ + """\ + SELECT DATE_SUB(current_date("-08:00"), INTERVAL 2 DAY) + """, + [], + ], + [ + """\ + SELECT + case when (a) then b else c end + FROM d + """, + [ + (None, None, "d"), + ], + ], + [ + """\ + SELECT + e, + case when (f) then g else h end + FROM i + """, + [ + (None, None, "i"), + ], + ], + [ + """\ + SELECT + case when j then k else l end + FROM m + """, + [ + (None, None, "m",), + ], + ], + [ + """\ + SELECT + n, + case when o then p else q end + FROM r + """, + [ + (None, None, "r"), + ], + ], + [ + """\ + SELECT + case s when (t) then u else v end + FROM w + """, + [ + (None, None, "w"), + ], + ], + [ + """\ + SELECT + x, + case y when (z) then aa else ab end + FROM ac + """, + [ + (None, None, "ac"), + ], + ], + [ + """\ + SELECT + case ad when ae then af else ag end + FROM ah + """, + [ + (None, None, "ah"), + ], + ], + [ + """\ + SELECT + ai, + case aj when ak then al else am end + FROM an + """, + [ + (None, None, "an"), + ], + ], + [ + """\ + WITH + ONE AS (SELECT x FROM y), + TWO AS (select a FROM b) + SELECT y FROM onE JOIN TWo + """, + [ + (None, None, "y"), + (None, None, "b"), + ], + ], + [ + """\ + SELECT + a, + (SELECT b FROM oNE) + FROM OnE + """, + [ + (None, None, "oNE"), + (None, None, "OnE"), + ], + ], + [ + """\ + SELECT * FROM `a.b.c` + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM `b.c` + """, + [ + (None, "b", "c"), + ], + ], + [ + """\ + SELECT * FROM `c` + """, + [ + (None, None, "c"), + ], + ], + [ + """\ + SELECT * FROM a.b.c + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM "a"."b"."c" + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM 'a'.'b'.'c' + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM `a`.`b`.`c` + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM "a.b.c" + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM 'a.b.c' + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM `a.b.c` + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT t2.a + FROM t2 FOR SYSTEM_TIME AS OF t1.timestamp_column + """, + [ + (None, None, "t2"), + ], + ], + [ + """\ + SELECT * + FROM t1 + WHERE t1.a IN (SELECT t2.a + FROM t2 FOR SYSTEM_TIME AS OF t1.timestamp_column) + """, + [ + (None, None, "t1"), + (None, None, "t2"), + ], + ], + [ + """\ + WITH a AS (SELECT b FROM c) + SELECT d FROM A JOIN e ON f = 
g JOIN E ON h = i + """, + [ + (None, None, "c"), + (None, None, "e"), + (None, None, "E"), + ], + ], + [ + """\ + with + a as ( + ( + select b from + ( + select c from d + ) + Union all + ( + select e from f + ) + ) + ) + + select g from h + """, + [ + (None, None, "d"), + (None, None, "f"), + (None, None, "h"), + ], + ], + [ + """\ + select + a AS ESCAPE, + b AS CURRENT_TIME, + c AS CURRENT_DATE, + d AS CURRENT_TIMESTAMP, + e AS DATE_ADD + FROM x + """, + [ + (None, None, "x"), + ], + ], + [ + """\ + WITH x AS ( + SELECT a + FROM b + WINDOW w as (PARTITION BY a) + ) + SELECT y FROM z + """, + [ + (None, None, "b"), + (None, None, "z") + ], + ], + [ + """\ + SELECT DISTINCT + FIRST_VALUE(x IGNORE NULLS) OVER (PARTITION BY y) + FROM z + """, + [ + (None, None, "z") + ], + ], + [ + """\ + SELECT a . b . c + FROM d + """, + [ + (None, None, "d") + ], + ], + [ + """\ + WITH a AS ( + SELECT b FROM c + UNION ALL + ( + WITH d AS ( + SELECT e FROM f + ) + SELECT g FROM d + ) + ) + SELECT h FROM a + """, + [ + (None, None, "c"), + (None, None, "f") + ], + ], + [ + """\ + WITH a AS ( + SELECT b FROM c + UNION ALL + ( + WITH d AS ( + SELECT e FROM f + ) + SELECT g FROM d + ) + ) + (SELECT h FROM a) + """, + [ + (None, None, "c"), + (None, None, "f") + ], + ], + [ + """\ + SELECT * FROM a.b.`c` + """, + [ + ("a", "b", "c"), + ], + ], + [ + """\ + SELECT * FROM 'a'.b.`c` + """, + [ + ("a", "b", "c"), + ], + ], + # from https://cloud.google.com/bigquery/docs/reference/legacy-sql + [ + """\ + SELECT + word, + word_count, + RANK() OVER (PARTITION BY corpus ORDER BY word_count DESC) rank, + FROM + [bigquery-public-data:samples.shakespeare] + WHERE + corpus='othello' and length(word) > 10 + LIMIT 5 + """, + [ + (None, 'bigquery-public-data:samples', 'shakespeare'), + ], + ], + [ + """\ + SELECT + word, + word_count, + RATIO_TO_REPORT(word_count) OVER (PARTITION BY corpus ORDER BY word_count DESC) r_to_r, + FROM + [bigquery-public-data:samples.shakespeare] + WHERE + corpus='othello' and length(word) > 10 + LIMIT 5 + """, + [ + (None, 'bigquery-public-data:samples', 'shakespeare'), + ], + ], + [ + """\ + SELECT + word, + word_count, + ROW_NUMBER() OVER (PARTITION BY corpus ORDER BY word_count DESC) row_num, + FROM + [bigquery-public-data:samples.shakespeare] + WHERE + corpus='othello' and length(word) > 10 + LIMIT 5 + """, + [ + (None, 'bigquery-public-data:samples', 'shakespeare'), + ], + ], + [ + """\ + SELECT + TO_BASE64(SHA1(title)) + FROM + [bigquery-public-data:samples.wikipedia] + LIMIT + 100; + """, + [ + (None, 'bigquery-public-data:samples', 'wikipedia'), + ], + ], + [ + """\ + SELECT + CASE + WHEN state IN ('WA', 'OR', 'CA', 'AK', 'HI', 'ID', + 'MT', 'WY', 'NV', 'UT', 'CO', 'AZ', 'NM') + THEN 'West' + WHEN state IN ('OK', 'TX', 'AR', 'LA', 'TN', 'MS', 'AL', + 'KY', 'GA', 'FL', 'SC', 'NC', 'VA', 'WV', + 'MD', 'DC', 'DE') + THEN 'South' + WHEN state IN ('ND', 'SD', 'NE', 'KS', 'MN', 'IA', + 'MO', 'WI', 'IL', 'IN', 'MI', 'OH') + THEN 'Midwest' + WHEN state IN ('NY', 'PA', 'NJ', 'CT', + 'RI', 'MA', 'VT', 'NH', 'ME') + THEN 'Northeast' + ELSE 'None' + END as region, + average_mother_age, + average_father_age, + state, year + FROM + (SELECT + year, state, + SUM(mother_age)/COUNT(mother_age) as average_mother_age, + SUM(father_age)/COUNT(father_age) as average_father_age + FROM + [bigquery-public-data:samples.natality] + WHERE + father_age < 99 + GROUP BY + year, state) + ORDER BY + year + LIMIT 5; + """, + [ + (None, 'bigquery-public-data:samples', 'natality'), + ], + ], + [ + """\ + SELECT + /* Replace 
white spaces in the title with underscores. */ + REGEXP_REPLACE(title, r'\s+', '_') AS regexp_title, revisions + FROM + (SELECT title, COUNT(revision_id) as revisions + FROM + [bigquery-public-data:samples.wikipedia] + WHERE + wp_namespace=0 + /* Match titles that start with 'G', end with + * 'e', and contain at least two 'o's. + */ + AND REGEXP_MATCH(title, r'^G.*o.*o.*e$') + GROUP BY + title + ORDER BY + revisions DESC + LIMIT 100);""", + [ + (None, 'bigquery-public-data:samples', 'wikipedia'), + ], + ], + [ + """\ + SELECT + page_title, + /* Populate these columns as True or False, */ + /* depending on the condition */ + IF (page_title CONTAINS 'search', + INTEGER(total), 0) AS search, + IF (page_title CONTAINS 'Earth' OR + page_title CONTAINS 'Maps', INTEGER(total), 0) AS geo, + FROM + /* Subselect to return top revised Wikipedia articles */ + /* containing 'Google', followed by additional text. */ + (SELECT + TOP (title, 5) as page_title, + COUNT (*) as total + FROM + [bigquery-public-data:samples.wikipedia] + WHERE + REGEXP_MATCH (title, r'^Google.+') AND wp_namespace = 0 + ); + """, + [ + (None, 'bigquery-public-data:samples', 'wikipedia'), + ] + ], + [ + """\ + SELECT + title, + HASH(title) AS hash_value, + IF(ABS(HASH(title)) % 2 == 1, 'True', 'False') + AS included_in_sample + FROM + [bigquery-public-data:samples.wikipedia] + WHERE + wp_namespace = 0 + LIMIT 5; + """, + [ + (None, 'bigquery-public-data:samples', 'wikipedia'), + ] + ], + [ + """\ + with t as (select CASE when EXTRACT(dayofweek FROM CURRENT_DATETIME()) == 1 then "S" end) select * from t + """, + [], + ], + ] + # fmt: on + + parser = BigQueryViewParser() + for test_index, test_case in enumerate(TEST_CASES): + sql, expected = test_case + parser.test(sql_stmt=sql, expected_tables=expected, verbose=True) diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py new file mode 100644 index 00000000..cefba016 --- /dev/null +++ b/examples/booleansearchparser.py @@ -0,0 +1,451 @@ +""" +Boolean Search query parser (Based on searchparser: https://github.com/pyparsing/pyparsing/blob/master/examples/searchparser.py) + +version 2018-07-22 + +This search query parser uses the excellent Pyparsing module +(http://pyparsing.sourceforge.net/) to parse search queries by users. +It handles: + +* 'and', 'or' and implicit 'and' operators; +* parentheses; +* quoted strings; +* wildcards at the end of a search term (help*); +* wildcards at the beginning of a search term (*lp); +* non-western languages + +Requirements: +* Python +* Pyparsing + +SAMPLE USAGE: +from booleansearchparser import BooleanSearchParser +from __future__ import print_function +bsp = BooleanSearchParser() +text = u"wildcards at the beginning of a search term " +exprs= [ + u"*cards and term", #True + u"wild* and term", #True + u"not terms", #True + u"terms or begin", #False +] +for expr in exprs: + print (bsp.match(text,expr)) + +#non-western samples +text = u"안녕하세요, 당신은 어떠세요?" +exprs= [ + u"*신은 and 어떠세요", #True + u"not 당신은", #False + u"당신 or 당", #False +] +for expr in exprs: + print (bsp.match(text,expr)) +------------------------------------------------------------------------------- +Copyright (c) 2006, Estrate, the Netherlands +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of Estrate nor the names of its contributors may be used
+  to endorse or promote products derived from this software without specific
+  prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTORS:
+- Steven Mooij
+- Rudolph Froger
+- Paul McGuire
+- Guiem Bosch
+- Francesc Garcia
+
+TODO:
+- add more docs
+- ask someone to check my English texts
+- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
+
+"""
+from pyparsing import (
+    Word,
+    alphanums,
+    CaselessKeyword,
+    Group,
+    Forward,
+    Suppress,
+    OneOrMore,
+    one_of,
+)
+import re
+
+
+# Updated on 02 Dec 2021 according to ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt
+# (includes characters not found in the BasicMultilingualPlane)
+alphabet_ranges = [
+    # CYRILLIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
+    [int("0400", 16), int("04FF", 16)],
+    # ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F))
+    [int("0600", 16), int("07FF", 16)],
+    # THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
+    [int("0E00", 16), int("0E7F", 16)],
+    # JAPANESE: https://en.wikipedia.org/wiki/Japanese_writing_system (Hiragana (3040–309F) + Katakana (30A0–30FF))
+    [int("3040", 16), int("30FF", 16)],
+    # Enclosed CJK Letters and Months
+    [int("3200", 16), int("32FF", 16)],
+    # CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    [int("4E00", 16), int("9FFF", 16)],
+    # KOREAN: https://en.wikipedia.org/wiki/Hangul
+    [int("1100", 16), int("11FF", 16)],
+    [int("3130", 16), int("318F", 16)],
+    [int("A960", 16), int("A97F", 16)],
+    [int("AC00", 16), int("D7AF", 16)],
+    [int("D7B0", 16), int("D7FF", 16)],
+    # Halfwidth and Fullwidth Forms
+    [int("FF00", 16), int("FFEF", 16)],
+]
+
+
+class BooleanSearchParser:
+    def __init__(self, only_parse=False):
+        self._methods = {
+            "and": self.evaluateAnd,
+            "or": self.evaluateOr,
+            "not": self.evaluateNot,
+            "parenthesis": self.evaluateParenthesis,
+            "quotes": self.evaluateQuotes,
+            "word": self.evaluateWord,
+            "wordwildcardprefix": self.evaluateWordWildcardPrefix,
+            "wordwildcardsufix": self.evaluateWordWildcardSufix,
+        }
+        self._parser = self.parser()
+        self.text = ""
+        self.words = []
+
+    def parser(self):
+        """
+        This function returns a parser.
+        The grammar should be like most full text search engines (Google, Tsearch, Lucene).
+
+        Grammar:
+        - a query consists of alphanumeric words, with an optional '*'
+          wildcard at the end or the beginning of a word
+        - a sequence of words between quotes is a literal string
+        - words can be used together by using operators ('and' or 'or')
+        - words with operators can be grouped with parentheses
+        - a word or group of words can be preceded by a 'not' operator
+        - the 'and' operator takes precedence over the 'or' operator
+        - if an operator is missing, use an 'and' operator
+        """
+        operatorOr = Forward()
+
+        alphabet = alphanums
+
+        # support for non-western alphabets
+        for lo, hi in alphabet_ranges:
+            alphabet += "".join(chr(c) for c in range(lo, hi + 1) if not chr(c).isspace())
+
+        operatorWord = Group(Word(alphabet + "*")).set_results_name("word*")
+
+        operatorQuotesContent = Forward()
+        operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord)
+
+        operatorQuotes = (
+            Group(Suppress('"') + operatorQuotesContent + Suppress('"')).set_results_name(
+                "quotes"
+            )
+            | operatorWord
+        )
+
+        operatorParenthesis = (
+            Group(Suppress("(") + operatorOr + Suppress(")")).set_results_name(
+                "parenthesis"
+            )
+            | operatorQuotes
+        )
+
+        operatorNot = Forward()
+        operatorNot << (
+            Group(Suppress(CaselessKeyword("not")) + operatorNot).set_results_name(
+                "not"
+            )
+            | operatorParenthesis
+        )
+
+        operatorAnd = Forward()
+        operatorAnd << (
+            Group(
+                operatorNot + Suppress(CaselessKeyword("and")) + operatorAnd
+            ).set_results_name("and")
+            | Group(
+                operatorNot + OneOrMore(~one_of("and or") + operatorAnd)
+            ).set_results_name("and")
+            | operatorNot
+        )
+
+        operatorOr << (
+            Group(
+                operatorAnd + Suppress(CaselessKeyword("or")) + operatorOr
+            ).set_results_name("or")
+            | operatorAnd
+        )
+
+        return operatorOr.parse_string
+
+    def evaluateAnd(self, argument):
+        return all(self.evaluate(arg) for arg in argument)
+
+    def evaluateOr(self, argument):
+        return any(self.evaluate(arg) for arg in argument)
+
+    def evaluateNot(self, argument):
+        return self.GetNot(self.evaluate(argument[0]))
+
+    def evaluateParenthesis(self, argument):
+        return self.evaluate(argument[0])
+
+    def evaluateQuotes(self, argument):
+        """Evaluate quoted strings
+
+        First it does an 'and' on the individual search terms, then it asks the
+        function GetQuotes to only return the subset of IDs that contain the
+        literal string.
+ """ + # r = set() + r = False + search_terms = [] + for item in argument: + search_terms.append(item[0]) + r = r and self.evaluate(item) + return self.GetQuotes(" ".join(search_terms), r) + + def evaluateWord(self, argument): + wildcard_count = argument[0].count("*") + if wildcard_count > 0: + if wildcard_count == 1 and argument[0].startswith("*"): + return self.GetWordWildcard(argument[0][1:], method="endswith") + if wildcard_count == 1 and argument[0].endswith("*"): + return self.GetWordWildcard(argument[0][:-1], method="startswith") + else: + _regex = argument[0].replace("*", ".+") + matched = False + for w in self.words: + matched = bool(re.search(_regex, w)) + if matched: + break + return matched + + return self.GetWord(argument[0]) + + def evaluateWordWildcardPrefix(self, argument): + return self.GetWordWildcard(argument[0], method="endswith") + + def evaluateWordWildcardSufix(self, argument): + return self.GetWordWildcard(argument[0], method="startswith") + + def evaluate(self, argument): + return self._methods[argument.getName()](argument) + + def Parse(self, query): + return self.evaluate(self._parser(query)[0]) + + def GetWord(self, word): + return word in self.words + + def GetWordWildcard(self, word, method="startswith"): + matched = False + for w in self.words: + matched = getattr(w, method)(word) + if matched: + break + return matched + + """ + def GetKeyword(self, name, value): + return set() + + def GetBetween(self, min, max): + print (min,max) + return set() + """ + + def GetQuotes(self, search_string, tmp_result): + return search_string in self.text + + def GetNot(self, not_set): + return not not_set + + def _split_words(self, text): + words = [] + """ + >>> import string + >>> string.punctuation + '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' + """ + # it will keep @, # and + # usernames and hashtags can contain dots, so a double check is done + r = re.compile(r"[\s{}]+".format(re.escape("!\"$%&'()*+,-/:;<=>?[\\]^`{|}~"))) + _words = r.split(text) + for _w in _words: + if "." in _w and not _w.startswith("#") and not _w.startswith("@"): + for __w in _w.split("."): + words.append(__w) + continue + + words.append(_w) + + return words + + def match(self, text, expr): + self.text = text + self.words = self._split_words(text) + + return self.Parse(expr) + + +class ParserTest(BooleanSearchParser): + """Tests the parser with some search queries + tests contains a dictionary with tests and expected results. 
+ """ + + def Test(self): + # fmt: off + exprs = { + "0": "help", + "1": "help or hulp", + "2": "help and hulp", + "3": "help hulp", + "4": "help and hulp or hilp", + "5": "help or hulp and hilp", + "6": "help or hulp or hilp or halp", + "7": "(help or hulp) and (hilp or halp)", + "8": "help and (hilp or halp)", + "9": "(help and (hilp or halp)) or hulp", + "10": "not help", + "11": "not hulp and halp", + "12": "not (help and halp)", + "13": '"help me please"', + "14": '"help me please" or hulp', + "15": '"help me please" or (hulp and halp)', + "16": "help*", + "17": "help or hulp*", + "18": "help* and hulp", + "19": "help and hulp* or hilp", + "20": "help* or hulp or hilp or halp", + "21": "(help or hulp*) and (hilp* or halp)", + "22": "help* and (hilp* or halp*)", + "23": "(help and (hilp* or halp)) or hulp*", + "24": "not help* and halp", + "25": "not (help* and helpe*)", + "26": '"help* me please"', + "27": '"help* me* please" or hulp*', + "28": '"help me please*" or (hulp and halp)', + "29": '"help me please" not (hulp and halp)', + "30": '"help me please" hulp', + "31": "help and hilp and not holp", + "32": "help hilp not holp", + "33": "help hilp and not holp", + "34": "*lp and halp", + "35": "*신은 and 어떠세요", + } + + texts_matcheswith = { + "halp thinks he needs help": [ + "25", "22", "20", "21", "11", "17", "16", "23", "34", "1", + "0", "5", "7", "6", "9", "8", + ], + "he needs halp": ["24", "25", "20", "11", "10", "12", "34", "6"], + "help": ["25", "20", "12", "17", "16", "1", "0", "5", "6"], + "help hilp": [ + "25", "22", "20", "32", "21", "12", "17", "16", "19", "31", + "23", "1", "0", "5", "4", "7", "6", "9", "8", "33", + ], + "help me please hulp": [ + "30", "25", "27", "20", "13", "12", "15", "14", "17", "16", + "19", "18", "23", "29", "1", "0", "3", "2", "5", "4", "6", "9", + ], + "helper": ["20", "10", "12", "16"], + "hulp hilp": [ + "25", "27", "20", "21", "10", "12", "14", "17", "19", "23", + "1", "5", "4", "7", "6", "9", + ], + "nothing": ["25", "10", "12"], + "안녕하세요, 당신은 어떠세요?": ["10", "12", "25", "35"], + } + # fmt: on + + all_ok = True + for text, matches in texts_matcheswith.items(): + _matches = [] + for _id, expr in exprs.items(): + if self.match(text, expr): + _matches.append(_id) + + test_passed = sorted(matches) == sorted(_matches) + if test_passed: + print("Passed", repr(text)) + else: + print("Failed", repr(text), "expected", matches, "matched", _matches) + + all_ok = all_ok and test_passed + + # Tests for non western characters, should fail with + # pyparsing.exceptions.ParseException under the previous + # configuration + non_western_exprs = { + "0": "*", + "1": "ヿ", # Edge character + "2": "亀", # Character in CJK block + "3": "ヿ or 亀", + "4": "ヿ and 亀", + "5": "not ヿ" + } + + non_western_texts_matcheswith = { + "안녕하세요, 당신은 어떠세요?": ["0", "5"], + "ヿ": ["0", "1", "3"], + "亀": ["0", "2", "3", "5"], + "亀 ヿ": ["0", "1", "2", "3", "4"], + } + + for text, matches in non_western_texts_matcheswith.items(): + _matches = [] + for _id, expr in non_western_exprs.items(): + if self.match(text, expr): + _matches.append(_id) + + test_passed = sorted(matches) == sorted(_matches) + if test_passed: + print("Passed", repr(text)) + else: + print("Failed", repr(text), "expected", matches, "matched", _matches) + + all_ok = all_ok and test_passed + + return all_ok + + +def main(): + if ParserTest().Test(): + print("All tests OK") + else: + print("One or more tests FAILED") + raise Exception("One or more tests FAILED") + + +if __name__ == "__main__": + main() diff --git 
a/examples/btpyparse.py b/examples/btpyparse.py index 4fbbac81..3531761d 100644 --- a/examples/btpyparse.py +++ b/examples/btpyparse.py @@ -10,28 +10,40 @@ Simplified BSD license """ -from pyparsing import (Regex, Suppress, ZeroOrMore, Group, Optional, Forward, - SkipTo, CaselessLiteral, Dict) +from pyparsing import ( + Regex, + Suppress, + ZeroOrMore, + Group, + Optional, + Forward, + SkipTo, + CaselessLiteral, + Dict, +) + + +class Macro: + """Class to encapsulate undefined macro references""" - -class Macro(object): - """ Class to encapsulate undefined macro references """ def __init__(self, name): self.name = name + def __repr__(self): return 'Macro("%s")' % self.name + def __eq__(self, other): return self.name == other.name - def __ne__(self, other): - return self.name != other.name # Character literals -LCURLY,RCURLY,LPAREN,RPAREN,QUOTE,COMMA,AT,EQUALS,HASH = map(Suppress,'{}()",@=#') +LCURLY, RCURLY, LPAREN, RPAREN, QUOTE, COMMA, AT, EQUALS, HASH = map( + Suppress, '{}()",@=#' +) def bracketed(expr): - """ Return matcher for `expr` between curly brackets or parentheses """ + """Return matcher for `expr` between curly brackets or parentheses""" return (LPAREN + expr + RPAREN) | (LCURLY + expr + RCURLY) @@ -50,31 +62,30 @@ def bracketed(expr): quoted_string = QUOTE + ZeroOrMore(quoted_item) + QUOTE # Numbers can just be numbers. Only integers though. -number = Regex('[0-9]+') +number = Regex("[0-9]+") # Basis characters (by exclusion) for variable / field names. The following # list of characters is from the btparse documentation -any_name = Regex('[^\\s"#%\'(),={}]+') +any_name = Regex("[^\\s\"#%'(),={}]+") # btparse says, and the test bibs show by experiment, that macro and field names # cannot start with a digit. In fact entry type names cannot start with a digit # either (see tests/bibs). 
Cite keys can start with a digit -not_digname = Regex('[^\\d\\s"#%\'(),={}][^\\s"#%\'(),={}]*') +not_digname = Regex("[^\\d\\s\"#%'(),={}][^\\s\"#%'(),={}]*") # Comment comments out to end of line -comment = (AT + CaselessLiteral('comment') + - Regex(r"[\s{(].*").leaveWhitespace()) +comment = AT + CaselessLiteral("comment") + Regex(r"[\s{(].*").leaveWhitespace() # The name types with their digiteyness not_dig_lower = not_digname.copy().setParseAction(lambda t: t[0].lower()) macro_def = not_dig_lower.copy() -macro_ref = not_dig_lower.copy().setParseAction(lambda t : Macro(t[0].lower())) +macro_ref = not_dig_lower.copy().setParseAction(lambda t: Macro(t[0].lower())) field_name = not_dig_lower.copy() # Spaces in names mean they cannot clash with field names -entry_type = not_dig_lower('entry_type') -cite_key = any_name('cite_key') +entry_type = not_dig_lower("entry_type") +cite_key = any_name("cite_key") # Number has to be before macro name -string = (number | macro_ref | quoted_string | curly_string) +string = number | macro_ref | quoted_string | curly_string # There can be hash concatenation field_value = string + ZeroOrMore(HASH + string) @@ -82,25 +93,21 @@ def bracketed(expr): entry_contents = Dict(ZeroOrMore(field_def + COMMA) + Optional(field_def)) # Entry is surrounded either by parentheses or curlies -entry = (AT + entry_type + bracketed(cite_key + COMMA + entry_contents)) +entry = AT + entry_type + bracketed(cite_key + COMMA + entry_contents) # Preamble is a macro-like thing with no name -preamble = AT + CaselessLiteral('preamble') + bracketed(field_value) +preamble = AT + CaselessLiteral("preamble") + bracketed(field_value) # Macros (aka strings) macro_contents = macro_def + EQUALS + field_value -macro = AT + CaselessLiteral('string') + bracketed(macro_contents) +macro = AT + CaselessLiteral("string") + bracketed(macro_contents) # Implicit comments -icomment = SkipTo('@').setParseAction(lambda t : t.insert(0, 'icomment')) +icomment = SkipTo("@").setParseAction(lambda t: t.insert(0, "icomment")) # entries are last in the list (other than the fallback) because they have # arbitrary start patterns that would match comments, preamble or macro -definitions = Group(comment | - preamble | - macro | - entry | - icomment) +definitions = Group(comment | preamble | macro | entry | icomment) # Start symbol bibfile = ZeroOrMore(definitions) @@ -110,7 +117,7 @@ def parse_str(str): return bibfile.parseString(str) -if __name__ == '__main__': +if __name__ == "__main__": # Run basic test txt = """ Some introductory text @@ -126,4 +133,4 @@ def parse_str(str): number = {2} } """ - print('\n\n'.join(defn.dump() for defn in parse_str(txt))) + print("\n\n".join(defn.dump() for defn in parse_str(txt))) diff --git a/examples/builtin_parse_action_demo.py b/examples/builtin_parse_action_demo.py index 3ec6af8d..fed6e2a3 100644 --- a/examples/builtin_parse_action_demo.py +++ b/examples/builtin_parse_action_demo.py @@ -5,14 +5,13 @@ # Simple example of using builtin functions as parse actions. # -from pyparsing import * - -integer = Word(nums).setParseAction(lambda t : int(t[0])) +import pyparsing as pp +ppc = pp.common # make an expression that will match a list of ints (which # will be converted to actual ints by the parse action attached # to integer) -nums = OneOrMore(integer) +nums = ppc.integer[...] 
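+# For example, ppc.integer[...].parse_string("1 2 3") yields [1, 2, 3]
+# (ppc.integer converts each matched token to an int with a built-in parse action).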
test = "2 54 34 2 211 66 43 2 0" @@ -20,10 +19,9 @@ # try each of these builtins as parse actions for fn in (sum, max, min, len, sorted, reversed, list, tuple, set, any, all): - fn_name = fn.__name__ if fn is reversed: # reversed returns an iterator, we really want to show the list of items - fn = lambda x : list(reversed(x)) + fn = lambda x: list(reversed(x)) # show how each builtin works as a free-standing parse action - print(fn_name, nums.setParseAction(fn).parseString(test)) + print(fn.__name__, nums.set_parse_action(fn).parse_string(test)) diff --git a/examples/cLibHeader.py b/examples/cLibHeader.py index 6bb1c25a..10a0c770 100644 --- a/examples/cLibHeader.py +++ b/examples/cLibHeader.py @@ -6,7 +6,17 @@ # Copyright, 2012 - Paul McGuire # -from pyparsing import Word, alphas, alphanums, Combine, oneOf, Optional, delimitedList, Group, Keyword +from pyparsing import ( + Word, + alphas, + alphanums, + Combine, + oneOf, + Optional, + delimitedList, + Group, + Keyword, +) testdata = """ int func1(float *vec, int len, double arg1); @@ -14,12 +24,12 @@ """ ident = Word(alphas, alphanums + "_") -vartype = Combine( oneOf("float double int char") + Optional(Word("*")), adjacent = False) +vartype = Combine(oneOf("float double int char") + Optional(Word("*")), adjacent=False) arglist = delimitedList(Group(vartype("type") + ident("name"))) functionCall = Keyword("int") + ident("name") + "(" + arglist("args") + ")" + ";" -for fn,s,e in functionCall.scanString(testdata): +for fn, s, e in functionCall.scanString(testdata): print(fn.name) for a in fn.args: print(" - %(name)s (%(type)s)" % a) diff --git a/examples/chemicalFormulas.py b/examples/chemicalFormulas.py deleted file mode 100644 index 753901b8..00000000 --- a/examples/chemicalFormulas.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- -# -# chemicalFormulas.py -# -# Copyright (c) 2003,2019 Paul McGuire -# - -import pyparsing as pp - -atomicWeight = { - "O" : 15.9994, - "H" : 1.00794, - "Na" : 22.9897, - "Cl" : 35.4527, - "C" : 12.0107, - } - -digits = "0123456789" - -# Version 1 -element = pp.Word(pp.alphas.upper(), pp.alphas.lower(), max=2) -# for stricter matching, use this Regex instead -# element = Regex("A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|" -# "E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|" -# "M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|" -# "S[bcegimnr]?|T[abcehilm]|U(u[bhopqst])?|V|W|Xe|Yb?|Z[nr]") -elementRef = pp.Group(element + pp.Optional(pp.Word(digits), default="1")) -formula = elementRef[...] - -fn = lambda elemList : sum(atomicWeight[elem]*int(qty) for elem,qty in elemList) -formula.runTests("""\ - H2O - C6H5OH - NaCl - """, - fullDump=False, postParse=lambda _, tokens: "Molecular weight: {0}".format(fn(tokens))) -print() - -# Version 2 - access parsed items by results name -elementRef = pp.Group(element("symbol") + pp.Optional(pp.Word(digits), default="1")("qty")) -formula = elementRef[...] - -fn = lambda elemList : sum(atomicWeight[elem.symbol]*int(elem.qty) for elem in elemList) -formula.runTests("""\ - H2O - C6H5OH - NaCl - """, - fullDump=False, postParse=lambda _, tokens: "Molecular weight: {0}".format(fn(tokens))) -print() - -# Version 3 - convert integers during parsing process -integer = pp.Word(digits).setParseAction(lambda t:int(t[0])) -elementRef = pp.Group(element("symbol") + pp.Optional(integer, default=1)("qty")) -formula = elementRef[...] 
- -fn = lambda elemList : sum(atomicWeight[elem.symbol]*elem.qty for elem in elemList) -formula.runTests("""\ - H2O - C6H5OH - NaCl - """, - fullDump=False, postParse=lambda _, tokens: "Molecular weight: {0}".format(fn(tokens))) -print() - -# Version 4 - parse and convert integers as subscript digits -subscript_digits = "₀₁₂₃₄₅₆₇₈₉" -subscript_int_map = dict((e[1], e[0]) for e in enumerate(subscript_digits)) -def cvt_subscript_int(s): - ret = 0 - for c in s[0]: - ret = ret*10 + subscript_int_map[c] - return ret -subscript_int = pp.Word(subscript_digits).addParseAction(cvt_subscript_int) - -elementRef = pp.Group(element("symbol") + pp.Optional(subscript_int, default=1)("qty")) -formula = elementRef[...] -formula.runTests("""\ - H₂O - C₆H₅OH - NaCl - """, - fullDump=False, postParse=lambda _, tokens: "Molecular weight: {0}".format(fn(tokens))) -print() diff --git a/examples/chemical_formulas.py b/examples/chemical_formulas.py new file mode 100644 index 00000000..16d4bb43 --- /dev/null +++ b/examples/chemical_formulas.py @@ -0,0 +1,119 @@ +# +# chemicalFormulas.py +# +# Copyright (c) 2003,2019 Paul McGuire +# + +import pyparsing as pp + +atomic_weight = { + "O": 15.9994, + "H": 1.00794, + "Na": 22.9897, + "Cl": 35.4527, + "C": 12.0107, +} + +digits = "0123456789" + +# Version 1 +element = pp.Word(pp.alphas.upper(), pp.alphas.lower(), max=2).set_name("element") +# for stricter matching, use this Regex instead +# element = Regex("A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|" +# "E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|" +# "M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|" +# "S[bcegimnr]?|T[abcehilm]|U(u[bhopqst])?|V|W|Xe|Yb?|Z[nr]") +element_ref = pp.Group(element + pp.Opt(pp.Word(digits), default="1")) +formula = element_ref[...] + + +def sum_atomic_weights(element_list): + return sum(atomic_weight[elem] * int(qty) for elem, qty in element_list) + + +formula.run_tests( + """\ + H2O + C6H5OH + NaCl + """, + full_dump=False, + post_parse=lambda _, tokens: f"Molecular weight: {sum_atomic_weights(tokens)}", +) +print() + + +# Version 2 - access parsed items by results name +element_ref = pp.Group( + element("symbol") + pp.Opt(pp.Word(digits), default="1")("qty") +) +formula = element_ref[...] 
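# --- editorial aside (not part of the patch) --------------------------------
# Version 2 above names the pieces of each group ("symbol", "qty") so later
# code can read them by attribute instead of by position. A minimal sketch of
# that results-name access pattern, assuming pyparsing 3.x:
import pyparsing as pp

elem = pp.Group(
    pp.Word(pp.alphas.upper(), pp.alphas.lower(), max=2)("symbol")
    + pp.Opt(pp.Word("0123456789"), default="1")("qty")
)
first = elem.parse_string("Na2")[0]
print(first.symbol, first.qty)  # -> Na 2
# -----------------------------------------------------------------------------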
+ + +def sum_atomic_weights_by_results_name(element_list): + return sum(atomic_weight[elem.symbol] * int(elem.qty) for elem in element_list) + + +formula.run_tests( + """\ + H2O + C6H5OH + NaCl + """, + full_dump=False, + post_parse=lambda _, tokens: + f"Molecular weight: {sum_atomic_weights_by_results_name(tokens)}", +) +print() + +# Version 3 - convert integers during parsing process +integer = pp.Word(digits).set_name("integer") +integer.add_parse_action(lambda t: int(t[0])) +element_ref = pp.Group(element("symbol") + pp.Opt(integer, default=1)("qty")) +formula = element_ref[...].set_name("chemical_formula") + + +def sum_atomic_weights_by_results_name_with_converted_ints(element_list): + return sum(atomic_weight[elem.symbol] * int(elem.qty) for elem in element_list) + + +formula.run_tests( + """\ + H2O + C6H5OH + NaCl + """, + full_dump=False, + post_parse=lambda _, tokens: + f"Molecular weight: {sum_atomic_weights_by_results_name_with_converted_ints(tokens)}", +) +print() + +# Version 4 - parse and convert integers as subscript digits +subscript_digits = "₀₁₂₃₄₅₆₇₈₉" +subscript_int_map = {e[1]: e[0] for e in enumerate(subscript_digits)} + + +def cvt_subscript_int(s): + ret = 0 + for c in s[0]: + ret = ret * 10 + subscript_int_map[c] + return ret + + +subscript_int = pp.Word(subscript_digits).set_name("subscript") +subscript_int.add_parse_action(cvt_subscript_int) + +element_ref = pp.Group(element("symbol") + pp.Opt(subscript_int, default=1)("qty")) +formula = element_ref[1, ...].set_name("chemical_formula") +formula.run_tests( + """\ + H₂O + C₆H₅OH + NaCl + """, + full_dump=False, + post_parse=lambda _, tokens: + f"Molecular weight: {sum_atomic_weights_by_results_name_with_converted_ints(tokens)}", +) +print() diff --git a/examples/commasep.py b/examples/commasep.py index 067647dc..c3557b61 100644 --- a/examples/commasep.py +++ b/examples/commasep.py @@ -10,6 +10,7 @@ # import pyparsing as pp + ppc = pp.pyparsing_common testData = [ @@ -19,6 +20,6 @@ "John Doe, 123 Main St., Cleveland, Ohio", "Jane Doe, 456 St. 
James St., Los Angeles , California ", "", - ] +] ppc.comma_separated_list.runTests(testData) diff --git a/examples/configParse.py b/examples/configParse.py index db7b6c70..02727e90 100644 --- a/examples/configParse.py +++ b/examples/configParse.py @@ -6,13 +6,24 @@ # Copyright (c) 2003, Paul McGuire # -from pyparsing import \ - Literal, Word, ZeroOrMore, Group, Dict, Optional, \ - printables, ParseException, restOfLine, empty +from pyparsing import ( + Literal, + Word, + ZeroOrMore, + Group, + Dict, + Optional, + printables, + ParseException, + restOfLine, + empty, +) import pprint inibnf = None + + def inifile_BNF(): global inibnf @@ -22,50 +33,53 @@ def inifile_BNF(): lbrack = Literal("[").suppress() rbrack = Literal("]").suppress() equals = Literal("=").suppress() - semi = Literal(";") + semi = Literal(";") - comment = semi + Optional( restOfLine ) + comment = semi + Optional(restOfLine) - nonrbrack = "".join( [ c for c in printables if c != "]" ] ) + " \t" - nonequals = "".join( [ c for c in printables if c != "=" ] ) + " \t" + nonrbrack = "".join([c for c in printables if c != "]"]) + " \t" + nonequals = "".join([c for c in printables if c != "="]) + " \t" - sectionDef = lbrack + Word( nonrbrack ) + rbrack - keyDef = ~lbrack + Word( nonequals ) + equals + empty + restOfLine + sectionDef = lbrack + Word(nonrbrack) + rbrack + keyDef = ~lbrack + Word(nonequals) + equals + empty + restOfLine # strip any leading or trailing blanks from key def stripKey(tokens): tokens[0] = tokens[0].strip() + keyDef.setParseAction(stripKey) # using Dict will allow retrieval of named data fields as attributes of the parsed results - inibnf = Dict( ZeroOrMore( Group( sectionDef + Dict( ZeroOrMore( Group( keyDef ) ) ) ) ) ) + inibnf = Dict(ZeroOrMore(Group(sectionDef + Dict(ZeroOrMore(Group(keyDef)))))) - inibnf.ignore( comment ) + inibnf.ignore(comment) return inibnf pp = pprint.PrettyPrinter(2) -def test( strng ): + +def test(strng): print(strng) try: iniFile = open(strng) - iniData = "".join( iniFile.readlines() ) + iniData = "".join(iniFile.readlines()) bnf = inifile_BNF() - tokens = bnf.parseString( iniData ) - pp.pprint( tokens.asList() ) + tokens = bnf.parseString(iniData) + pp.pprint(tokens.asList()) except ParseException as err: print(err.line) - print(" "*(err.column-1) + "^") + print(" " * (err.column - 1) + "^") print(err) iniFile.close() print() return tokens + if __name__ == "__main__": - ini = test("setup.ini") - print("ini['Startup']['modemid'] =", ini['Startup']['modemid']) - print("ini.Startup =", ini.Startup) - print("ini.Startup.modemid =", ini.Startup.modemid) + ini = test("setup.ini") + print("ini['Startup']['modemid'] =", ini["Startup"]["modemid"]) + print("ini.Startup =", ini.Startup) + print("ini.Startup.modemid =", ini.Startup.modemid) diff --git a/examples/cpp_enum_parser.py b/examples/cpp_enum_parser.py index ca2c04b7..77eb3a73 100644 --- a/examples/cpp_enum_parser.py +++ b/examples/cpp_enum_parser.py @@ -9,9 +9,10 @@ # # -from pyparsing import * +import pyparsing as pp + # sample string with enums and other stuff -sample = ''' +sample = """ stuff before enum hello { Zero, @@ -31,22 +32,22 @@ zeta = 50 }; at the end - ''' + """ # syntax we don't want to see in the final parse tree -LBRACE,RBRACE,EQ,COMMA = map(Suppress,"{}=,") -_enum = Suppress('enum') -identifier = Word(alphas,alphanums+'_') -integer = Word(nums) -enumValue = Group(identifier('name') + Optional(EQ + integer('value'))) -enumList = Group(enumValue + ZeroOrMore(COMMA + enumValue)) -enum = _enum + identifier('enum') 
+ LBRACE + enumList('names') + RBRACE +LBRACE, RBRACE, EQ, COMMA = pp.Suppress.using_each("{}=,") +_enum = pp.Suppress("enum") +identifier = pp.Word(pp.alphas + "_", pp.alphanums + "_") +integer = pp.Word(pp.nums) +enumValue = pp.Group(identifier("name") + pp.Optional(EQ + integer("value"))) +enumList = pp.Group(enumValue + (COMMA + enumValue)[...]) +enum = _enum + identifier("enum") + LBRACE + enumList("names") + RBRACE # find instances of enums ignoring other syntax -for item,start,stop in enum.scanString(sample): - id = 0 +for item, start, stop in enum.scan_string(sample): + idx = 0 for entry in item.names: - if entry.value != '': - id = int(entry.value) - print('%s_%s = %d' % (item.enum.upper(),entry.name.upper(),id)) - id += 1 + if entry.value != "": + idx = int(entry.value) + print("%s_%s = %d" % (item.enum.upper(), entry.name.upper(), idx)) + idx += 1 diff --git a/examples/cuneiform_python.py b/examples/cuneiform_python.py new file mode 100644 index 00000000..9d4e74d5 --- /dev/null +++ b/examples/cuneiform_python.py @@ -0,0 +1,104 @@ +# +# cuneiform_python.py +# +# Example showing how to create a custom Unicode set for parsing +# +# Copyright Paul McGuire, 2021 +# +from typing import List, Tuple +import pyparsing as pp + + +class Cuneiform(pp.unicode_set): + """Unicode set for Cuneiform Character Range""" + + _ranges: List[Tuple[int, ...]] = [ + (0x10380, 0x103d5), + (0x12000, 0x123FF), + (0x12400, 0x1247F), + ] + + +# list out all valid identifier characters +# print(Cuneiform.identchars) + + +""" +Simple Cuneiform Python language transformer + +Define Cuneiform "words" + print: 𒄑𒉿𒅔𒋫 + hello: 𒀄𒂖𒆷𒁎 + world: 𒍟𒁎𒉿𒆷𒀳 + def: 𒁴𒈫 +""" + +# uncomment to show parse-time debugging +# pp.enable_diag(pp.Diagnostics.enable_debug_on_named_expressions) + +# define a MINIMAL Python parser +LPAR, RPAR, COLON, EQ = map(pp.Suppress, "():=") +def_ = pp.Keyword("𒁴𒈫", ident_chars=Cuneiform.identbodychars).set_name("def") +any_keyword = def_ +ident = (~any_keyword) + pp.Word( + Cuneiform.identchars, Cuneiform.identbodychars, asKeyword=True +) +str_expr = pp.infix_notation( + pp.QuotedString('"') | pp.common.integer, + [ + ("*", 2, pp.OpAssoc.LEFT), + ("+", 2, pp.OpAssoc.LEFT), + ], +) + +rvalue = pp.Forward() +fn_call = (ident + pp.Group(LPAR + pp.Optional(rvalue) + RPAR)).set_name("fn_call") + +rvalue <<= fn_call | ident | str_expr | pp.common.number +assignment_stmt = ident + EQ + rvalue + +stmt = pp.Group(fn_call | assignment_stmt).set_name("stmt") + +fn_def = pp.Group( + def_ + ident + pp.Group(LPAR + pp.Optional(rvalue) + RPAR) + COLON +).set_name("fn_def") +fn_body = pp.IndentedBlock(stmt).set_name("fn_body") +fn_expr = pp.Group(fn_def + pp.Group(fn_body)) + +script = fn_expr[...] + stmt[...] 
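# --- editorial aside (not part of the patch) --------------------------------
# pp.unicode_set derives identchars/identbodychars (and alphas, nums, etc.)
# lazily from the code-point ranges listed in _ranges, which is what lets the
# grammar above treat cuneiform signs as ordinary identifier characters. A
# minimal sketch with a different, smaller range (Greek, chosen only for
# illustration), assuming pyparsing 3.x:
import pyparsing as pp

class Greek(pp.unicode_set):
    _ranges = [(0x0370, 0x03FF)]

greek_word = pp.Word(Greek.alphas)
print(greek_word.parse_string("αβγ"))  # -> ['αβγ']
# -----------------------------------------------------------------------------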
+ + +# parse some Python written in Cuneiform +cuneiform_hello_world = r""" +𒁴𒈫 𒀄𒂖𒆷𒁎(): + 𒀁 = "𒀄𒂖𒆷𒁎, 𒍟𒁎𒉿𒆷𒀳!\n" * 3 + 𒄑𒉿𒅔𒋫(𒀁) + +𒀄𒂖𒆷𒁎()""" +script.parseString(cuneiform_hello_world).pprint(width=40) + + +# use transform_string to convert keywords and builtins to runnable Python +names_map = { + "𒄑𒉿𒅔𒋫": "print", +} +ident.add_parse_action(lambda t: names_map.get(t[0], t[0])) +def_.add_parse_action(lambda: "def") + +print("\nconvert Cuneiform Python to executable Python") +transformed = ( + # always put ident last + (def_ | ident) + .ignore(pp.quoted_string) + .transform_string(cuneiform_hello_world) + .strip() +) +print( + "=================\n" + + cuneiform_hello_world.strip() + + "\n=================\n" + + transformed + + "\n=================\n" +) +print("# run transformed Python") +exec(transformed) diff --git a/examples/datetimeParseActions.py b/examples/datetime_parse_actions.py similarity index 63% rename from examples/datetimeParseActions.py rename to examples/datetime_parse_actions.py index e5ae2b99..ff386562 100644 --- a/examples/datetimeParseActions.py +++ b/examples/datetime_parse_actions.py @@ -1,69 +1,84 @@ -# parseActions.py -# -# A sample program a parser to match a date string of the form "YYYY/MM/DD", -# and return it as a datetime, or raise an exception if not a valid date. -# -# Copyright 2012, Paul T. McGuire -# -from datetime import datetime -import pyparsing as pp -from pyparsing import pyparsing_common as ppc - -# define an integer string, and a parse action to convert it -# to an integer at parse time -integer = pp.Word(pp.nums).setName("integer") -def convertToInt(tokens): - # no need to test for validity - we can't get here - # unless tokens[0] contains all numeric digits - return int(tokens[0]) -integer.setParseAction(convertToInt) -# or can be written as one line as -#integer = Word(nums).setParseAction(lambda t: int(t[0])) - -# define a pattern for a year/month/day date -date_expr = integer('year') + '/' + integer('month') + '/' + integer('day') -date_expr.ignore(pp.pythonStyleComment) - -def convertToDatetime(s,loc,tokens): - try: - # note that the year, month, and day fields were already - # converted to ints from strings by the parse action defined - # on the integer expression above - return datetime(tokens.year, tokens.month, tokens.day).date() - except Exception as ve: - errmsg = "'%s/%s/%s' is not a valid date, %s" % \ - (tokens.year, tokens.month, tokens.day, ve) - raise pp.ParseException(s, loc, errmsg) -date_expr.setParseAction(convertToDatetime) - - -date_expr.runTests("""\ - 2000/1/1 - - # invalid month - 2000/13/1 - - # 1900 was not a leap year - 1900/2/29 - - # but 2000 was - 2000/2/29 - """) - - -# if dates conform to ISO8601, use definitions in pyparsing_common -date_expr = ppc.iso8601_date.setParseAction(ppc.convertToDate()) -date_expr.ignore(pp.pythonStyleComment) - -date_expr.runTests("""\ - 2000-01-01 - - # invalid month - 2000-13-01 - - # 1900 was not a leap year - 1900-02-29 - - # but 2000 was - 2000-02-29 - """) +# parseActions.py +# +# A sample program a parser to match a date string of the form "YYYY/MM/DD", +# and return it as a datetime, or raise an exception if not a valid date. +# +# Copyright 2012, Paul T. 
McGuire +# +from datetime import datetime +import pyparsing as pp +from pyparsing import pyparsing_common as ppc + +# define an integer string, and a parse action to convert it +# to an integer at parse time +integer = pp.Word(pp.nums).set_name("integer") + + +def convert_to_int(tokens): + # no need to test for validity - we can't get here + # unless tokens[0] contains all numeric digits + return int(tokens[0]) + + +integer.set_parse_action(convert_to_int) +# or can be written as one line as +# integer = Word(nums).set_parse_action(lambda t: int(t[0])) + +# define a pattern for a year/month/day date +date_expr = integer("year") + "/" + integer("month") + "/" + integer("day") +date_expr.ignore(pp.python_style_comment) + + +def convert_to_datetime(s, loc, tokens): + try: + # note that the year, month, and day fields were already + # converted to ints from strings by the parse action defined + # on the integer expression above + return datetime(tokens.year, tokens.month, tokens.day).date() + except Exception as ve: + errmsg = "'%s/%s/%s' is not a valid date, %s" % ( + tokens.year, + tokens.month, + tokens.day, + ve, + ) + raise pp.ParseException(s, loc, errmsg) + + +date_expr.set_parse_action(convert_to_datetime) + + +date_expr.run_tests( + """\ + 2000/1/1 + + # invalid month + 2000/13/1 + + # 1900 was not a leap year + 1900/2/29 + + # but 2000 was + 2000/2/29 + """ +) + + +# if dates conform to ISO8601, use definitions in pyparsing_common +date_expr = ppc.iso8601_date.set_parse_action(ppc.convert_to_date()) +date_expr.ignore(pp.python_style_comment) + +date_expr.run_tests( + """\ + 2000-01-01 + + # invalid month + 2000-13-01 + + # 1900 was not a leap year + 1900-02-29 + + # but 2000 was + 2000-02-29 + """ +) diff --git a/examples/decaf_parser.py b/examples/decaf_parser.py index e6b1abb4..d0a376df 100644 --- a/examples/decaf_parser.py +++ b/examples/decaf_parser.py @@ -12,46 +12,75 @@ """ Program ::= Decl+ Decl ::= VariableDecl | FunctionDecl | ClassDecl | InterfaceDecl - VariableDecl ::= Variable ; - Variable ::= Type ident - Type ::= int | double | bool | string | ident | Type [] - FunctionDecl ::= Type ident ( Formals ) StmtBlock | void ident ( Formals ) StmtBlock - Formals ::= Variable+, | e - ClassDecl ::= class ident { Field* } - Field ::= VariableDecl | FunctionDecl - InterfaceDecl ::= interface ident { Prototype* } - Prototype ::= Type ident ( Formals ) ; | void ident ( Formals ) ; - StmtBlock ::= { VariableDecl* Stmt* } - Stmt ::= ; | IfStmt | WhileStmt | ForStmt | BreakStmt | ReturnStmt | PrintStmt | StmtBlock - IfStmt ::= if ( Expr ) Stmt - WhileStmt ::= while ( Expr ) Stmt - ForStmt ::= for ( ; Expr ; ) Stmt - ReturnStmt ::= return ; - BreakStmt ::= break ; - PrintStmt ::= Print ( Expr+, ) ; + VariableDecl ::= Variable ; + Variable ::= Type ident + Type ::= int | double | bool | string | ident | Type [] + FunctionDecl ::= Type ident ( Formals ) StmtBlock | void ident ( Formals ) StmtBlock + Formals ::= Variable+, | e + ClassDecl ::= class ident { Field* } + Field ::= VariableDecl | FunctionDecl + InterfaceDecl ::= interface ident { Prototype* } + Prototype ::= Type ident ( Formals ) ; | void ident ( Formals ) ; + StmtBlock ::= { VariableDecl* Stmt* } + Stmt ::= ; | IfStmt | WhileStmt | ForStmt | BreakStmt | ReturnStmt | PrintStmt | StmtBlock + IfStmt ::= if ( Expr ) Stmt + WhileStmt ::= while ( Expr ) Stmt + ForStmt ::= for ( ; Expr ; ) Stmt + ReturnStmt ::= return ; + BreakStmt ::= break ; + PrintStmt ::= Print ( Expr+, ) ; Expr ::= LValue = Expr | Constant | LValue | this | Call 
- | ( Expr ) - | Expr + Expr | Expr - Expr | Expr * Expr | Expr / Expr | Expr % Expr | - Expr - | Expr < Expr | Expr <= Expr | Expr > Expr | Expr >= Expr | Expr == Expr | Expr != Expr - | Expr && Expr | Expr || Expr | ! Expr - | ReadInteger ( ) | ReadLine ( ) | new ident | NewArray ( Expr , Typev) - LValue ::= ident | Expr . ident | Expr [ Expr ] - Call ::= ident ( Actuals ) | Expr . ident ( Actuals ) - Actuals ::= Expr+, | e + | ( Expr ) + | Expr + Expr | Expr - Expr | Expr * Expr | Expr / Expr | Expr % Expr | - Expr + | Expr < Expr | Expr <= Expr | Expr > Expr | Expr >= Expr | Expr == Expr | Expr != Expr + | Expr && Expr | Expr || Expr | ! Expr + | ReadInteger ( ) | ReadLine ( ) | new ident | NewArray ( Expr , Typev) + LValue ::= ident | Expr . ident | Expr [ Expr ] + Call ::= ident ( Actuals ) | Expr . ident ( Actuals ) + Actuals ::= Expr+, | e Constant ::= intConstant | doubleConstant | boolConstant | stringConstant | null """ import pyparsing as pp from pyparsing import pyparsing_common as ppc + pp.ParserElement.enablePackrat() # keywords -keywords = (VOID, INT, DOUBLE, BOOL, STRING, CLASS, INTERFACE, NULL, THIS, EXTENDS, IMPLEMENTS, FOR, WHILE, - IF, ELSE, RETURN, BREAK, NEW, NEWARRAY, PRINT, READINTEGER, READLINE, TRUE, FALSE) = map(pp.Keyword, - """void int double bool string class interface null this extends implements or while - if else return break new NewArray Print ReadInteger ReadLine true false""".split()) +keywords = ( + VOID, + INT, + DOUBLE, + BOOL, + STRING, + CLASS, + INTERFACE, + NULL, + THIS, + EXTENDS, + IMPLEMENTS, + FOR, + WHILE, + IF, + ELSE, + RETURN, + BREAK, + NEW, + NEWARRAY, + PRINT, + READINTEGER, + READLINE, + TRUE, + FALSE, +) = map( + pp.Keyword, + """void int double bool string class interface null this extends implements or while + if else return break new NewArray Print ReadInteger ReadLine true false""".split(), +) keywords = pp.MatchFirst(list(keywords)) -LPAR, RPAR, LBRACE, RBRACE, LBRACK, RBRACK, DOT, EQ, COMMA, SEMI = map(pp.Suppress, "(){}[].=,;") +LPAR, RPAR, LBRACE, RBRACE, LBRACK, RBRACK, DOT, EQ, COMMA, SEMI = map( + pp.Suppress, "(){}[].=,;" +) hexConstant = pp.Regex(r"0[xX][0-9a-fA-F]+").addParseAction(lambda t: int(t[0][2:], 16)) intConstant = hexConstant | ppc.integer doubleConstant = ppc.real @@ -59,7 +88,7 @@ stringConstant = pp.dblQuotedString null = NULL constant = doubleConstant | boolConstant | intConstant | stringConstant | null -ident = ~keywords + pp.Word(pp.alphas, pp.alphanums+'_') +ident = ~keywords + pp.Word(pp.alphas, pp.alphanums + "_") type_ = pp.Group((INT | DOUBLE | BOOL | STRING | ident) + pp.ZeroOrMore("[]")) variable = type_ + ident @@ -68,86 +97,174 @@ expr = pp.Forward() expr_parens = pp.Group(LPAR + expr + RPAR) actuals = pp.Optional(pp.delimitedList(expr)) -call = pp.Group(ident("call_ident") + LPAR + actuals("call_args") + RPAR - | (expr_parens + pp.ZeroOrMore(DOT + ident))("call_ident_expr") + LPAR + actuals("call_args") + RPAR) -lvalue = ((ident | expr_parens) - + pp.ZeroOrMore(DOT + (ident | expr_parens)) - + pp.ZeroOrMore(LBRACK + expr + RBRACK)) +call = pp.Group( + ident("call_ident") + LPAR + actuals("call_args") + RPAR + | (expr_parens + pp.ZeroOrMore(DOT + ident))("call_ident_expr") + + LPAR + + actuals("call_args") + + RPAR +) +lvalue = ( + (ident | expr_parens) + + pp.ZeroOrMore(DOT + (ident | expr_parens)) + + pp.ZeroOrMore(LBRACK + expr + RBRACK) +) assignment = pp.Group(lvalue("lhs") + EQ + expr("rhs")) read_integer = pp.Group(READINTEGER + LPAR + RPAR) read_line = pp.Group(READLINE + LPAR + RPAR) 
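# --- editorial aside (not part of the patch) --------------------------------
# The arith_expr/comparison_expr definitions reformatted just below are built
# with pp.infixNotation, which expands an operator table into the usual
# precedence-climbing grammar. A minimal sketch, assuming pyparsing 3.x:
import pyparsing as pp

operand = pp.common.integer
arith = pp.infix_notation(
    operand,
    [
        ("-", 1, pp.opAssoc.RIGHT),          # unary minus binds tightest
        (pp.one_of("* /"), 2, pp.opAssoc.LEFT),
        (pp.one_of("+ -"), 2, pp.opAssoc.LEFT),
    ],
)
print(arith.parse_string("1 + 2 * -3"))  # -> [[1, '+', [2, '*', ['-', 3]]]]
# -----------------------------------------------------------------------------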
new_statement = pp.Group(NEW + ident) new_array = pp.Group(NEWARRAY + LPAR + expr + COMMA + type_ + RPAR) rvalue = constant | call | read_integer | read_line | new_statement | new_array | ident -arith_expr = pp.infixNotation(rvalue, +arith_expr = pp.infixNotation( + rvalue, [ - ('-', 1, pp.opAssoc.RIGHT,), - (pp.oneOf("* / %"), 2, pp.opAssoc.LEFT,), - (pp.oneOf("+ -"), 2, pp.opAssoc.LEFT,), - ]) -comparison_expr = pp.infixNotation(arith_expr, + ( + "-", + 1, + pp.opAssoc.RIGHT, + ), + ( + pp.oneOf("* / %"), + 2, + pp.opAssoc.LEFT, + ), + ( + pp.oneOf("+ -"), + 2, + pp.opAssoc.LEFT, + ), + ], +) +comparison_expr = pp.infixNotation( + arith_expr, [ - ('!', 1, pp.opAssoc.RIGHT,), - (pp.oneOf("< > <= >="), 2, pp.opAssoc.LEFT,), - (pp.oneOf("== !="), 2, pp.opAssoc.LEFT,), - (pp.oneOf("&&"), 2, pp.opAssoc.LEFT,), - (pp.oneOf("||"), 2, pp.opAssoc.LEFT,), - ]) -expr <<= (assignment - | call - | THIS - | comparison_expr - | arith_expr - | lvalue - | constant - | read_integer - | read_line - | new_statement - | new_array - ) + ( + "!", + 1, + pp.opAssoc.RIGHT, + ), + ( + pp.oneOf("< > <= >="), + 2, + pp.opAssoc.LEFT, + ), + ( + pp.oneOf("== !="), + 2, + pp.opAssoc.LEFT, + ), + ( + pp.oneOf("&&"), + 2, + pp.opAssoc.LEFT, + ), + ( + pp.oneOf("||"), + 2, + pp.opAssoc.LEFT, + ), + ], +) +expr <<= ( + assignment + | call + | THIS + | comparison_expr + | arith_expr + | lvalue + | constant + | read_integer + | read_line + | new_statement + | new_array +) stmt = pp.Forward() -print_stmt = pp.Group(PRINT("statement") + LPAR + pp.Group(pp.Optional(pp.delimitedList(expr)))("args") + RPAR + SEMI) +print_stmt = pp.Group( + PRINT("statement") + + LPAR + + pp.Group(pp.Optional(pp.delimitedList(expr)))("args") + + RPAR + + SEMI +) break_stmt = pp.Group(BREAK("statement") + SEMI) return_stmt = pp.Group(RETURN("statement") + expr + SEMI) -for_stmt = pp.Group(FOR("statement") + LPAR + pp.Optional(expr) + SEMI + expr + SEMI + pp.Optional(expr) + RPAR + stmt) +for_stmt = pp.Group( + FOR("statement") + + LPAR + + pp.Optional(expr) + + SEMI + + expr + + SEMI + + pp.Optional(expr) + + RPAR + + stmt +) while_stmt = pp.Group(WHILE("statement") + LPAR + expr + RPAR + stmt) -if_stmt = pp.Group(IF("statement") - + LPAR + pp.Group(expr)("condition") + RPAR - + pp.Group(stmt)("then_statement") - + pp.Group(pp.Optional(ELSE + stmt))("else_statement")) -stmt_block = pp.Group(LBRACE + pp.ZeroOrMore(variable_decl) + pp.ZeroOrMore(stmt) + RBRACE) -stmt <<= (if_stmt - | while_stmt - | for_stmt - | break_stmt - | return_stmt - | print_stmt - | stmt_block - | pp.Group(expr + SEMI) - ) +if_stmt = pp.Group( + IF("statement") + + LPAR + + pp.Group(expr)("condition") + + RPAR + + pp.Group(stmt)("then_statement") + + pp.Group(pp.Optional(ELSE + stmt))("else_statement") +) +stmt_block = pp.Group( + LBRACE + pp.ZeroOrMore(variable_decl) + pp.ZeroOrMore(stmt) + RBRACE +) +stmt <<= ( + if_stmt + | while_stmt + | for_stmt + | break_stmt + | return_stmt + | print_stmt + | stmt_block + | pp.Group(expr + SEMI) +) formals = pp.Optional(pp.delimitedList(variable)) -prototype = pp.Group((type_ | VOID)("return_type") - + ident("function_name") - + LPAR + formals("args") + RPAR + SEMI)("prototype") -function_decl = pp.Group((type_ | VOID)("return_type") + ident("function_name") - + LPAR + formals("args") + RPAR - + stmt_block("body"))("function_decl") +prototype = pp.Group( + (type_ | VOID)("return_type") + + ident("function_name") + + LPAR + + formals("args") + + RPAR + + SEMI +)("prototype") +function_decl = pp.Group( + (type_ | VOID)("return_type") + + 
ident("function_name") + + LPAR + + formals("args") + + RPAR + + stmt_block("body") +)("function_decl") -interface_decl = pp.Group(INTERFACE + ident("interface_name") - + LBRACE + pp.ZeroOrMore(prototype)("prototypes") + RBRACE)("interface") +interface_decl = pp.Group( + INTERFACE + + ident("interface_name") + + LBRACE + + pp.ZeroOrMore(prototype)("prototypes") + + RBRACE +)("interface") field = variable_decl | function_decl -class_decl = pp.Group(CLASS + ident("class_name") - + pp.Optional(EXTENDS + ident)("extends") - + pp.Optional(IMPLEMENTS + pp.delimitedList(ident))("implements") - + LBRACE + pp.ZeroOrMore(field)("fields") + RBRACE)("class_decl") +class_decl = pp.Group( + CLASS + + ident("class_name") + + pp.Optional(EXTENDS + ident)("extends") + + pp.Optional(IMPLEMENTS + pp.delimitedList(ident))("implements") + + LBRACE + + pp.ZeroOrMore(field)("fields") + + RBRACE +)("class_decl") decl = variable_decl | function_decl | class_decl | interface_decl | prototype program = pp.OneOrMore(pp.Group(decl)) decaf_parser = program -stmt.runTests("""\ +stmt.runTests( + """\ sin(30); a = 1; b = 1 + 1; @@ -158,7 +275,8 @@ a[100] = b; a[0][0] = 2; a = 0x1234; -""") +""" +) test_program = """ void getenv(string var); diff --git a/examples/delta_time.py b/examples/delta_time.py index e0790947..cdd58f48 100644 --- a/examples/delta_time.py +++ b/examples/delta_time.py @@ -36,125 +36,173 @@ __all__ = ["time_expression"] + # basic grammar definitions def make_integer_word_expr(int_name, int_value): - return pp.CaselessKeyword(int_name).addParseAction(pp.replaceWith(int_value)) -integer_word = pp.MatchFirst(make_integer_word_expr(int_str, int_value) - for int_value, int_str - in enumerate("one two three four five six seven eight nine ten" - " eleven twelve thirteen fourteen fifteen sixteen" - " seventeen eighteen nineteen twenty".split(), start=1)) + return pp.CaselessKeyword(int_name).add_parse_action(pp.replaceWith(int_value)) + + +integer_word = pp.MatchFirst( + make_integer_word_expr(int_str, int_value) + for int_value, int_str in enumerate( + "one two three four five six seven eight nine ten" + " eleven twelve thirteen fourteen fifteen sixteen" + " seventeen eighteen nineteen twenty".split(), + start=1, + ) +).set_name("integer_word") + integer = pp.pyparsing_common.integer | integer_word +integer.set_name("numeric") CK = pp.CaselessKeyword CL = pp.CaselessLiteral -today, tomorrow, yesterday, noon, midnight, now = map(CK, "today tomorrow yesterday noon midnight now".split()) +today, tomorrow, yesterday, noon, midnight, now = CK.using_each( + "today tomorrow yesterday noon midnight now".split() +) + + def plural(s): - return CK(s) | CK(s + 's').addParseAction(pp.replaceWith(s)) + return CK(s) | CK(s + "s").add_parse_action(pp.replaceWith(s)) + + week, day, hour, minute, second = map(plural, "week day hour minute second".split()) +time_units = hour | minute | second +any_time_units = (week | day | time_units).set_name("any_time_units") + am = CL("am") pm = CL("pm") -COLON = pp.Suppress(':') - -in_ = CK("in").setParseAction(pp.replaceWith(1)) -from_ = CK("from").setParseAction(pp.replaceWith(1)) -before = CK("before").setParseAction(pp.replaceWith(-1)) -after = CK("after").setParseAction(pp.replaceWith(1)) -ago = CK("ago").setParseAction(pp.replaceWith(-1)) -next_ = CK("next").setParseAction(pp.replaceWith(1)) -last_ = CK("last").setParseAction(pp.replaceWith(-1)) +COLON = pp.Suppress(":") + +in_ = CK("in").set_parse_action(pp.replaceWith(1)) +from_ = CK("from").set_parse_action(pp.replaceWith(1)) 
+before = CK("before").set_parse_action(pp.replaceWith(-1)) +after = CK("after").set_parse_action(pp.replaceWith(1)) +ago = CK("ago").set_parse_action(pp.replaceWith(-1)) +next_ = CK("next").set_parse_action(pp.replaceWith(1)) +last_ = CK("last").set_parse_action(pp.replaceWith(-1)) at_ = CK("at") on_ = CK("on") -couple = (pp.Optional(CK("a")) + CK("couple") + pp.Optional(CK("of"))).setParseAction(pp.replaceWith(2)) -a_qty = (CK("a") | CK("an")).setParseAction(pp.replaceWith(1)) -the_qty = CK("the").setParseAction(pp.replaceWith(1)) -qty = pp.ungroup(integer | couple | a_qty | the_qty) -time_ref_present = pp.Empty().addParseAction(pp.replaceWith(True))('time_ref_present') +couple = ( + (pp.Opt(CK("a")) + CK("couple") + pp.Opt(CK("of"))) + .set_parse_action(pp.replaceWith(2)) + .set_name("couple") +) + +a_qty = (CK("a") | CK("an")).set_parse_action(pp.replaceWith(1)) +the_qty = CK("the").set_parse_action(pp.replaceWith(1)) +qty = pp.ungroup( + (integer | couple | a_qty | the_qty).set_name("qty_expression") +).set_name("qty") +time_ref_present = pp.Empty().add_parse_action(pp.replace_with(True))( + "time_ref_present" +) + def fill_24hr_time_fields(t): - t['HH'] = t[0] - t['MM'] = t[1] - t['SS'] = 0 - t['ampm'] = ('am','pm')[t.HH >= 12] + t["HH"] = t[0] + t["MM"] = t[1] + t["SS"] = 0 + t["ampm"] = ("am", "pm")[t.HH >= 12] + def fill_default_time_fields(t): - for fld in 'HH MM SS'.split(): + for fld in "HH MM SS".split(): if fld not in t: t[fld] = 0 -weekday_name_list = list(calendar.day_name) -weekday_name = pp.oneOf(weekday_name_list) -_24hour_time = pp.Word(pp.nums, exact=4).addParseAction(lambda t: [int(t[0][:2]),int(t[0][2:])], - fill_24hr_time_fields) -_24hour_time.setName("0000 time") +# get weekday names from the calendar module +weekday_name_list = list(calendar.day_name) +weekday_name = pp.one_of(weekday_name_list).set_name("weekday_name") + +# expressions for military 2400 time +_24hour_time = ~(pp.Word(pp.nums) + any_time_units).set_name("numbered_time_units") + pp.Word( + pp.nums, exact=4, as_keyword=True +).set_name("HHMM").add_parse_action( + lambda t: [int(t[0][:2]), int(t[0][2:])], fill_24hr_time_fields +) +_24hour_time.set_name("0000 time") ampm = am | pm -timespec = (integer("HH") - + pp.Optional(CK("o'clock") - | - COLON + integer("MM") - + pp.Optional(COLON + integer("SS")) - ) - + (am | pm)("ampm") - ).addParseAction(fill_default_time_fields) +timespec = ( + integer("HH") + + pp.Opt(CK("o'clock") | COLON + integer("MM") + pp.Opt(COLON + integer("SS"))) + + (am | pm)("ampm") +).add_parse_action(fill_default_time_fields) absolute_time = _24hour_time | timespec +absolute_time.set_name("absolute time") absolute_time_of_day = noon | midnight | now | absolute_time +absolute_time_of_day.set_name("time of day") + def add_computed_time(t): - if t[0] in 'now noon midnight'.split(): - t['computed_time'] = {'now': datetime.now().time().replace(microsecond=0), - 'noon': time(hour=12), - 'midnight': time()}[t[0]] + if t[0] in "now noon midnight".split(): + t["computed_time"] = { + "now": datetime.now().time().replace(microsecond=0), + "noon": time(hour=12), + "midnight": time(), + }[t[0]] else: - t['HH'] = {'am': int(t['HH']) % 12, - 'pm': int(t['HH']) % 12 + 12}[t.ampm] - t['computed_time'] = time(hour=t.HH, minute=t.MM, second=t.SS) + t["HH"] = {"am": int(t["HH"]) % 12, "pm": int(t["HH"]) % 12 + 12}[t.ampm] + t["computed_time"] = time(hour=t.HH, minute=t.MM, second=t.SS) -absolute_time_of_day.addParseAction(add_computed_time) +absolute_time_of_day.add_parse_action(add_computed_time) 
-# relative_time_reference ::= qty time_units ('from' | 'before' | 'after') absolute_time_of_day -# | qty time_units 'ago' + +# relative_time_reference ::= qty time_units ('ago' | ('from' | 'before' | 'after') absolute_time_of_day) # | 'in' qty time_units -time_units = hour | minute | second -relative_time_reference = (qty('qty') + time_units('units') + ago('dir') - | qty('qty') + time_units('units') - + (from_ | before | after)('dir') - + pp.Group(absolute_time_of_day)('ref_time') - | in_('dir') + qty('qty') + time_units('units') - ) +time_units = (hour | minute | second).set_name("time unit") +relative_time_reference = ( + ( + qty("qty") + + time_units("units") + + ( + ago("dir") + | (from_ | before | after)("dir") + + pp.Group(absolute_time_of_day)("ref_time") + ) + ) + | in_("dir") + qty("qty") + time_units("units") +).set_name("relative time") + def compute_relative_time(t): - if 'ref_time' not in t: - t['ref_time'] = datetime.now().time().replace(microsecond=0) + if "ref_time" not in t: + t["ref_time"] = datetime.now().time().replace(microsecond=0) else: - t['ref_time'] = t.ref_time.computed_time - delta_seconds = {'hour': 3600, - 'minute': 60, - 'second': 1}[t.units] * t.qty - t['time_delta'] = timedelta(seconds=t.dir * delta_seconds) + t["ref_time"] = t.ref_time.computed_time + delta_seconds = {"hour": 3600, "minute": 60, "second": 1}[t.units] * t.qty + t["time_delta"] = timedelta(seconds=t.dir * delta_seconds) + -relative_time_reference.addParseAction(compute_relative_time) +relative_time_reference.add_parse_action(compute_relative_time) time_reference = absolute_time_of_day | relative_time_reference +time_reference.set_name("time reference") + + def add_default_time_ref_fields(t): - if 'time_delta' not in t: - t['time_delta'] = timedelta() -time_reference.addParseAction(add_default_time_ref_fields) + if "time_delta" not in t: + t["time_delta"] = timedelta() + + +time_reference.add_parse_action(add_default_time_ref_fields) # absolute_day_reference ::= 'today' | 'tomorrow' | 'yesterday' | ('next' | 'last') weekday_name # day_units ::= 'days' | 'weeks' day_units = day | week -weekday_reference = pp.Optional(next_ | last_, 1)('dir') + weekday_name('day_name') +weekday_reference = pp.Opt(next_ | last_, 1)("dir") + weekday_name("day_name") + def convert_abs_day_reference_to_date(t): now = datetime.now().replace(microsecond=0) # handle day reference by weekday name - if 'day_name' in t: + if "day_name" in t: todaynum = now.weekday() daynames = [n.lower() for n in weekday_name_list] nameddaynum = daynames.index(t.day_name.lower()) @@ -168,91 +216,120 @@ def convert_abs_day_reference_to_date(t): else: name = t[0] t["abs_date"] = { - "now" : now, - "today" : datetime(now.year, now.month, now.day), - "yesterday" : datetime(now.year, now.month, now.day) + timedelta(days=-1), - "tomorrow" : datetime(now.year, now.month, now.day) + timedelta(days=+1), - }[name] + "now": now, + "today": datetime(now.year, now.month, now.day), + "yesterday": datetime(now.year, now.month, now.day) + timedelta(days=-1), + "tomorrow": datetime(now.year, now.month, now.day) + timedelta(days=+1), + }[name] -absolute_day_reference = today | tomorrow | yesterday | now + time_ref_present | weekday_reference -absolute_day_reference.addParseAction(convert_abs_day_reference_to_date) +absolute_day_reference = ( + today | tomorrow | yesterday | now + time_ref_present | weekday_reference +) +absolute_day_reference.add_parse_action(convert_abs_day_reference_to_date) +absolute_day_reference.set_name("absolute day") # 
relative_day_reference ::= 'in' qty day_units -# | qty day_units 'ago' -# | 'qty day_units ('from' | 'before' | 'after') absolute_day_reference -relative_day_reference = (in_('dir') + qty('qty') + day_units('units') - | qty('qty') + day_units('units') + ago('dir') - | qty('qty') + day_units('units') + (from_ | before | after)('dir') - + absolute_day_reference('ref_day') - ) +# | qty day_units +# ('ago' +# | ('from' | 'before' | 'after') absolute_day_reference) +relative_day_reference = in_("dir") + qty("qty") + day_units("units") | qty( + "qty" +) + day_units("units") + ( + ago("dir") | ((from_ | before | after)("dir") + absolute_day_reference("ref_day")) +) +relative_day_reference.set_name("relative day") + def compute_relative_date(t): now = datetime.now().replace(microsecond=0) - if 'ref_day' in t: - t['computed_date'] = t.ref_day + if "ref_day" in t: + t["computed_date"] = t.ref_day else: - t['computed_date'] = now.date() - day_diff = t.dir * t.qty * {'week': 7, 'day': 1}[t.units] - t['date_delta'] = timedelta(days=day_diff) -relative_day_reference.addParseAction(compute_relative_date) + t["computed_date"] = now.date() + day_diff = t.dir * t.qty * {"week": 7, "day": 1}[t.units] + t["date_delta"] = timedelta(days=day_diff) + + +relative_day_reference.add_parse_action(compute_relative_date) # combine expressions for absolute and relative day references day_reference = relative_day_reference | absolute_day_reference +day_reference.set_name("day reference") + + def add_default_date_fields(t): - if 'date_delta' not in t: - t['date_delta'] = timedelta() -day_reference.addParseAction(add_default_date_fields) + if "date_delta" not in t: + t["date_delta"] = timedelta() + + +day_reference.add_parse_action(add_default_date_fields) # combine date and time expressions into single overall parser -time_and_day = (time_reference + time_ref_present + pp.Optional(pp.Optional(on_) + day_reference) - | day_reference + pp.Optional(at_ + absolute_time_of_day + time_ref_present)) +time_and_day = time_reference + time_ref_present + pp.Opt( + pp.Opt(on_) + day_reference +) | day_reference + pp.Opt(at_ + absolute_time_of_day + time_ref_present) +time_and_day.set_name("time and day") # parse actions for total time_and_day expression def save_original_string(s, l, t): # save original input string and reference time - t['original'] = ' '.join(s.strip().split()) - t['relative_to'] = datetime.now().replace(microsecond=0) + t["original"] = " ".join(s.strip().split()) + t["relative_to"] = datetime.now().replace(microsecond=0) + def compute_timestamp(t): # accumulate values from parsed time and day subexpressions - fill in defaults for omitted parts now = datetime.now().replace(microsecond=0) - if 'computed_time' not in t: - t['computed_time'] = t.ref_time or now.time() - if 'abs_date' not in t: - t['abs_date'] = now + if "computed_time" not in t: + t["computed_time"] = t.ref_time or now.time() + if "abs_date" not in t: + t["abs_date"] = now # roll up all fields and apply any time or day deltas - t['computed_dt'] = ( - t.abs_date.replace(hour=t.computed_time.hour, minute=t.computed_time.minute, second=t.computed_time.second) + t["computed_dt"] = ( + t.abs_date.replace( + hour=t.computed_time.hour, + minute=t.computed_time.minute, + second=t.computed_time.second, + ) + (t.time_delta or timedelta(0)) + (t.date_delta or timedelta(0)) ) # if time just given in terms of day expressions, zero out time fields if not t.time_ref_present: - t['computed_dt'] = t.computed_dt.replace(hour=0, minute=0, second=0) + t["computed_dt"] = 
t.computed_dt.replace(hour=0, minute=0, second=0) # add results name compatible with previous version - t['calculatedTime'] = t.computed_dt + t["calculatedTime"] = t.computed_dt # add time_offset fields - t['time_offset'] = t.computed_dt - t.relative_to + t["time_offset"] = t.computed_dt - t.relative_to + def remove_temp_keys(t): # strip out keys that are just used internally all_keys = list(t.keys()) for k in all_keys: - if k not in ('computed_dt', 'original', 'relative_to', 'time_offset', 'calculatedTime'): + if k not in ( + "computed_dt", + "original", + "relative_to", + "time_offset", + "calculatedTime", + ): del t[k] -time_and_day.addParseAction(save_original_string, compute_timestamp, remove_temp_keys) + +time_and_day.add_parse_action(save_original_string, compute_timestamp, remove_temp_keys) time_expression = time_and_day -if __name__ == "__main__": +# fmt: off +def main(): current_time = datetime.now() # test grammar tests = """\ @@ -302,62 +379,92 @@ def remove_temp_keys(t): 2pm next Sunday next Sunday at 2pm last Sunday at 2pm + 10 seconds ago + 100 seconds ago + 1000 seconds ago + 10000 seconds ago """ - time_of_day = timedelta(hours=current_time.hour, - minutes=current_time.minute, - seconds=current_time.second) + time_of_day = timedelta( + hours=current_time.hour, + minutes=current_time.minute, + seconds=current_time.second, + ) expected = { - 'now' : timedelta(0), - '10 minutes ago': timedelta(minutes=-10), - '10 minutes from now': timedelta(minutes=10), - 'in 10 minutes': timedelta(minutes=10), - 'in a minute': timedelta(minutes=1), - 'in a couple of minutes': timedelta(minutes=2), - '20 seconds ago': timedelta(seconds=-20), - 'in 30 seconds': timedelta(seconds=30), - 'in an hour': timedelta(hours=1), - 'in a couple hours': timedelta(hours=2), - 'a week from now': timedelta(days=7), - '3 days from now': timedelta(days=3), - 'a couple of days from now': timedelta(days=2), - 'an hour ago': timedelta(hours=-1), - 'in a couple days': timedelta(days=2) - time_of_day, - 'a week from today': timedelta(days=7) - time_of_day, - 'three weeks ago': timedelta(days=-21) - time_of_day, - 'a day ago': timedelta(days=-1) - time_of_day, - 'in a couple of days': timedelta(days=2) - time_of_day, - 'a couple of days from today': timedelta(days=2) - time_of_day, - '2 weeks after today': timedelta(days=14) - time_of_day, - 'in 2 weeks': timedelta(days=14) - time_of_day, - 'the day after tomorrow': timedelta(days=2) - time_of_day, - 'tomorrow': timedelta(days=1) - time_of_day, - 'the day before yesterday': timedelta(days=-2) - time_of_day, - 'yesterday': timedelta(days=-1) - time_of_day, - 'today': -time_of_day, - 'midnight': -time_of_day, - 'in a day': timedelta(days=1) - time_of_day, - '3 days ago': timedelta(days=-3) - time_of_day, - 'noon tomorrow': timedelta(days=1) - time_of_day + timedelta(hours=12), - '6am tomorrow': timedelta(days=1) - time_of_day + timedelta(hours=6), - '0800 yesterday': timedelta(days=-1) - time_of_day + timedelta(hours=8), - '1700 tomorrow': timedelta(days=1) - time_of_day + timedelta(hours=17), - '12:15 AM today': -time_of_day + timedelta(minutes=15), - '3pm 2 days from today': timedelta(days=2) - time_of_day + timedelta(hours=15), - 'ten seconds before noon tomorrow': timedelta(days=1) - time_of_day - + timedelta(hours=12) + timedelta(seconds=-10), - '20 seconds before noon': -time_of_day + timedelta(hours=12) + timedelta(seconds=-20), - 'in 3 days at 5pm': timedelta(days=3) - time_of_day + timedelta(hours=17), + "now": timedelta(0), + "10 seconds ago": 
timedelta(seconds=-10), + "100 seconds ago": timedelta(seconds=-100), + "1000 seconds ago": timedelta(seconds=-1000), + "10000 seconds ago": timedelta(seconds=-10000), + "10 minutes ago": timedelta(minutes=-10), + "10 minutes from now": timedelta(minutes=10), + "in 10 minutes": timedelta(minutes=10), + "in a minute": timedelta(minutes=1), + "in a couple of minutes": timedelta(minutes=2), + "20 seconds ago": timedelta(seconds=-20), + "in 30 seconds": timedelta(seconds=30), + "in an hour": timedelta(hours=1), + "in a couple hours": timedelta(hours=2), + "a week from now": timedelta(days=7), + "3 days from now": timedelta(days=3), + "a couple of days from now": timedelta(days=2), + "an hour ago": timedelta(hours=-1), + "in a couple days": timedelta(days=2) - time_of_day, + "a week from today": timedelta(days=7) - time_of_day, + "three weeks ago": timedelta(days=-21) - time_of_day, + "a day ago": timedelta(days=-1) - time_of_day, + "in a couple of days": timedelta(days=2) - time_of_day, + "a couple of days from today": timedelta(days=2) - time_of_day, + "2 weeks after today": timedelta(days=14) - time_of_day, + "in 2 weeks": timedelta(days=14) - time_of_day, + "the day after tomorrow": timedelta(days=2) - time_of_day, + "tomorrow": timedelta(days=1) - time_of_day, + "the day before yesterday": timedelta(days=-2) - time_of_day, + "8am the day after tomorrow": timedelta(days=+2) - time_of_day + timedelta(hours=8), + "yesterday": timedelta(days=-1) - time_of_day, + "today": -time_of_day, + "midnight": -time_of_day, + "in a day": timedelta(days=1) - time_of_day, + "3 days ago": timedelta(days=-3) - time_of_day, + "noon tomorrow": timedelta(days=1) - time_of_day + timedelta(hours=12), + "6am tomorrow": timedelta(days=1) - time_of_day + timedelta(hours=6), + "0800 yesterday": timedelta(days=-1) - time_of_day + timedelta(hours=8), + "1700 tomorrow": timedelta(days=1) - time_of_day + timedelta(hours=17), + "12:15 AM today": -time_of_day + timedelta(minutes=15), + "3pm 2 days from today": timedelta(days=2) - time_of_day + timedelta(hours=15), + "ten seconds before noon tomorrow": timedelta(days=1) + - time_of_day + + timedelta(hours=12) + + timedelta(seconds=-10), + "20 seconds before noon": -time_of_day + timedelta(hours=12) + timedelta(seconds=-20), + "in 3 days at 5pm": timedelta(days=3) - time_of_day + timedelta(hours=17), } + # fmt: on def verify_offset(instring, parsed): time_epsilon = timedelta(seconds=1) if instring in expected: # allow up to a second time discrepancy due to test processing time if (parsed.time_offset - expected[instring]) <= time_epsilon: - parsed['verify_offset'] = 'PASS' + parsed["verify_offset"] = "PASS" else: - parsed['verify_offset'] = 'FAIL' + parsed["verify_offset"] = "FAIL" print("(relative to %s)" % datetime.now()) - time_expression.runTests(tests, postParse=verify_offset) + success, report = time_expression.runTests(tests, postParse=verify_offset) + assert success + + fails = [] + for test, rpt in report: + if rpt.get("verify_offset", "PASS") != "PASS": + fails.append((test, rpt)) + + if fails: + print("\nFAILED") + print("\n".join("- " + test for test, rpt in fails)) + + assert not fails + + +if __name__ == "__main__": + main() diff --git a/examples/dfmparse.py b/examples/dfmparse.py index ae74bf0d..5d9b1b14 100644 --- a/examples/dfmparse.py +++ b/examples/dfmparse.py @@ -8,21 +8,36 @@ __author__ = "Daniel 'Dang' Griffith " -from pyparsing import Literal, CaselessLiteral, Word, delimitedList \ - , Optional, Combine, Group, alphas, nums, alphanums, Forward \ - , 
oneOf, OneOrMore, ZeroOrMore, CharsNotIn +from pyparsing import ( + Literal, + CaselessLiteral, + Word, + delimitedList, + Optional, + Combine, + Group, + alphas, + nums, + alphanums, + Forward, + oneOf, + OneOrMore, + ZeroOrMore, + CharsNotIn, +) # This converts DFM character constants into Python string (unicode) values. def to_chr(x): """chr(x) if 0 < x < 128 ; unicode(x) if x > 127.""" - return 0 < x < 128 and chr(x) or eval("u'\\u%d'" % x ) + return 0 < x < 128 and chr(x) or eval("u'\\u%d'" % x) + ################# # BEGIN GRAMMAR ################# -COLON = Literal(":").suppress() +COLON = Literal(":").suppress() CONCAT = Literal("+").suppress() EQUALS = Literal("=").suppress() LANGLE = Literal("<").suppress() @@ -33,66 +48,100 @@ def to_chr(x): RBRACE = Literal("]").suppress() RPAREN = Literal(")").suppress() -CATEGORIES = CaselessLiteral("categories").suppress() -END = CaselessLiteral("end").suppress() -FONT = CaselessLiteral("font").suppress() -HINT = CaselessLiteral("hint").suppress() -ITEM = CaselessLiteral("item").suppress() -OBJECT = CaselessLiteral("object").suppress() +CATEGORIES = CaselessLiteral("categories").suppress() +END = CaselessLiteral("end").suppress() +FONT = CaselessLiteral("font").suppress() +HINT = CaselessLiteral("hint").suppress() +ITEM = CaselessLiteral("item").suppress() +OBJECT = CaselessLiteral("object").suppress() -attribute_value_pair = Forward() # this is recursed in item_list_entry +attribute_value_pair = Forward() # this is recursed in item_list_entry simple_identifier = Word(alphas, alphanums + "_") -identifier = Combine( simple_identifier + ZeroOrMore( Literal(".") + simple_identifier )) +identifier = Combine(simple_identifier + ZeroOrMore(Literal(".") + simple_identifier)) object_name = identifier object_type = identifier # Integer and floating point values are converted to Python longs and floats, respectively. -int_value = Combine(Optional("-") + Word(nums)).setParseAction(lambda s,l,t: [ int(t[0]) ] ) -float_value = Combine(Optional("-") + Optional(Word(nums)) + "." + Word(nums)).setParseAction(lambda s,l,t: [ float(t[0]) ] ) +int_value = Combine(Optional("-") + Word(nums)).setParseAction( + lambda s, l, t: [int(t[0])] +) +float_value = Combine( + Optional("-") + Optional(Word(nums)) + "." + Word(nums) +).setParseAction(lambda s, l, t: [float(t[0])]) number_value = float_value | int_value # Base16 constants are left in string form, including the surrounding braces. -base16_value = Combine(Literal("{") + OneOrMore(Word("0123456789ABCDEFabcdef")) + Literal("}"), adjacent=False) +base16_value = Combine( + Literal("{") + OneOrMore(Word("0123456789ABCDEFabcdef")) + Literal("}"), + adjacent=False, +) # This is the first part of a hack to convert the various delphi partial sglQuotedStrings # into a single sglQuotedString equivalent. The gist of it is to combine # all sglQuotedStrings (with their surrounding quotes removed (suppressed)) # with sequences of #xyz character constants, with "strings" concatenated # with a '+' sign. -unquoted_sglQuotedString = Combine( Literal("'").suppress() + ZeroOrMore( CharsNotIn("'\n\r") ) + Literal("'").suppress() ) +unquoted_sglQuotedString = Combine( + Literal("'").suppress() + ZeroOrMore(CharsNotIn("'\n\r")) + Literal("'").suppress() +) # The parse action on this production converts repetitions of constants into a single string. 
pound_char = Combine( - OneOrMore((Literal("#").suppress()+Word(nums) - ).setParseAction( lambda s, l, t: to_chr(int(t[0]) )))) + OneOrMore( + (Literal("#").suppress() + Word(nums)).setParseAction( + lambda s, l, t: to_chr(int(t[0])) + ) + ) +) # This is the second part of the hack. It combines the various "unquoted" # partial strings into a single one. Then, the parse action puts # a single matched pair of quotes around it. delphi_string = Combine( - OneOrMore(CONCAT | pound_char | unquoted_sglQuotedString) - , adjacent=False - ).setParseAction(lambda s, l, t: "'%s'" % t[0]) + OneOrMore(CONCAT | pound_char | unquoted_sglQuotedString), adjacent=False +).setParseAction(lambda s, l, t: "'%s'" % t[0]) string_value = delphi_string | base16_value -list_value = LBRACE + Optional(Group(delimitedList(identifier | number_value | string_value))) + RBRACE -paren_list_value = LPAREN + ZeroOrMore(identifier | number_value | string_value) + RPAREN +list_value = ( + LBRACE + + Optional(Group(delimitedList(identifier | number_value | string_value))) + + RBRACE +) +paren_list_value = ( + LPAREN + ZeroOrMore(identifier | number_value | string_value) + RPAREN +) item_list_entry = ITEM + ZeroOrMore(attribute_value_pair) + END item_list = LANGLE + ZeroOrMore(item_list_entry) + RANGLE generic_value = identifier -value = item_list | number_value | string_value | list_value | paren_list_value | generic_value +value = ( + item_list + | number_value + | string_value + | list_value + | paren_list_value + | generic_value +) category_attribute = CATEGORIES + PERIOD + oneOf("strings itemsvisibles visibles", True) -event_attribute = oneOf("onactivate onclosequery onclose oncreate ondeactivate onhide onshow", True) +event_attribute = oneOf( + "onactivate onclosequery onclose oncreate ondeactivate onhide onshow", True +) font_attribute = FONT + PERIOD + oneOf("charset color height name style", True) hint_attribute = HINT layout_attribute = oneOf("left top width height", True) generic_attribute = identifier -attribute = (category_attribute | event_attribute | font_attribute | hint_attribute | layout_attribute | generic_attribute) +attribute = ( + category_attribute + | event_attribute + | font_attribute + | hint_attribute + | layout_attribute + | generic_attribute +) category_attribute_value_pair = category_attribute + EQUALS + paren_list_value event_attribute_value_pair = event_attribute + EQUALS + value @@ -101,31 +150,36 @@ def to_chr(x): layout_attribute_value_pair = layout_attribute + EQUALS + value generic_attribute_value_pair = attribute + EQUALS + value attribute_value_pair << Group( - category_attribute_value_pair + category_attribute_value_pair | event_attribute_value_pair | font_attribute_value_pair | hint_attribute_value_pair | layout_attribute_value_pair | generic_attribute_value_pair - ) +) -object_declaration = Group((OBJECT + object_name + COLON + object_type)) +object_declaration = Group(OBJECT + object_name + COLON + object_type) object_attributes = Group(ZeroOrMore(attribute_value_pair)) nested_object = Forward() -object_definition = object_declaration + object_attributes + ZeroOrMore(nested_object) + END +object_definition = ( + object_declaration + object_attributes + ZeroOrMore(nested_object) + END +) nested_object << Group(object_definition) ################# # END GRAMMAR ################# + def printer(s, loc, tok): - print(tok, end=' ') + print(tok, end=" ") return tok + def get_filename_list(tf): import sys, glob + if tf == None: if len(sys.argv) > 1: tf = sys.argv[1:] @@ -138,6 +192,7 @@ def 
get_filename_list(tf): testfiles.extend(glob.glob(arg)) return testfiles + def main(testfiles=None, action=printer): """testfiles can be None, in which case the command line arguments are used as filenames. testfiles can be a string, in which case that file is parsed. @@ -165,8 +220,8 @@ def main(testfiles=None, action=printer): failures.append(f) if failures: - print('\nfailed while processing %s' % ', '.join(failures)) - print('\nsucceeded on %d of %d files' %(success, len(testfiles))) + print("\nfailed while processing %s" % ", ".join(failures)) + print("\nsucceeded on %d of %d files" % (success, len(testfiles))) if len(retval) == 1 and len(testfiles) == 1: # if only one file is parsed, return the parseResults directly @@ -175,5 +230,6 @@ def main(testfiles=None, action=printer): # else, return a dictionary of parseResults return retval + if __name__ == "__main__": main() diff --git a/examples/dhcpd_leases_parser.py b/examples/dhcpd_leases_parser.py index a8850514..e9f64bd6 100644 --- a/examples/dhcpd_leases_parser.py +++ b/examples/dhcpd_leases_parser.py @@ -44,28 +44,32 @@ """ from pyparsing import * -import datetime,time +import datetime, time -LBRACE,RBRACE,SEMI,QUOTE = map(Suppress,'{};"') -ipAddress = Combine(Word(nums) + ('.' + Word(nums))*3) -hexint = Word(hexnums,exact=2) -macAddress = Combine(hexint + (':'+hexint)*5) +LBRACE, RBRACE, SEMI, QUOTE = map(Suppress, '{};"') +ipAddress = Combine(Word(nums) + ("." + Word(nums)) * 3) +hexint = Word(hexnums, exact=2) +macAddress = Combine(hexint + (":" + hexint) * 5) hdwType = Word(alphanums) -yyyymmdd = Combine((Word(nums,exact=4)|Word(nums,exact=2))+ - ('/'+Word(nums,exact=2))*2) -hhmmss = Combine(Word(nums,exact=2)+(':'+Word(nums,exact=2))*2) -dateRef = oneOf(list("0123456"))("weekday") + yyyymmdd("date") + \ - hhmmss("time") +yyyymmdd = Combine( + (Word(nums, exact=4) | Word(nums, exact=2)) + ("/" + Word(nums, exact=2)) * 2 +) +hhmmss = Combine(Word(nums, exact=2) + (":" + Word(nums, exact=2)) * 2) +dateRef = oneOf(list("0123456"))("weekday") + yyyymmdd("date") + hhmmss("time") + def utcToLocalTime(tokens): - utctime = datetime.datetime.strptime("%(date)s %(time)s" % tokens, - "%Y/%m/%d %H:%M:%S") - localtime = utctime-datetime.timedelta(0,time.timezone,0) - tokens["utcdate"],tokens["utctime"] = tokens["date"],tokens["time"] - tokens["localdate"],tokens["localtime"] = str(localtime).split() + utctime = datetime.datetime.strptime( + "%(date)s %(time)s" % tokens, "%Y/%m/%d %H:%M:%S" + ) + localtime = utctime - datetime.timedelta(0, time.timezone, 0) + tokens["utcdate"], tokens["utctime"] = tokens["date"], tokens["time"] + tokens["localdate"], tokens["localtime"] = str(localtime).split() del tokens["date"] del tokens["time"] + + dateRef.setParseAction(utcToLocalTime) startsStmt = "starts" + dateRef + SEMI @@ -76,12 +80,18 @@ def utcToLocalTime(tokens): uidStmt = "uid" + QuotedString('"')("uid") + SEMI bindingStmt = "binding" + Word(alphanums) + Word(alphanums) + SEMI -leaseStatement = startsStmt | endsStmt | tstpStmt | tsfpStmt | hdwStmt | \ - uidStmt | bindingStmt -leaseDef = "lease" + ipAddress("ipaddress") + LBRACE + \ - Dict(ZeroOrMore(Group(leaseStatement))) + RBRACE +leaseStatement = ( + startsStmt | endsStmt | tstpStmt | tsfpStmt | hdwStmt | uidStmt | bindingStmt +) +leaseDef = ( + "lease" + + ipAddress("ipaddress") + + LBRACE + + Dict(ZeroOrMore(Group(leaseStatement))) + + RBRACE +) for lease in leaseDef.searchString(sample): print(lease.dump()) - print(lease.ipaddress,'->',lease.hardware.mac) + print(lease.ipaddress, "->", 
lease.hardware.mac) print() diff --git a/examples/dictExample.py b/examples/dictExample.py index 7d3d45db..ebc437f1 100644 --- a/examples/dictExample.py +++ b/examples/dictExample.py @@ -19,15 +19,19 @@ """ # define grammar for datatable -heading = (pp.Literal( -"+-------+------+------+------+------+------+------+------+------+") + -"| | A1 | B1 | C1 | D1 | A2 | B2 | C2 | D2 |" + -"+=======+======+======+======+======+======+======+======+======+").suppress() +heading = ( + pp.Literal("+-------+------+------+------+------+------+------+------+------+") + + "| | A1 | B1 | C1 | D1 | A2 | B2 | C2 | D2 |" + + "+=======+======+======+======+======+======+======+======+======+" +).suppress() vert = pp.Literal("|").suppress() number = pp.Word(pp.nums) -rowData = pp.Group( vert + pp.Word(pp.alphas) + vert + pp.delimitedList(number,"|") + vert ) +rowData = pp.Group( + vert + pp.Word(pp.alphas) + vert + pp.delimitedList(number, "|") + vert +) trailing = pp.Literal( -"+-------+------+------+------+------+------+------+------+------+").suppress() + "+-------+------+------+------+------+------+------+------+------+" +).suppress() datatable = heading + pp.Dict(pp.ZeroOrMore(rowData)) + trailing @@ -42,7 +46,7 @@ print("data keys=", list(data.keys())) # use dict-style access to values -print("data['min']=", data['min']) +print("data['min']=", data["min"]) # use attribute-style access to values (if key is a valid Python identifier) print("data.max", data.max) diff --git a/examples/dictExample2.py b/examples/dictExample2.py index fa1b866e..16590a36 100644 --- a/examples/dictExample2.py +++ b/examples/dictExample2.py @@ -6,7 +6,17 @@ # # Copyright (c) 2004, Paul McGuire # -from pyparsing import Literal, Word, Group, Dict, ZeroOrMore, alphas, nums, delimitedList, pyparsing_common as ppc +from pyparsing import ( + Literal, + Word, + Group, + Dict, + ZeroOrMore, + alphas, + nums, + delimitedList, + pyparsing_common as ppc, +) testData = """ +-------+------+------+------+------+------+------+------+------+ @@ -25,34 +35,34 @@ vert = Literal("|").suppress() -rowDelim = ("+" + ZeroOrMore( underline + "+" ) ).suppress() +rowDelim = ("+" + ZeroOrMore(underline + "+")).suppress() columnHeader = Group(vert + vert + delimitedList(Word(alphas + nums), "|") + vert) heading = rowDelim + columnHeader("columns") + rowDelim -rowData = Group( vert + Word(alphas) + vert + delimitedList(number,"|") + vert ) +rowData = Group(vert + Word(alphas) + vert + delimitedList(number, "|") + vert) trailing = rowDelim -datatable = heading + Dict( ZeroOrMore(rowData) ) + trailing +datatable = heading + Dict(ZeroOrMore(rowData)) + trailing # now parse data and print results data = datatable.parseString(testData) print(data.dump()) print("data keys=", list(data.keys())) -print("data['min']=", data['min']) -print("sum(data['min']) =", sum(data['min'])) +print("data['min']=", data["min"]) +print("sum(data['min']) =", sum(data["min"])) print("data.max =", data.max) print("sum(data.max) =", sum(data.max)) # now print transpose of data table, using column labels read from table header and # values from data lists print() -print(" " * 5, end=' ') -for i in range(1,len(data)): - print("|%5s" % data[i][0], end=' ') +print(" " * 5, end=" ") +for i in range(1, len(data)): + print("|%5s" % data[i][0], end=" ") print() -print(("-" * 6) + ("+------" * (len(data)-1))) +print(("-" * 6) + ("+------" * (len(data) - 1))) for i in range(len(data.columns)): - print("%5s" % data.columns[i], end=' ') + print("%5s" % data.columns[i], end=" ") for j in 
range(len(data) - 1): - print('|%5s' % data[j + 1][i + 1], end=' ') + print("|%5s" % data[j + 1][i + 1], end=" ") print() diff --git a/examples/ebnf.py b/examples/ebnf.py index bb191559..4843d40c 100644 --- a/examples/ebnf.py +++ b/examples/ebnf.py @@ -11,7 +11,7 @@ from pyparsing import * -all_names = ''' +all_names = """ integer meta_identifier terminal_string @@ -25,29 +25,36 @@ definitions_list syntax_rule syntax -'''.split() +""".split() integer = Word(nums) -meta_identifier = Word(alphas, alphanums + '_') -terminal_string = Suppress("'") + CharsNotIn("'") + Suppress("'") ^ \ - Suppress('"') + CharsNotIn('"') + Suppress('"') +meta_identifier = Word(alphas, alphanums + "_") +terminal_string = Suppress("'") + CharsNotIn("'") + Suppress("'") ^ Suppress( + '"' +) + CharsNotIn('"') + Suppress('"') definitions_list = Forward() -optional_sequence = Suppress('[') + definitions_list + Suppress(']') -repeated_sequence = Suppress('{') + definitions_list + Suppress('}') -grouped_sequence = Suppress('(') + definitions_list + Suppress(')') -syntactic_primary = optional_sequence ^ repeated_sequence ^ \ - grouped_sequence ^ meta_identifier ^ terminal_string -syntactic_factor = Optional(integer + Suppress('*')) + syntactic_primary -syntactic_term = syntactic_factor + Optional(Suppress('-') + syntactic_factor) -single_definition = delimitedList(syntactic_term, ',') -definitions_list << delimitedList(single_definition, '|') -syntax_rule = meta_identifier + Suppress('=') + definitions_list + \ - Suppress(';') - -ebnfComment = ( "(*" + - ZeroOrMore( CharsNotIn("*") | ( "*" + ~Literal(")") ) ) + - "*)" ).streamline().setName("ebnfComment") +optional_sequence = Suppress("[") + definitions_list + Suppress("]") +repeated_sequence = Suppress("{") + definitions_list + Suppress("}") +grouped_sequence = Suppress("(") + definitions_list + Suppress(")") +syntactic_primary = ( + optional_sequence + ^ repeated_sequence + ^ grouped_sequence + ^ meta_identifier + ^ terminal_string +) +syntactic_factor = Optional(integer + Suppress("*")) + syntactic_primary +syntactic_term = syntactic_factor + Optional(Suppress("-") + syntactic_factor) +single_definition = delimitedList(syntactic_term, ",") +definitions_list << delimitedList(single_definition, "|") +syntax_rule = meta_identifier + Suppress("=") + definitions_list + Suppress(";") + +ebnfComment = ( + ("(*" + ZeroOrMore(CharsNotIn("*") | ("*" + ~Literal(")"))) + "*)") + .streamline() + .setName("ebnfComment") +) syntax = OneOrMore(syntax_rule) syntax.ignore(ebnfComment) @@ -56,6 +63,7 @@ def do_integer(str, loc, toks): return int(toks[0]) + def do_meta_identifier(str, loc, toks): if toks[0] in symbol_table: return symbol_table[toks[0]] @@ -64,28 +72,35 @@ def do_meta_identifier(str, loc, toks): symbol_table[toks[0]] = Forward() return symbol_table[toks[0]] + def do_terminal_string(str, loc, toks): return Literal(toks[0]) + def do_optional_sequence(str, loc, toks): return Optional(toks[0]) + def do_repeated_sequence(str, loc, toks): return ZeroOrMore(toks[0]) + def do_grouped_sequence(str, loc, toks): return Group(toks[0]) + def do_syntactic_primary(str, loc, toks): return toks[0] + def do_syntactic_factor(str, loc, toks): if len(toks) == 2: # integer * syntactic_primary return And([toks[1]] * toks[0]) else: # syntactic_primary - return [ toks[0] ] + return [toks[0]] + def do_syntactic_term(str, loc, toks): if len(toks) == 2: @@ -93,7 +108,8 @@ def do_syntactic_term(str, loc, toks): return NotAny(toks[1]) + toks[0] else: # syntactic_factor - return [ toks[0] ] + return 
[toks[0]] + def do_single_definition(str, loc, toks): toks = toks.asList() @@ -102,7 +118,8 @@ def do_single_definition(str, loc, toks): return And(toks) else: # syntactic_term - return [ toks[0] ] + return [toks[0]] + def do_definitions_list(str, loc, toks): toks = toks.asList() @@ -111,31 +128,36 @@ def do_definitions_list(str, loc, toks): return Or(toks) else: # single_definition - return [ toks[0] ] + return [toks[0]] + def do_syntax_rule(str, loc, toks): # meta_identifier = definitions_list ; assert toks[0].expr is None, "Duplicate definition" forward_count.value -= 1 toks[0] << toks[1] - return [ toks[0] ] + return [toks[0]] + def do_syntax(str, loc, toks): # syntax_rule syntax_rule ... return symbol_table - symbol_table = {} + + class forward_count: pass + + forward_count.value = 0 for name in all_names: expr = vars()[name] - action = vars()['do_' + name] + action = vars()["do_" + name] expr.setName(name) expr.setParseAction(action) - #~ expr.setDebug() + # ~ expr.setDebug() def parse(ebnf, given_table={}): @@ -147,5 +169,5 @@ def parse(ebnf, given_table={}): for name in table: expr = table[name] expr.setName(name) - #~ expr.setDebug() + # ~ expr.setDebug() return table diff --git a/examples/ebnftest.py b/examples/ebnftest.py index 40772ee0..7b1ff759 100644 --- a/examples/ebnftest.py +++ b/examples/ebnftest.py @@ -5,14 +5,14 @@ # # Submitted 2004 by Seo Sanghyeon # -print('Importing pyparsing...') +print("Importing pyparsing...") from pyparsing import * -print('Constructing EBNF parser with pyparsing...') +print("Constructing EBNF parser with pyparsing...") import ebnf -grammar = ''' +grammar = """ syntax = (syntax_rule), {(syntax_rule)}; syntax_rule = meta_identifier, '=', definitions_list, ';'; definitions_list = single_definition, {'|', single_definition}; @@ -30,43 +30,46 @@ meta_identifier = letter, {letter | digit}; integer = digit, {digit}; *) -''' +""" table = {} -#~ table['character'] = Word(printables, exact=1) -#~ table['letter'] = Word(alphas + '_', exact=1) -#~ table['digit'] = Word(nums, exact=1) -table['terminal_string'] = sglQuotedString -table['meta_identifier'] = Word(alphas+"_", alphas+"_"+nums) -table['integer'] = Word(nums) +# ~ table['character'] = Word(printables, exact=1) +# ~ table['letter'] = Word(alphas + '_', exact=1) +# ~ table['digit'] = Word(nums, exact=1) +table["terminal_string"] = sglQuotedString +table["meta_identifier"] = Word(alphas + "_", alphas + "_" + nums) +table["integer"] = Word(nums) -print('Parsing EBNF grammar with EBNF parser...') +print("Parsing EBNF grammar with EBNF parser...") parsers = ebnf.parse(grammar, table) -ebnf_parser = parsers['syntax'] +ebnf_parser = parsers["syntax"] commentcharcount = 0 commentlocs = set() -def tallyCommentChars(s,l,t): - global commentcharcount,commentlocs + + +def tallyCommentChars(s, l, t): + global commentcharcount, commentlocs # only count this comment if we haven't seen it before if l not in commentlocs: - charCount = ( len(t[0]) - len(list(filter(str.isspace, t[0]))) ) + charCount = len(t[0]) - len(list(filter(str.isspace, t[0]))) commentcharcount += charCount commentlocs.add(l) - return l,t + return l, t + -#ordinarily, these lines wouldn't be necessary, but we are doing extra stuff with the comment expression -ebnf.ebnfComment.setParseAction( tallyCommentChars ) -ebnf_parser.ignore( ebnf.ebnfComment ) +# ordinarily, these lines wouldn't be necessary, but we are doing extra stuff with the comment expression +ebnf.ebnfComment.setParseAction(tallyCommentChars) +ebnf_parser.ignore(ebnf.ebnfComment) 
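# [editor's aside -- illustrative sketch, not part of this diff] The two lines
# above combine ignore() with a parse action: the ignored expression never
# contributes tokens to the output, but its parse action still fires on every
# match, which is exactly why tallyCommentChars can count comment characters.
# A minimal standalone version, assuming pyparsing 3.x; all names below are
# invented for illustration:
import pyparsing as pp

comment = pp.Literal("#") + pp.rest_of_line
seen = {}

def note_comment(s, l, t):
    # ignored expressions may be re-tried at the same position while the
    # parser backtracks, so key by location -- the same trick the
    # tallyCommentChars action above uses with its commentlocs set
    seen.setdefault(l, t[1].strip())

comment.add_parse_action(note_comment)

words = pp.OneOrMore(pp.Word(pp.alphas))
words.ignore(comment)

print(words.parse_string("alpha # one\nbeta # two"))  # -> ['alpha', 'beta']
print(sorted(seen.values()))                          # -> ['one', 'two']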
-print('Parsing EBNF grammar with generated EBNF parser...\n') +print("Parsing EBNF grammar with generated EBNF parser...\n") parsed_chars = ebnf_parser.parseString(grammar) parsed_char_len = len(parsed_chars) -print("],\n".join(str( parsed_chars.asList() ).split("],"))) +print("],\n".join(str(parsed_chars.asList()).split("],"))) -#~ grammar_length = len(grammar) - len(filter(str.isspace, grammar))-commentcharcount +# ~ grammar_length = len(grammar) - len(filter(str.isspace, grammar))-commentcharcount -#~ assert parsed_char_len == grammar_length +# ~ assert parsed_char_len == grammar_length -print('Ok!') +print("Ok!") diff --git a/examples/eval_arith.py b/examples/eval_arith.py index 0896c010..3a19ae04 100644 --- a/examples/eval_arith.py +++ b/examples/eval_arith.py @@ -8,28 +8,46 @@ # Added support for exponentiation, using right-to-left evaluation of # operands # -from pyparsing import Word, nums, alphas, Combine, oneOf, \ - opAssoc, infixNotation, Literal +from pyparsing import ( + Word, + nums, + alphas, + Combine, + one_of, + OpAssoc, + infix_notation, + Literal, + ParserElement, +) -class EvalConstant(object): +ParserElement.enablePackrat() + + +class EvalConstant: "Class to evaluate a parsed constant or variable" vars_ = {} + def __init__(self, tokens): self.value = tokens[0] + def eval(self): if self.value in EvalConstant.vars_: return EvalConstant.vars_[self.value] else: return float(self.value) -class EvalSignOp(object): + +class EvalSignOp: "Class to evaluate expressions with a leading + or - sign" + def __init__(self, tokens): self.sign, self.value = tokens[0] + def eval(self): - mult = {'+':1, '-':-1}[self.sign] + mult = {"+": 1, "-": -1}[self.sign] return mult * self.value.eval() + def operatorOperands(tokenlist): "generator to extract operators and operands in pairs" it = iter(tokenlist) @@ -39,67 +57,79 @@ def operatorOperands(tokenlist): except StopIteration: break -class EvalPowerOp(object): + +class EvalPowerOp: "Class to evaluate multiplication and division expressions" + def __init__(self, tokens): self.value = tokens[0] + def eval(self): res = self.value[-1].eval() for val in self.value[-3::-2]: - res = val.eval()**res + res = val.eval() ** res return res -class EvalMultOp(object): + +class EvalMultOp: "Class to evaluate multiplication and division expressions" + def __init__(self, tokens): self.value = tokens[0] + def eval(self): prod = self.value[0].eval() - for op,val in operatorOperands(self.value[1:]): - if op == '*': + for op, val in operatorOperands(self.value[1:]): + if op == "*": prod *= val.eval() - if op == '/': + if op == "/": prod /= val.eval() return prod -class EvalAddOp(object): + +class EvalAddOp: "Class to evaluate addition and subtraction expressions" + def __init__(self, tokens): self.value = tokens[0] + def eval(self): sum = self.value[0].eval() - for op,val in operatorOperands(self.value[1:]): - if op == '+': + for op, val in operatorOperands(self.value[1:]): + if op == "+": sum += val.eval() - if op == '-': + if op == "-": sum -= val.eval() return sum -class EvalComparisonOp(object): + +class EvalComparisonOp: "Class to evaluate comparison expressions" opMap = { - "<" : lambda a,b : a < b, - "<=" : lambda a,b : a <= b, - ">" : lambda a,b : a > b, - ">=" : lambda a,b : a >= b, - "!=" : lambda a,b : a != b, - "=" : lambda a,b : a == b, - "LT" : lambda a,b : a < b, - "LE" : lambda a,b : a <= b, - "GT" : lambda a,b : a > b, - "GE" : lambda a,b : a >= b, - "NE" : lambda a,b : a != b, - "EQ" : lambda a,b : a == b, - "<>" : lambda a,b : a != b, - } + 
"<": lambda a, b: a < b, + "<=": lambda a, b: a <= b, + ">": lambda a, b: a > b, + ">=": lambda a, b: a >= b, + "!=": lambda a, b: a != b, + "=": lambda a, b: a == b, + "LT": lambda a, b: a < b, + "LE": lambda a, b: a <= b, + "GT": lambda a, b: a > b, + "GE": lambda a, b: a >= b, + "NE": lambda a, b: a != b, + "EQ": lambda a, b: a == b, + "<>": lambda a, b: a != b, + } + def __init__(self, tokens): self.value = tokens[0] + def eval(self): val1 = self.value[0].eval() - for op,val in operatorOperands(self.value[1:]): + for op, val in operatorOperands(self.value[1:]): fn = EvalComparisonOp.opMap[op] val2 = val.eval() - if not fn(val1,val2): + if not fn(val1, val2): break val1 = val2 else: @@ -110,120 +140,129 @@ def eval(self): # define the parser integer = Word(nums) real = Combine(Word(nums) + "." + Word(nums)) -variable = Word(alphas,exact=1) +variable = Word(alphas, exact=1) operand = real | integer | variable -signop = oneOf('+ -') -multop = oneOf('* /') -plusop = oneOf('+ -') -expop = Literal('**') +signop = one_of("+ -") +multop = one_of("* /") +plusop = one_of("+ -") +expop = Literal("**") # use parse actions to attach EvalXXX constructors to sub-expressions operand.setParseAction(EvalConstant) -arith_expr = infixNotation(operand, +arith_expr = infix_notation( + operand, [ - (signop, 1, opAssoc.RIGHT, EvalSignOp), - (expop, 2, opAssoc.LEFT, EvalPowerOp), - (multop, 2, opAssoc.LEFT, EvalMultOp), - (plusop, 2, opAssoc.LEFT, EvalAddOp), - ]) - -comparisonop = oneOf("< <= > >= != = <> LT GT LE GE EQ NE") -comp_expr = infixNotation(arith_expr, + (signop, 1, OpAssoc.RIGHT, EvalSignOp), + (expop, 2, OpAssoc.LEFT, EvalPowerOp), + (multop, 2, OpAssoc.LEFT, EvalMultOp), + (plusop, 2, OpAssoc.LEFT, EvalAddOp), + ], +) + +comparisonop = one_of("< <= > >= != = <> LT GT LE GE EQ NE") +comp_expr = infix_notation( + arith_expr, [ - (comparisonop, 2, opAssoc.LEFT, EvalComparisonOp), - ]) - -def main(): - # sample expressions posted on comp.lang.python, asking for advice - # in safely evaluating them - rules=[ - '( A - B ) = 0', - '(A + B + C + D + E + F + G + H + I) = J', - '(A + B + C + D + E + F + G + H) = I', - '(A + B + C + D + E + F) = G', - '(A + B + C + D + E) = (F + G + H + I + J)', - '(A + B + C + D + E) = (F + G + H + I)', - '(A + B + C + D + E) = F', - '(A + B + C + D) = (E + F + G + H)', - '(A + B + C) = (D + E + F)', - '(A + B) = (C + D + E + F)', - '(A + B) = (C + D)', - '(A + B) = (C - D + E - F - G + H + I + J)', - '(A + B) = C', - '(A + B) = 0', - '(A+B+C+D+E) = (F+G+H+I+J)', - '(A+B+C+D) = (E+F+G+H)', - '(A+B+C+D)=(E+F+G+H)', - '(A+B+C)=(D+E+F)', - '(A+B)=(C+D)', - '(A+B)=C', - '(A-B)=C', - '(A/(B+C))', - '(B/(C+D))', - '(G + H) = I', - '-0.99 LE ((A+B+C)-(D+E+F+G)) LE 0.99', - '-0.99 LE (A-(B+C)) LE 0.99', - '-1000.00 LE A LE 0.00', - '-5000.00 LE A LE 0.00', - 'A < B', - 'A < 7000', - 'A = -(B)', - 'A = C', - 'A = 0', - 'A GT 0', - 'A GT 0.00', - 'A GT 7.00', - 'A LE B', - 'A LT -1000.00', - 'A LT -5000', - 'A LT 0', - 'A=(B+C+D)', - 'A=B', - 'I = (G + H)', - '0.00 LE A LE 4.00', - '4.00 LT A LE 7.00', - '0.00 LE A LE 4.00 LE E > D', - '2**2**(A+3)', - ] - vars_={'A': 0, 'B': 1.1, 'C': 2.2, 'D': 3.3, 'E': 4.4, 'F': 5.5, 'G': - 6.6, 'H':7.7, 'I':8.8, 'J':9.9} - - # define tests from given rules - tests = [] - for t in rules: - t_orig = t - t = t.replace("=","==") - t = t.replace("EQ","==") - t = t.replace("LE","<=") - t = t.replace("GT",">") - t = t.replace("LT","<") - t = t.replace("GE",">=") - t = t.replace("LE","<=") - t = t.replace("NE","!=") - t = t.replace("<>","!=") - 
tests.append( (t_orig,eval(t,vars_)) ) - - # copy vars_ to EvalConstant lookup dict - EvalConstant.vars_ = vars_ - failed = 0 - for test,expected in tests: - ret = comp_expr.parseString(test)[0] - parsedvalue = ret.eval() - print(test, expected, parsedvalue) - if parsedvalue != expected: - print("<<< FAIL") - failed += 1 - else: - print('') + (comparisonop, 2, OpAssoc.LEFT, EvalComparisonOp), + ], +) + + +# sample expressions posted on comp.lang.python, asking for advice +# in safely evaluating them +rules = [ + "( A - B ) = 0", + "( B - C + B ) = 0", + "(A + B + C + D + E + F + G + H + I) = J", + "(A + B + C + D + E + F + G + H) = I", + "(A + B + C + D + E + F) = G", + "(A + B + C + D + E) = (F + G + H + I + J)", + "(A + B + C + D + E) = (F + G + H + I)", + "(A + B + C + D + E) = F", + "(A + B + C + D) = (E + F + G + H)", + "(A + B + C) = D", + "(A + B + C) = (D + E + F)", + "(A + B) = (C + D + E + F)", + "(A + B) = (C + D)", + "(A + B) = (C - D + E - F - G + H + I + J)", + "(A + B) = C", + "(A + B) = 0", + "(A+B+C+D+E) = (F+G+H+I+J)", + "(A+B+C+D) = (E+F+G+H)", + "(A+B+C+D)=(E+F+G+H)", + "(A+B+C)=(D+E+F)", + "(A+B)=(C+D)", + "(A+B)=C", + "(A-B)=C", + "(A/(B+C))", + "(B/(C+D))", + "(G + H) = I", + "-0.99 LE ((A+B+C)-(D+E+F+G)) LE 0.99", + "-0.99 LE (A-(B+C)) LE 0.99", + "-1000.00 LE A LE 0.00", + "-5000.00 LE A LE 0.00", + "A < B", + "A < 7000", + "A = -(B)", + "A = C", + "A = 0", + "A GT 0", + "A GT 0.00", + "A GT 7.00", + "A LE B", + "A LT -1000.00", + "A LT -5000", + "A LT 0", + "G=(B+C+D)", + "A=B", + "I = (G + H)", + "0.00 LE A LE 4.00", + "4.00 LT A LE 7.00", + "0.00 LE A LE 4.00 LE E > D", + "2**2**(A+3)", +] +vars_ = { + "A": 0, + "B": 1.1, + "C": 2.2, + "D": 3.3, + "E": 4.4, + "F": 5.5, + "G": 6.6, + "H": 7.7, + "I": 8.8, + "J": 9.9, +} + +# define tests from given rules +tests = [] +for t in rules: + t_orig = t + t = t.replace("=", "==") + t = t.replace("EQ", "==") + t = t.replace("LE", "<=") + t = t.replace("GT", ">") + t = t.replace("LT", "<") + t = t.replace("GE", ">=") + t = t.replace("LE", "<=") + t = t.replace("NE", "!=") + t = t.replace("<>", "!=") + tests.append((t_orig, eval(t, vars_))) - print('') - if failed: - print(failed, "tests FAILED") - return 1 +# copy vars_ to EvalConstant lookup dict +EvalConstant.vars_ = vars_ +failed = 0 +for test, expected in tests: + ret = comp_expr.parseString(test)[0] + parsedvalue = ret.eval() + print(test, expected, parsedvalue) + if abs(parsedvalue - expected) > 1e-6: + print("<<< FAIL") + failed += 1 else: - print("all tests PASSED") - return 0 + print("") -if __name__=='__main__': - exit(main()) +print("") +if failed: + raise Exception("could not parse") diff --git a/examples/excelExpr.py b/examples/excelExpr.py deleted file mode 100644 index 86237ef6..00000000 --- a/examples/excelExpr.py +++ /dev/null @@ -1,70 +0,0 @@ -# excelExpr.py -# -# Copyright 2010, Paul McGuire -# -# A partial implementation of a parser of Excel formula expressions. 
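# [editor's aside -- illustrative sketch, not part of this diff] The core
# pattern in eval_arith.py above is to pass a class as the parse action for
# each infix_notation precedence level; pyparsing constructs the class from
# the grouped tokens, so parsing yields a tree of objects that can then be
# evaluated with .eval(). A stripped-down single-level version, assuming
# pyparsing 3.x; the Num and Add names are invented for illustration:
import pyparsing as pp

class Num:
    def __init__(self, tokens):
        self.value = float(tokens[0])

    def eval(self):
        return self.value

class Add:
    def __init__(self, tokens):
        # tokens[0] is the grouped [operand, '+', operand, '+', ...] list
        self.operands = tokens[0][0::2]

    def eval(self):
        return sum(operand.eval() for operand in self.operands)

operand = pp.common.number.copy().add_parse_action(Num)
arith = pp.infix_notation(operand, [(pp.Literal("+"), 2, pp.OpAssoc.LEFT, Add)])

print(arith.parse_string("1 + 2 + 3")[0].eval())  # -> 6.0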
-# -from pyparsing import (CaselessKeyword, Suppress, Word, alphas, - alphanums, nums, Optional, Group, oneOf, Forward, - infixNotation, opAssoc, dblQuotedString, delimitedList, - Combine, Literal, QuotedString, ParserElement, pyparsing_common as ppc) -ParserElement.enablePackrat() - -EQ,LPAR,RPAR,COLON,COMMA = map(Suppress, '=():,') -EXCL, DOLLAR = map(Literal,"!$") -sheetRef = Word(alphas, alphanums) | QuotedString("'",escQuote="''") -colRef = Optional(DOLLAR) + Word(alphas,max=2) -rowRef = Optional(DOLLAR) + Word(nums) -cellRef = Combine(Group(Optional(sheetRef + EXCL)("sheet") + colRef("col") + - rowRef("row"))) - -cellRange = (Group(cellRef("start") + COLON + cellRef("end"))("range") - | cellRef | Word(alphas,alphanums)) - -expr = Forward() - -COMPARISON_OP = oneOf("< = > >= <= != <>") -condExpr = expr + COMPARISON_OP + expr - -ifFunc = (CaselessKeyword("if") - - LPAR - + Group(condExpr)("condition") - + COMMA + Group(expr)("if_true") - + COMMA + Group(expr)("if_false") - + RPAR) - -statFunc = lambda name : Group(CaselessKeyword(name) + Group(LPAR + delimitedList(expr) + RPAR)) -sumFunc = statFunc("sum") -minFunc = statFunc("min") -maxFunc = statFunc("max") -aveFunc = statFunc("ave") -funcCall = ifFunc | sumFunc | minFunc | maxFunc | aveFunc - -multOp = oneOf("* /") -addOp = oneOf("+ -") -numericLiteral = ppc.number -operand = numericLiteral | funcCall | cellRange | cellRef -arithExpr = infixNotation(operand, - [ - (multOp, 2, opAssoc.LEFT), - (addOp, 2, opAssoc.LEFT), - ]) - -textOperand = dblQuotedString | cellRef -textExpr = infixNotation(textOperand, - [ - ('&', 2, opAssoc.LEFT), - ]) - -expr << (arithExpr | textExpr) - - -(EQ + expr).runTests("""\ - =3*A7+5 - =3*Sheet1!$A$7+5 - =3*'Sheet 1'!$A$7+5" - =3*'O''Reilly''s sheet'!$A$7+5 - =if(Sum(A1:A25)>42,Min(B1:B25),if(Sum(C1:C25)>3.14, (Min(C1:C25)+3)*18,Max(B1:B25))) - =sum(a1:a25,10,min(b1,c2,d3)) - =if("T"&a2="TTime", "Ready", "Not ready") -""") diff --git a/examples/excel_expr.py b/examples/excel_expr.py new file mode 100644 index 00000000..0877e543 --- /dev/null +++ b/examples/excel_expr.py @@ -0,0 +1,93 @@ +# excelExpr.py +# +# Copyright 2010, Paul McGuire +# +# A partial implementation of a parser of Excel formula expressions. 
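# [editor's aside -- illustrative sketch, not part of this diff] The rewrite
# that follows replaces map(Suppress, "...") with the newer using_each()
# classmethod; both spellings build one parser element per character of the
# string. A tiny demo, assuming pyparsing 3.1 or later (where using_each was
# introduced):
import pyparsing as pp

EQ, LPAR, RPAR = pp.Suppress.using_each("=()")  # same idea as map(pp.Suppress, "=()")
call = pp.common.identifier + LPAR + pp.common.number + RPAR
print(call.parse_string("round(3.25)"))         # -> ['round', 3.25]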
+# +import pyparsing as pp +ppc = pp.common + +pp.ParserElement.enable_packrat() + +EQ, LPAR, RPAR, COLON, COMMA = pp.Suppress.using_each("=():,") +EXCL, DOLLAR = pp.Literal.using_each("!$") +sheet_ref = pp.Word(pp.alphas, pp.alphanums) | pp.QuotedString("'", escQuote="''") +col_ref = pp.Opt(DOLLAR) + pp.Word(pp.alphas, max=2) +row_ref = pp.Opt(DOLLAR) + pp.Word(pp.nums) +cell_ref = pp.Combine( + pp.Group(pp.Opt(sheet_ref + EXCL)("sheet") + col_ref("col") + row_ref("row")) +) + +cell_range = ( + pp.Group(cell_ref("start") + COLON + cell_ref("end"))("range") + | cell_ref + | pp.Word(pp.alphas, pp.alphanums) +) + +expr = pp.Forward() + +COMPARISON_OP = pp.one_of("< = > >= <= != <>") +cond_expr = expr + COMPARISON_OP + expr + +if_func = ( + pp.CaselessKeyword("if") + - LPAR + + pp.Group(cond_expr)("condition") + + COMMA + + pp.Group(expr)("if_true") + + COMMA + + pp.Group(expr)("if_false") + + RPAR +) + + +def stat_function(name): + return pp.Group(pp.CaselessKeyword(name) + pp.Group(LPAR + pp.DelimitedList(expr) + RPAR)) + + +sum_func = stat_function("sum") +min_func = stat_function("min") +max_func = stat_function("max") +ave_func = stat_function("ave") +func_call = if_func | sum_func | min_func | max_func | ave_func + +mult_op = pp.one_of("* /") +add_op = pp.one_of("+ -") +numeric_literal = ppc.number +operand = numeric_literal | func_call | cell_range | cell_ref +arith_expr = pp.infix_notation( + operand, + [ + (mult_op, 2, pp.OpAssoc.LEFT), + (add_op, 2, pp.OpAssoc.LEFT), + ], +) + +text_operand = pp.dbl_quoted_string | cell_ref +text_expr = pp.infix_notation( + text_operand, + [ + ("&", 2, pp.OpAssoc.LEFT), + ], +) + +expr <<= arith_expr | text_expr + + +def main(): + success, report = (EQ + expr).run_tests( + """\ + =3*A7+5 + =3*Sheet1!$A$7+5 + =3*'Sheet 1'!$A$7+5 + =3*'O''Reilly''s sheet'!$A$7+5 + =if(Sum(A1:A25)>42,Min(B1:B25),if(Sum(C1:C25)>3.14, (Min(C1:C25)+3)*18,Max(B1:B25))) + =sum(a1:a25,10,min(b1,c2,d3)) + =if("T"&a2="TTime", "Ready", "Not ready") + """ + ) + assert success + + +if __name__ == '__main__': + main() diff --git a/examples/fourFn.py b/examples/fourFn.py index e1393b65..e448fbb8 100644 --- a/examples/fourFn.py +++ b/examples/fourFn.py @@ -1,192 +1,276 @@ -# fourFn.py -# -# Demonstration of the pyparsing module, implementing a simple 4-function expression parser, -# with support for scientific notation, and symbols for e and pi. -# Extended to add exponentiation and simple built-in functions. -# Extended test cases, simplified pushFirst method. -# Removed unnecessary expr.suppress() call (thanks Nathaniel Peterson!), and added Group -# Changed fnumber to use a Regex, which is now the preferred method -# -# Copyright 2003-2009 by Paul McGuire -# -from pyparsing import Literal,Word,Group,\ - ZeroOrMore,Forward,alphas,alphanums,Regex,ParseException,\ - CaselessKeyword, Suppress -import math -import operator - -exprStack = [] - -def pushFirst( strg, loc, toks ): - exprStack.append( toks[0] ) -def pushUMinus( strg, loc, toks ): - for t in toks: - if t == '-': - exprStack.append( 'unary -' ) - #~ exprStack.append( '-1' ) - #~ exprStack.append( '*' ) - else: - break - -bnf = None -def BNF(): - """ - expop :: '^' - multop :: '*' | '/' - addop :: '+' | '-' - integer :: ['+' | '-'] '0'..'9'+ - atom :: PI | E | real | fn '(' expr ')' | '(' expr ')' - factor :: atom [ expop factor ]* - term :: factor [ multop factor ]* - expr :: term [ addop term ]* - """ - global bnf - if not bnf: - point = Literal( "." 
) - # use CaselessKeyword for e and pi, to avoid accidentally matching - # functions that start with 'e' or 'pi' (such as 'exp'); Keyword - # and CaselessKeyword only match whole words - e = CaselessKeyword( "E" ) - pi = CaselessKeyword( "PI" ) - #~ fnumber = Combine( Word( "+-"+nums, nums ) + - #~ Optional( point + Optional( Word( nums ) ) ) + - #~ Optional( e + Word( "+-"+nums, nums ) ) ) - fnumber = Regex(r"[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?") - ident = Word(alphas, alphanums+"_$") - - plus, minus, mult, div = map(Literal, "+-*/") - lpar, rpar = map(Suppress, "()") - addop = plus | minus - multop = mult | div - expop = Literal( "^" ) - - expr = Forward() - atom = ((0,None)*minus + ( pi | e | fnumber | ident + lpar + expr + rpar | ident ).setParseAction( pushFirst ) | - Group( lpar + expr + rpar )).setParseAction(pushUMinus) - - # by defining exponentiation as "atom [ ^ factor ]..." instead of "atom [ ^ atom ]...", we get right-to-left exponents, instead of left-to-righ - # that is, 2^3^2 = 2^(3^2), not (2^3)^2. - factor = Forward() - factor << atom + ZeroOrMore( ( expop + factor ).setParseAction( pushFirst ) ) - - term = factor + ZeroOrMore( ( multop + factor ).setParseAction( pushFirst ) ) - expr << term + ZeroOrMore( ( addop + term ).setParseAction( pushFirst ) ) - bnf = expr - return bnf - -# map operator symbols to corresponding arithmetic operations -epsilon = 1e-12 -opn = { "+" : operator.add, - "-" : operator.sub, - "*" : operator.mul, - "/" : operator.truediv, - "^" : operator.pow } -fn = { "sin" : math.sin, - "cos" : math.cos, - "tan" : math.tan, - "exp" : math.exp, - "abs" : abs, - "trunc" : lambda a: int(a), - "round" : round, - "sgn" : lambda a: (a > epsilon) - (a < -epsilon) } -def evaluateStack( s ): - op = s.pop() - if op == 'unary -': - return -evaluateStack( s ) - if op in "+-*/^": - op2 = evaluateStack( s ) - op1 = evaluateStack( s ) - return opn[op]( op1, op2 ) - elif op == "PI": - return math.pi # 3.1415926535 - elif op == "E": - return math.e # 2.718281828 - elif op in fn: - return fn[op]( evaluateStack( s ) ) - elif op[0].isalpha(): - raise Exception("invalid identifier '%s'" % op) - else: - return float( op ) - -if __name__ == "__main__": - - def test( s, expVal ): - global exprStack - exprStack[:] = [] - try: - results = BNF().parseString( s, parseAll=True ) - val = evaluateStack( exprStack[:] ) - except ParseException as pe: - print(s, "failed parse:", str(pe)) - except Exception as e: - print(s, "failed eval:", str(e)) - else: - if val == expVal: - print(s, "=", val, results, "=>", exprStack) - else: - print(s+"!!!", val, "!=", expVal, results, "=>", exprStack) - - test( "9", 9 ) - test( "-9", -9 ) - test( "--9", 9 ) - test( "-E", -math.e ) - test( "9 + 3 + 6", 9 + 3 + 6 ) - test( "9 + 3 / 11", 9 + 3.0 / 11 ) - test( "(9 + 3)", (9 + 3) ) - test( "(9+3) / 11", (9+3.0) / 11 ) - test( "9 - 12 - 6", 9 - 12 - 6 ) - test( "9 - (12 - 6)", 9 - (12 - 6) ) - test( "2*3.14159", 2*3.14159 ) - test( "3.1415926535*3.1415926535 / 10", 3.1415926535*3.1415926535 / 10 ) - test( "PI * PI / 10", math.pi * math.pi / 10 ) - test( "PI*PI/10", math.pi*math.pi/10 ) - test( "PI^2", math.pi**2 ) - test( "round(PI^2)", round(math.pi**2) ) - test( "6.02E23 * 8.048", 6.02E23 * 8.048 ) - test( "e / 3", math.e / 3 ) - test( "sin(PI/2)", math.sin(math.pi/2) ) - test( "trunc(E)", int(math.e) ) - test( "trunc(-E)", int(-math.e) ) - test( "round(E)", round(math.e) ) - test( "round(-E)", round(-math.e) ) - test( "E^PI", math.e**math.pi ) - test( "exp(0)", 1 ) - test( "exp(1)", math.e ) - test( 
"2^3^2", 2**3**2 ) - test( "2^3+2", 2**3+2 ) - test( "2^3+5", 2**3+5 ) - test( "2^9", 2**9 ) - test( "sgn(-2)", -1 ) - test( "sgn(0)", 0 ) - test( "foo(0.1)", None ) - test( "sgn(0.1)", 1 ) - - -""" -Test output: ->pythonw -u fourFn.py -9 = 9.0 ['9'] => ['9'] -9 + 3 + 6 = 18.0 ['9', '+', '3', '+', '6'] => ['9', '3', '+', '6', '+'] -9 + 3 / 11 = 9.27272727273 ['9', '+', '3', '/', '11'] => ['9', '3', '11', '/', '+'] -(9 + 3) = 12.0 [] => ['9', '3', '+'] -(9+3) / 11 = 1.09090909091 ['/', '11'] => ['9', '3', '+', '11', '/'] -9 - 12 - 6 = -9.0 ['9', '-', '12', '-', '6'] => ['9', '12', '-', '6', '-'] -9 - (12 - 6) = 3.0 ['9', '-'] => ['9', '12', '6', '-', '-'] -2*3.14159 = 6.28318 ['2', '*', '3.14159'] => ['2', '3.14159', '*'] -3.1415926535*3.1415926535 / 10 = 0.986960440053 ['3.1415926535', '*', '3.1415926535', '/', '10'] => ['3.1415926535', '3.1415926535', '*', '10', '/'] -PI * PI / 10 = 0.986960440109 ['PI', '*', 'PI', '/', '10'] => ['PI', 'PI', '*', '10', '/'] -PI*PI/10 = 0.986960440109 ['PI', '*', 'PI', '/', '10'] => ['PI', 'PI', '*', '10', '/'] -PI^2 = 9.86960440109 ['PI', '^', '2'] => ['PI', '2', '^'] -6.02E23 * 8.048 = 4.844896e+024 ['6.02E23', '*', '8.048'] => ['6.02E23', '8.048', '*'] -e / 3 = 0.90609394282 ['E', '/', '3'] => ['E', '3', '/'] -sin(PI/2) = 1.0 ['sin', 'PI', '/', '2'] => ['PI', '2', '/', 'sin'] -trunc(E) = 2 ['trunc', 'E'] => ['E', 'trunc'] -E^PI = 23.1406926328 ['E', '^', 'PI'] => ['E', 'PI', '^'] -2^3^2 = 512.0 ['2', '^', '3', '^', '2'] => ['2', '3', '2', '^', '^'] -2^3+2 = 10.0 ['2', '^', '3', '+', '2'] => ['2', '3', '^', '2', '+'] -2^9 = 512.0 ['2', '^', '9'] => ['2', '9', '^'] -sgn(-2) = -1 ['sgn', '-2'] => ['-2', 'sgn'] -sgn(0) = 0 ['sgn', '0'] => ['0', 'sgn'] -sgn(0.1) = 1 ['sgn', '0.1'] => ['0.1', 'sgn'] ->Exit code: 0 -""" +# fourFn.py +# +# Demonstration of the pyparsing module, implementing a simple 4-function expression parser, +# with support for scientific notation, and symbols for e and pi. +# Extended to add exponentiation and simple built-in functions. +# Extended test cases, simplified pushFirst method. +# Removed unnecessary expr.suppress() call (thanks Nathaniel Peterson!), and added Group +# Changed fnumber to use a Regex, which is now the preferred method +# Reformatted to latest pypyparsing features, support multiple and variable args to functions +# +# Copyright 2003-2019 by Paul McGuire +# +from pyparsing import ( + Literal, + Word, + Group, + Forward, + alphas, + alphanums, + Regex, + ParseException, + CaselessKeyword, + Suppress, + delimitedList, +) +import math +import operator + +exprStack = [] + + +def push_first(toks): + exprStack.append(toks[0]) + + +def push_unary_minus(toks): + for t in toks: + if t == "-": + exprStack.append("unary -") + else: + break + + +bnf = None + + +def BNF(): + """ + expop :: '^' + multop :: '*' | '/' + addop :: '+' | '-' + integer :: ['+' | '-'] '0'..'9'+ + atom :: PI | E | real | fn '(' expr ')' | '(' expr ')' + factor :: atom [ expop factor ]* + term :: factor [ multop factor ]* + expr :: term [ addop term ]* + """ + global bnf + if not bnf: + # use CaselessKeyword for e and pi, to avoid accidentally matching + # functions that start with 'e' or 'pi' (such as 'exp'); Keyword + # and CaselessKeyword only match whole words + e = CaselessKeyword("E") + pi = CaselessKeyword("PI") + # fnumber = Combine(Word("+-"+nums, nums) + + # Optional("." 
+ Optional(Word(nums))) + + # Optional(e + Word("+-"+nums, nums))) + # or use provided pyparsing_common.number, but convert back to str: + # fnumber = ppc.number().addParseAction(lambda t: str(t[0])) + fnumber = Regex(r"[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?") + ident = Word(alphas, alphanums + "_$") + + plus, minus, mult, div = map(Literal, "+-*/") + lpar, rpar = map(Suppress, "()") + addop = plus | minus + multop = mult | div + expop = Literal("^") + + expr = Forward() + expr_list = delimitedList(Group(expr)) + # add parse action that replaces the function identifier with a (name, number of args) tuple + def insert_fn_argcount_tuple(t): + fn = t.pop(0) + num_args = len(t[0]) + t.insert(0, (fn, num_args)) + + fn_call = (ident + lpar - Group(expr_list) + rpar).setParseAction( + insert_fn_argcount_tuple + ) + atom = ( + addop[...] + + ( + (fn_call | pi | e | fnumber | ident).setParseAction(push_first) + | Group(lpar + expr + rpar) + ) + ).setParseAction(push_unary_minus) + + # by defining exponentiation as "atom [ ^ factor ]..." instead of "atom [ ^ atom ]...", we get right-to-left + # exponents, instead of left-to-right; that is, 2^3^2 = 2^(3^2), not (2^3)^2. + factor = Forward() + factor <<= atom + (expop + factor).setParseAction(push_first)[...] + term = factor + (multop + factor).setParseAction(push_first)[...] + expr <<= term + (addop + term).setParseAction(push_first)[...] + bnf = expr + return bnf + + +# map operator symbols to corresponding arithmetic operations +epsilon = 1e-12 +opn = { + "+": operator.add, + "-": operator.sub, + "*": operator.mul, + "/": operator.truediv, + "^": operator.pow, +} + +fn = { + "sin": math.sin, + "cos": math.cos, + "tan": math.tan, + "exp": math.exp, + "abs": abs, + "trunc": int, + "round": round, + "sgn": lambda a: -1 if a < -epsilon else 1 if a > epsilon else 0, + # functions with multiple arguments + "multiply": lambda a, b: a * b, + "hypot": math.hypot, + # functions with a variable number of arguments + "all": lambda *a: all(a), +} + + +def evaluate_stack(s): + op, num_args = s.pop(), 0 + if isinstance(op, tuple): + op, num_args = op + if op == "unary -": + return -evaluate_stack(s) + if op in "+-*/^": + # note: operands are pushed onto the stack in reverse order + op2 = evaluate_stack(s) + op1 = evaluate_stack(s) + return opn[op](op1, op2) + elif op == "PI": + return math.pi # 3.1415926535 + elif op == "E": + return math.e # 2.718281828 + elif op in fn: + # note: args are pushed onto the stack in reverse order + args = reversed([evaluate_stack(s) for _ in range(num_args)]) + return fn[op](*args) + elif op[0].isalpha(): + raise Exception("invalid identifier '%s'" % op) + else: + # try to evaluate as int first, then as float if int fails + try: + return int(op) + except ValueError: + return float(op) + + +if __name__ == "__main__": + + def test(s, expected): + exprStack[:] = [] + try: + results = BNF().parseString(s, parseAll=True) + val = evaluate_stack(exprStack[:]) + except ParseException as pe: + print(s, "failed parse:", str(pe)) + except Exception as e: + print(s, "failed eval:", str(e), exprStack) + else: + if val == expected: + print(s, "=", val, results, "=>", exprStack) + else: + print(s + "!!!", val, "!=", expected, results, "=>", exprStack) + + test("9", 9) + test("-9", -9) + test("--9", 9) + test("-E", -math.e) + test("9 + 3 + 6", 9 + 3 + 6) + test("9 + 3 / 11", 9 + 3.0 / 11) + test("(9 + 3)", (9 + 3)) + test("(9+3) / 11", (9 + 3.0) / 11) + test("9 - 12 - 6", 9 - 12 - 6) + test("9 - (12 - 6)", 9 - (12 - 6)) + test("2*3.14159", 2 * 
3.14159) + test("3.1415926535*3.1415926535 / 10", 3.1415926535 * 3.1415926535 / 10) + test("PI * PI / 10", math.pi * math.pi / 10) + test("PI*PI/10", math.pi * math.pi / 10) + test("PI^2", math.pi ** 2) + test("round(PI^2)", round(math.pi ** 2)) + test("6.02E23 * 8.048", 6.02e23 * 8.048) + test("e / 3", math.e / 3) + test("sin(PI/2)", math.sin(math.pi / 2)) + test("10+sin(PI/4)^2", 10 + math.sin(math.pi / 4) ** 2) + test("trunc(E)", int(math.e)) + test("trunc(-E)", int(-math.e)) + test("round(E)", round(math.e)) + test("round(-E)", round(-math.e)) + test("E^PI", math.e ** math.pi) + test("exp(0)", 1) + test("exp(1)", math.e) + test("2^3^2", 2 ** 3 ** 2) + test("(2^3)^2", (2 ** 3) ** 2) + test("2^3+2", 2 ** 3 + 2) + test("2^3+5", 2 ** 3 + 5) + test("2^9", 2 ** 9) + test("sgn(-2)", -1) + test("sgn(0)", 0) + test("sgn(0.1)", 1) + test("foo(0.1)", None) + test("round(E, 3)", round(math.e, 3)) + test("round(PI^2, 3)", round(math.pi ** 2, 3)) + test("sgn(cos(PI/4))", 1) + test("sgn(cos(PI/2))", 0) + test("sgn(cos(PI*3/4))", -1) + test("+(sgn(cos(PI/4)))", 1) + test("-(sgn(cos(PI/4)))", -1) + test("hypot(3, 4)", 5) + test("multiply(3, 7)", 21) + test("all(1,1,1)", True) + test("all(1,1,1,1,1,0)", False) + + +""" +Test output: +>python fourFn.py +9 = 9 ['9'] => ['9'] +-9 = -9 ['-', '9'] => ['9', 'unary -'] +--9 = 9 ['-', '-', '9'] => ['9', 'unary -', 'unary -'] +-E = -2.718281828459045 ['-', 'E'] => ['E', 'unary -'] +9 + 3 + 6 = 18 ['9', '+', '3', '+', '6'] => ['9', '3', '+', '6', '+'] +9 + 3 / 11 = 9.272727272727273 ['9', '+', '3', '/', '11'] => ['9', '3', '11', '/', '+'] +(9 + 3) = 12 [['9', '+', '3']] => ['9', '3', '+'] +(9+3) / 11 = 1.0909090909090908 [['9', '+', '3'], '/', '11'] => ['9', '3', '+', '11', '/'] +9 - 12 - 6 = -9 ['9', '-', '12', '-', '6'] => ['9', '12', '-', '6', '-'] +9 - (12 - 6) = 3 ['9', '-', ['12', '-', '6']] => ['9', '12', '6', '-', '-'] +2*3.14159 = 6.28318 ['2', '*', '3.14159'] => ['2', '3.14159', '*'] +3.1415926535*3.1415926535 / 10 = 0.9869604400525172 ['3.1415926535', '*', '3.1415926535', '/', '10'] => ['3.1415926535', '3.1415926535', '*', '10', '/'] +PI * PI / 10 = 0.9869604401089358 ['PI', '*', 'PI', '/', '10'] => ['PI', 'PI', '*', '10', '/'] +PI*PI/10 = 0.9869604401089358 ['PI', '*', 'PI', '/', '10'] => ['PI', 'PI', '*', '10', '/'] +PI^2 = 9.869604401089358 ['PI', '^', '2'] => ['PI', '2', '^'] +round(PI^2) = 10 [('round', 1), [['PI', '^', '2']]] => ['PI', '2', '^', ('round', 1)] +6.02E23 * 8.048 = 4.844896e+24 ['6.02E23', '*', '8.048'] => ['6.02E23', '8.048', '*'] +e / 3 = 0.9060939428196817 ['E', '/', '3'] => ['E', '3', '/'] +sin(PI/2) = 1.0 [('sin', 1), [['PI', '/', '2']]] => ['PI', '2', '/', ('sin', 1)] +10+sin(PI/4)^2 = 10.5 ['10', '+', ('sin', 1), [['PI', '/', '4']], '^', '2'] => ['10', 'PI', '4', '/', ('sin', 1), '2', '^', '+'] +trunc(E) = 2 [('trunc', 1), [['E']]] => ['E', ('trunc', 1)] +trunc(-E) = -2 [('trunc', 1), [['-', 'E']]] => ['E', 'unary -', ('trunc', 1)] +round(E) = 3 [('round', 1), [['E']]] => ['E', ('round', 1)] +round(-E) = -3 [('round', 1), [['-', 'E']]] => ['E', 'unary -', ('round', 1)] +E^PI = 23.140692632779263 ['E', '^', 'PI'] => ['E', 'PI', '^'] +exp(0) = 1.0 [('exp', 1), [['0']]] => ['0', ('exp', 1)] +exp(1) = 2.718281828459045 [('exp', 1), [['1']]] => ['1', ('exp', 1)] +2^3^2 = 512 ['2', '^', '3', '^', '2'] => ['2', '3', '2', '^', '^'] +(2^3)^2 = 64 [['2', '^', '3'], '^', '2'] => ['2', '3', '^', '2', '^'] +2^3+2 = 10 ['2', '^', '3', '+', '2'] => ['2', '3', '^', '2', '+'] +2^3+5 = 13 ['2', '^', '3', '+', '5'] => ['2', '3', '^', '5', '+'] 
+2^9 = 512 ['2', '^', '9'] => ['2', '9', '^'] +sgn(-2) = -1 [('sgn', 1), [['-', '2']]] => ['2', 'unary -', ('sgn', 1)] +sgn(0) = 0 [('sgn', 1), [['0']]] => ['0', ('sgn', 1)] +sgn(0.1) = 1 [('sgn', 1), [['0.1']]] => ['0.1', ('sgn', 1)] +foo(0.1) failed eval: invalid identifier 'foo' ['0.1', ('foo', 1)] +round(E, 3) = 2.718 [('round', 2), [['E'], ['3']]] => ['E', '3', ('round', 2)] +round(PI^2, 3) = 9.87 [('round', 2), [['PI', '^', '2'], ['3']]] => ['PI', '2', '^', '3', ('round', 2)] +sgn(cos(PI/4)) = 1 [('sgn', 1), [[('cos', 1), [['PI', '/', '4']]]]] => ['PI', '4', '/', ('cos', 1), ('sgn', 1)] +sgn(cos(PI/2)) = 0 [('sgn', 1), [[('cos', 1), [['PI', '/', '2']]]]] => ['PI', '2', '/', ('cos', 1), ('sgn', 1)] +sgn(cos(PI*3/4)) = -1 [('sgn', 1), [[('cos', 1), [['PI', '*', '3', '/', '4']]]]] => ['PI', '3', '*', '4', '/', ('cos', 1), ('sgn', 1)] ++(sgn(cos(PI/4))) = 1 ['+', [('sgn', 1), [[('cos', 1), [['PI', '/', '4']]]]]] => ['PI', '4', '/', ('cos', 1), ('sgn', 1)] +-(sgn(cos(PI/4))) = -1 ['-', [('sgn', 1), [[('cos', 1), [['PI', '/', '4']]]]]] => ['PI', '4', '/', ('cos', 1), ('sgn', 1), 'unary -'] +""" diff --git a/examples/gen_ctypes.py b/examples/gen_ctypes.py index f4a87562..0eb0b7b7 100644 --- a/examples/gen_ctypes.py +++ b/examples/gen_ctypes.py @@ -8,76 +8,94 @@ from pyparsing import * typemap = { - "byte" : "c_byte", - "char" : "c_char", - "char *" : "c_char_p", - "double" : "c_double", - "float" : "c_float", - "int" : "c_int", - "int16" : "c_int16", - "int32" : "c_int32", - "int64" : "c_int64", - "int8" : "c_int8", - "long" : "c_long", - "longlong" : "c_longlong", - "short" : "c_short", - "size_t" : "c_size_t", - "ubyte" : "c_ubyte", - "uchar" : "c_ubyte", - "u_char" : "c_ubyte", - "uint" : "c_uint", - "u_int" : "c_uint", - "uint16" : "c_uint16", - "uint32" : "c_uint32", - "uint64" : "c_uint64", - "uint8" : "c_uint8", - "u_long" : "c_ulong", - "ulong" : "c_ulong", - "ulonglong" : "c_ulonglong", - "ushort" : "c_ushort", - "u_short" : "c_ushort", - "void *" : "c_void_p", - "voidp" : "c_voidp", - "wchar" : "c_wchar", - "wchar *" : "c_wchar_p", - "Bool" : "c_bool", - "void" : "None", - } - -LPAR,RPAR,LBRACE,RBRACE,COMMA,SEMI = map(Suppress,"(){},;") -ident = Word(alphas, alphanums + "_") + "byte": "c_byte", + "char": "c_char", + "char *": "c_char_p", + "double": "c_double", + "float": "c_float", + "int": "c_int", + "int16": "c_int16", + "int32": "c_int32", + "int64": "c_int64", + "int8": "c_int8", + "long": "c_long", + "longlong": "c_longlong", + "short": "c_short", + "size_t": "c_size_t", + "ubyte": "c_ubyte", + "uchar": "c_ubyte", + "u_char": "c_ubyte", + "uint": "c_uint", + "u_int": "c_uint", + "uint16": "c_uint16", + "uint32": "c_uint32", + "uint64": "c_uint64", + "uint8": "c_uint8", + "u_long": "c_ulong", + "ulong": "c_ulong", + "ulonglong": "c_ulonglong", + "ushort": "c_ushort", + "u_short": "c_ushort", + "void *": "c_void_p", + "voidp": "c_voidp", + "wchar": "c_wchar", + "wchar *": "c_wchar_p", + "Bool": "c_bool", + "void": "None", +} + +LPAR, RPAR, LBRACE, RBRACE, COMMA, SEMI = Suppress.using_each("(){},;") +ident = pyparsing_common.identifier integer = Regex(r"[+-]?\d+") hexinteger = Regex(r"0x[0-9a-fA-F]+") const = Suppress("const") -primitiveType = oneOf(t for t in typemap if not t.endswith("*")) +primitiveType = one_of(t for t in typemap if not t.endswith("*")) structType = Suppress("struct") + ident -vartype = (Optional(const) + - (primitiveType | structType | ident) + - Optional(Word("*")("ptr"))) +vartype = ( + Opt(const) + (primitiveType | structType | ident) + 
Opt(Word("*")("ptr")) +) + + def normalizetype(t): if isinstance(t, ParseResults): - return ' '.join(t) - #~ ret = ParseResults([' '.join(t)]) - #~ return ret + return " ".join(t) + # ~ ret = ParseResults([' '.join(t)]) + # ~ return ret + + +vartype.set_parse_action(normalizetype) + +arg = Group(vartype("argtype") + Opt(ident("argname"))) +func_def = ( + vartype("fn_type") + + ident("fn_name") + + LPAR + + Opt(DelimitedList(arg | "..."))("fn_args") + + RPAR + + SEMI +) -vartype.setParseAction(normalizetype) -arg = Group(vartype("argtype") + Optional(ident("argname"))) -func_def = (vartype("fn_type") + ident("fn_name") + - LPAR + Optional(delimitedList(arg|"..."))("fn_args") + RPAR + SEMI) def derivefields(t): if t.fn_args and t.fn_args[-1] == "...": - t["varargs"]=True -func_def.setParseAction(derivefields) + t["varargs"] = True + + +func_def.set_parse_action(derivefields) fn_typedef = "typedef" + func_def var_typedef = "typedef" + primitiveType("primType") + ident("name") + SEMI -enum_def = (Keyword("enum") + LBRACE + - delimitedList(Group(ident("name") + '=' + (hexinteger|integer)("value")))("evalues") - + Optional(COMMA) - + RBRACE) +enum_def = ( + Keyword("enum") + + LBRACE + + DelimitedList(Group(ident("name") + "=" + (hexinteger | integer)("value")))( + "evalues" + ) + + Opt(COMMA) + + RBRACE +) c_header = open("snmp_api.h").read() @@ -91,18 +109,22 @@ def derivefields(t): enum_constants = [] # add structures commonly included from std lib headers -def addStdType(t,namespace=""): - fullname = namespace+'_'+t if namespace else t +def addStdType(t, namespace=""): + fullname = namespace + "_" + t if namespace else t typemap[t] = fullname user_defined_types.add(t) + + addStdType("fd_set", "sys_select") addStdType("timeval", "sys_time") + def getUDType(typestr): key = typestr.rstrip(" *") if key not in typemap: user_defined_types.add(key) - typemap[key] = "{0}_{1}".format(module, key) + typemap[key] = "{}_{}".format(module, key) + def typeAsCtypes(typestr): if typestr in typemap: @@ -111,21 +133,25 @@ def typeAsCtypes(typestr): return "POINTER(%s)" % typeAsCtypes(typestr.rstrip(" *")) return typestr + # scan input header text for primitive typedefs -for td,_,_ in var_typedef.scanString(c_header): - typedefs.append( (td.name, td.primType) ) +for td, _, _ in var_typedef.scan_string(c_header): + typedefs.append((td.name, td.primType)) # add typedef type to typemap to map to itself typemap[td.name] = td.name # scan input header text for function typedefs -fn_typedefs = fn_typedef.searchString(c_header) +fn_typedefs = fn_typedef.search_string(c_header) # add each function typedef to typemap to map to itself for fntd in fn_typedefs: typemap[fntd.fn_name] = fntd.fn_name # scan input header text, and keep running list of user-defined types -for fn,_,_ in (cStyleComment.suppress() | fn_typedef.suppress() | func_def).scanString(c_header): - if not fn: continue +for fn, _, _ in ( + cStyleComment.suppress() | fn_typedef.suppress() | func_def +).scan_string(c_header): + if not fn: + continue getUDType(fn.fn_type) for arg in fn.fn_args: if arg != "...": @@ -134,39 +160,46 @@ def typeAsCtypes(typestr): functions.append(fn) # scan input header text for enums -enum_def.ignore(cppStyleComment) -for en_,_,_ in enum_def.scanString(c_header): +enum_def.ignore(cpp_style_comment) +for en_, _, _ in enum_def.scan_string(c_header): for ev in en_.evalues: - enum_constants.append( (ev.name, ev.value) ) + enum_constants.append((ev.name, ev.value)) print("from ctypes import *") -print("{0} = 
CDLL('{1}.dll')".format(module, module)) +print("{} = CDLL('{}.dll')".format(module, module)) print() print("# user defined types") -for tdname,tdtyp in typedefs: - print("{0} = {1}".format(tdname, typemap[tdtyp])) +for tdname, tdtyp in typedefs: + print("{} = {}".format(tdname, typemap[tdtyp])) for fntd in fn_typedefs: - print("{0} = CFUNCTYPE({1})".format(fntd.fn_name, - ',\n '.join(typeAsCtypes(a.argtype) for a in fntd.fn_args))) + print( + "{} = CFUNCTYPE({})".format( + fntd.fn_name, ",\n ".join(typeAsCtypes(a.argtype) for a in fntd.fn_args) + ) + ) for udtype in user_defined_types: print("class %s(Structure): pass" % typemap[udtype]) print() print("# constant definitions") -for en,ev in enum_constants: - print("{0} = {1}".format(en,ev)) +for en, ev in enum_constants: + print("{} = {}".format(en, ev)) print() print("# functions") for fn in functions: - prefix = "{0}.{1}".format(module, fn.fn_name) + prefix = "{}.{}".format(module, fn.fn_name) - print("{0}.restype = {1}".format(prefix, typeAsCtypes(fn.fn_type))) + print("{}.restype = {}".format(prefix, typeAsCtypes(fn.fn_type))) if fn.varargs: print("# warning - %s takes variable argument list" % prefix) del fn.fn_args[-1] - if fn.fn_args.asList() != [['void']]: - print("{0}.argtypes = ({1},)".format(prefix, ','.join(typeAsCtypes(a.argtype) for a in fn.fn_args))) + if fn.fn_args.asList() != [["void"]]: + print( + "{}.argtypes = ({},)".format( + prefix, ",".join(typeAsCtypes(a.argtype) for a in fn.fn_args) + ) + ) else: print("%s.argtypes = ()" % (prefix)) diff --git a/examples/getNTPserversNew.py b/examples/getNTPserversNew.py index c86e7561..8c4c94f3 100644 --- a/examples/getNTPserversNew.py +++ b/examples/getNTPserversNew.py @@ -7,30 +7,32 @@ # September, 2010 - updated to more current use of setResultsName, new NIST URL # import pyparsing as pp -ppc = pp.pyparsing_common -from contextlib import closing -try: - import urllib.request - urlopen = urllib.request.urlopen -except ImportError: - import urllib - urlopen = urllib.urlopen +ppc = pp.pyparsing_common +from urllib.request import urlopen integer = pp.Word(pp.nums) ipAddress = ppc.ipv4_address() -hostname = pp.delimitedList(pp.Word(pp.alphas, pp.alphanums+"-_"), ".", combine=True) -tdStart, tdEnd = pp.makeHTMLTags("td") -timeServerPattern = (tdStart + hostname("hostname") + tdEnd - + tdStart + ipAddress("ipAddr") + tdEnd - + tdStart + tdStart.tag_body("loc") + tdEnd) +hostname = pp.DelimitedList(pp.Word(pp.alphas, pp.alphanums + "-_"), ".", combine=True) +tdStart, tdEnd = pp.make_html_tags("td") +timeServerPattern = ( + tdStart + + hostname("hostname") + + tdEnd + + tdStart + + ipAddress("ipAddr") + + tdEnd + + tdStart + + tdStart.tag_body("loc") + + tdEnd +) # get list of time servers nistTimeServerURL = "https://tf.nist.gov/tf-cgi/servers.cgi#" -with closing(urlopen(nistTimeServerURL)) as serverListPage: +with urlopen(nistTimeServerURL) as serverListPage: serverListHTML = serverListPage.read().decode("UTF-8") addrs = {} -for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML): - print("{0} ({1}) - {2}".format(srvr.ipAddr, srvr.hostname.strip(), srvr.loc.strip())) +for srvr, startloc, endloc in timeServerPattern.scan_string(serverListHTML): + print(f"{srvr.ipAddr} ({srvr.hostname.strip()}) - {srvr.loc.strip()}") addrs[srvr.ipAddr] = srvr.loc diff --git a/examples/greeting.py b/examples/greeting.py index 6b1cfe31..17a7b2ab 100644 --- a/examples/greeting.py +++ b/examples/greeting.py @@ -8,18 +8,20 @@ import pyparsing as pp # define grammar -greet = 
pp.Word(pp.alphas) + "," + pp.Word(pp.alphas) + pp.oneOf("! ? .") +greet = pp.Word(pp.alphas) + "," + pp.Word(pp.alphas) + pp.one_of("! ? .") # input string hello = "Hello, World!" # parse input string -print(hello, "->", greet.parseString( hello )) +print(hello, "->", greet.parse_string(hello)) # parse a bunch of input strings -greet.runTests("""\ +greet.run_tests( + """\ Hello, World! Ahoy, Matey! Howdy, Pardner! Morning, Neighbor! - """) \ No newline at end of file + """ +) diff --git a/examples/greetingInGreek.py b/examples/greetingInGreek.py index 8d20c365..aa8272a6 100644 --- a/examples/greetingInGreek.py +++ b/examples/greetingInGreek.py @@ -1,4 +1,3 @@ -# vim:fileencoding=utf-8 # # greetingInGreek.py # @@ -10,10 +9,10 @@ # define grammar alphas = ppu.Greek.alphas -greet = Word(alphas) + ',' + Word(alphas) + '!' +greet = Word(alphas) + "," + Word(alphas) + "!" # input string hello = "Καλημέρα, κόσμε!" # parse input string -print(greet.parseString(hello)) +print(greet.parse_string(hello)) diff --git a/examples/greetingInKorean.py b/examples/greetingInKorean.py index 8b6fa495..d2c0b634 100644 --- a/examples/greetingInKorean.py +++ b/examples/greetingInKorean.py @@ -1,4 +1,3 @@ -# vim:fileencoding=utf-8 # # greetingInKorean.py # @@ -8,14 +7,14 @@ # from pyparsing import Word, pyparsing_unicode as ppu -koreanChars = ppu.Korean.alphas -koreanWord = Word(koreanChars, min=2) +korean_chars = ppu.한국어.alphas +korean_word = Word(korean_chars, min=2) # define grammar -greet = koreanWord + "," + koreanWord + "!" +greet = korean_word + "," + korean_word + "!" # input string -hello = '안녕, 여러분!' #"Hello, World!" in Korean +hello = "안녕, 여러분!" # "Hello, World!" in Korean # parse input string -print(greet.parseString(hello)) +print(greet.parse_string(hello)) diff --git a/examples/holaMundo.py b/examples/holaMundo.py deleted file mode 100644 index 2773a34e..00000000 --- a/examples/holaMundo.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- - -# escrito por Marco Alfonso, 2004 Noviembre - -# importamos los símbolos requeridos desde el módulo -from pyparsing import Word, alphas, oneOf, nums, Group, OneOrMore, pyparsing_unicode as ppu - -# usamos las letras en latin1, que incluye las como 'ñ', 'á', 'é', etc. -alphas = ppu.Latin1.alphas - -# Aqui decimos que la gramatica "saludo" DEBE contener -# una palabra compuesta de caracteres alfanumericos -# (Word(alphas)) mas una ',' mas otra palabra alfanumerica, -# mas '!' y esos seian nuestros tokens -saludo = Word(alphas) + ',' + Word(alphas) + oneOf('! . ?') -tokens = saludo.parseString("Hola, Mundo !") - -# Ahora parseamos una cadena, "Hola, Mundo!", -# el metodo parseString, nos devuelve una lista con los tokens -# encontrados, en caso de no haber errores... -for i, token in enumerate(tokens): - print ("Token %d -> %s" % (i,token)) - -#imprimimos cada uno de los tokens Y listooo!!, he aquí a salida -# Token 0 -> Hola -# Token 1 -> , -# Token 2-> Mundo -# Token 3 -> ! - -# ahora cambia el parseador, aceptando saludos con mas que una sola palabra antes que ',' -saludo = Group(OneOrMore(Word(alphas))) + ',' + Word(alphas) + oneOf('! . ?') -tokens = saludo.parseString("Hasta mañana, Mundo !") - -for i, token in enumerate(tokens): - print ("Token %d -> %s" % (i,token)) - -# Ahora parseamos algunas cadenas, usando el metodo runTests -saludo.runTests("""\ - Hola, Mundo! - Hasta mañana, Mundo ! 
-""", fullDump=False) - -# Por supuesto, se pueden "reutilizar" gramáticas, por ejemplo: -numimag = Word(nums) + 'i' -numreal = Word(nums) -numcomplex = numreal + '+' + numimag -print (numcomplex.parseString("3+5i")) - -# Cambiar a complejo numero durante parsear: -numcomplex.setParseAction(lambda t: complex(''.join(t).replace('i','j'))) -print (numcomplex.parseString("3+5i")) - -# Excelente!!, bueno, los dejo, me voy a seguir tirando código... diff --git a/examples/hola_mundo.py b/examples/hola_mundo.py new file mode 100644 index 00000000..d44bb351 --- /dev/null +++ b/examples/hola_mundo.py @@ -0,0 +1,73 @@ +# escrito por Marco Alfonso, 2004 Noviembre + +# importamos los símbolos requeridos desde el módulo +from pyparsing import ( + Word, + one_of, + nums, + Group, + OneOrMore, + Opt, + pyparsing_unicode as ppu, +) + +# usamos las letras en latin1, que incluye las como 'ñ', 'á', 'é', etc. +alphas = ppu.Latin1.alphas + +# Aqui decimos que la gramatica "saludo" DEBE contener +# una palabra compuesta de caracteres alfanumericos +# (Word(alphas)) mas una ',' mas otra palabra alfanumerica, +# mas '!' y esos seian nuestros tokens +saludo = Word(alphas) + "," + Word(alphas) + one_of("! . ?") +tokens = saludo.parse_string("Hola, Mundo !") + +# Ahora parseamos una cadena, "Hola, Mundo!", +# el metodo parseString, nos devuelve una lista con los tokens +# encontrados, en caso de no haber errores... +for i, token in enumerate(tokens): + print(f"Token {i} -> {token}") + +# imprimimos cada uno de los tokens Y listooo!!, he aquí a salida +# Token 0 -> Hola +# Token 1 -> , +# Token 2-> Mundo +# Token 3 -> ! + +# ahora cambia el parseador, aceptando saludos con mas que una sola palabra antes que ',' +saludo = Group(OneOrMore(Word(alphas))) + "," + Word(alphas) + one_of("! . ?") +tokens = saludo.parse_string("Hasta mañana, Mundo !") + +for i, token in enumerate(tokens): + print(f"Token {i} -> {token}") + +# Ahora parseamos algunas cadenas, usando el metodo runTests +saludo.run_tests("""\ + Hola, Mundo! + Hasta mañana, Mundo ! + """, + fullDump=False, +) + +# Por supuesto, se pueden "reutilizar" gramáticas, por ejemplo: +numimag = Word(nums) + "i" +numreal = Word(nums) +numcomplex = numimag | numreal + Opt("+" + numimag) + +# Funcion para cambiar a complejo numero durante parsear: +def hace_python_complejo(t): + valid_python = "".join(t).replace("i", "j") + for tipo in (int, complex): + try: + return tipo(valid_python) + except ValueError: + pass + + +numcomplex.set_parse_action(hace_python_complejo) +numcomplex.run_tests("""\ + 3 + 5i + 3+5i +""") + +# Excelente!!, bueno, los dejo, me voy a seguir tirando código... diff --git a/examples/htmlStripper.py b/examples/htmlStripper.py deleted file mode 100644 index 18f33959..00000000 --- a/examples/htmlStripper.py +++ /dev/null @@ -1,32 +0,0 @@ -# -# htmlStripper.py -# -# Sample code for stripping HTML markup tags and scripts from -# HTML source files. 
-# -# Copyright (c) 2006, 2016, Paul McGuire -# -from contextlib import closing -import urllib.request, urllib.parse, urllib.error -from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity, - htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith) - -scriptOpen, scriptClose = makeHTMLTags("script") -scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose -commonHTMLEntity.setParseAction(replaceHTMLEntity) - -# get some HTML -targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary" -with closing(urllib.request.urlopen( targetURL )) as targetPage: - targetHTML = targetPage.read().decode("UTF-8") - -# first pass, strip out tags and translate entities -firstPass = (htmlComment | scriptBody | commonHTMLEntity | - anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML) - -# first pass leaves many blank lines, collapse these down -repeatedNewlines = LineEnd()*(2,) -repeatedNewlines.setParseAction(replaceWith("\n\n")) -secondPass = repeatedNewlines.transformString(firstPass) - -print(secondPass) diff --git a/examples/htmlTableParser.py b/examples/htmlTableParser.py deleted file mode 100644 index 35cdd038..00000000 --- a/examples/htmlTableParser.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# htmlTableParser.py -# -# Example of parsing a simple HTML table into a list of rows, and optionally into a little database -# -# Copyright 2019, Paul McGuire -# - -import pyparsing as pp -import urllib.request - - -# define basic HTML tags, and compose into a Table -table, table_end = pp.makeHTMLTags('table') -thead, thead_end = pp.makeHTMLTags('thead') -tbody, tbody_end = pp.makeHTMLTags('tbody') -tr, tr_end = pp.makeHTMLTags('tr') -th, th_end = pp.makeHTMLTags('th') -td, td_end = pp.makeHTMLTags('td') -a, a_end = pp.makeHTMLTags('a') - -# method to strip HTML tags from a string - will be used to clean up content of table cells -strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString - -# expression for parsing text links, returning a (text, url) tuple -link = pp.Group(a + a.tag_body('text') + a_end.suppress()) -link.addParseAction(lambda t: (t[0].text, t[0].href)) - -# method to create table rows of header and data tags -def table_row(start_tag, end_tag): - body = start_tag.tag_body - body.addParseAction(pp.tokenMap(str.strip), - pp.tokenMap(strip_html)) - row = pp.Group(tr.suppress() - + pp.ZeroOrMore(start_tag.suppress() - + body - + end_tag.suppress()) - + tr_end.suppress()) - return row - -th_row = table_row(th, th_end) -td_row = table_row(td, td_end) - -# define expression for overall table - may vary slightly for different pages -html_table = table + tbody + pp.Optional(th_row('headers')) + pp.ZeroOrMore(td_row)('rows') + tbody_end + table_end - - -# read in a web page containing an interesting HTML table -with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_tz_database_time_zones") as page: - page_html = page.read().decode() - -tz_table = html_table.searchString(page_html)[0] - -# convert rows to dicts -rows = [dict(zip(tz_table.headers, row)) for row in tz_table.rows] - -# make a dict keyed by TZ database name -tz_db = {row['TZ database name']: row for row in rows} - -from pprint import pprint -pprint(tz_db['America/Chicago']) diff --git a/examples/html_stripper.py b/examples/html_stripper.py new file mode 100644 index 00000000..92d38c75 --- /dev/null +++ b/examples/html_stripper.py @@ -0,0 +1,58 @@ +# +# html_stripper.py +# +# Sample code for stripping HTML markup tags and scripts from +# HTML source files. 
+# +# Copyright (c) 2006, 2016, 2023, Paul McGuire +# +from urllib.request import urlopen +from pyparsing import ( + LineEnd, + quoted_string, + make_html_tags, + common_html_entity, + replace_html_entity, + html_comment, + any_open_tag, + any_close_tag, + replace_with, +) + +# if