Skip to content

Commit 97ff090

Browse files
committed
Fix docs example [ci skip]
1 parent 9fb3244 commit 97ff090

File tree

1 file changed

+23
-31
lines changed

1 file changed

+23
-31
lines changed

website/docs/usage/processing-pipelines.md

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling
14031403
14041404
This example shows the implementation of a pipeline component that fetches
14051405
country meta data via the [REST Countries API](https://restcountries.eu), sets
1406-
entity annotations for countries, merges entities into one token and sets custom
1407-
attributes on the `Doc`, `Span` and `Token` for example, the capital,
1408-
latitude/longitude coordinates and even the country flag.
1406+
entity annotations for countries and sets custom attributes on the `Doc` and
1407+
`Span` – for example, the capital, latitude/longitude coordinates and even the
1408+
country flag.
14091409
14101410
```python
14111411
### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
14271427
# Set up the PhraseMatcher with Doc patterns for each country name
14281428
self.matcher = PhraseMatcher(nlp.vocab)
14291429
self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
1430-
# Register attribute on the Token. We'll be overwriting this based on
1430+
# Register attributes on the Span. We'll be overwriting these based on
14311431
# the matches, so we're only setting a default value, not a getter.
1432-
Token.set_extension("is_country", default=False)
1433-
Token.set_extension("country_capital", default=False)
1434-
Token.set_extension("country_latlng", default=False)
1435-
Token.set_extension("country_flag", default=False)
1436-
# Register attributes on Doc and Span via a getter that checks if one of
1437-
# the contained tokens is set to is_country == True.
1432+
Span.set_extension("is_country", default=None)
1433+
Span.set_extension("country_capital", default=None)
1434+
Span.set_extension("country_latlng", default=None)
1435+
Span.set_extension("country_flag", default=None)
1436+
# Register attribute on Doc via a getter that checks if the Doc
1437+
# contains a country entity
14381438
Doc.set_extension("has_country", getter=self.has_country)
1439-
Span.set_extension("has_country", getter=self.has_country)
14401439
14411440
def __call__(self, doc):
14421441
spans = [] # keep the spans for later so we can add them to doc.ents afterwards
14431442
for _, start, end in self.matcher(doc):
14441443
# Generate Span representing the entity & set label
14451444
entity = Span(doc, start, end, label=self.label)
1445+
# Set custom attributes on entity. Can be extended with other data
1446+
# returned by the API, like currencies, country code, calling code etc.
1447+
entity._.set("is_country", True)
1448+
entity._.set("country_capital", self.countries[entity.text]["capital"])
1449+
entity._.set("country_latlng", self.countries[entity.text]["latlng"])
1450+
entity._.set("country_flag", self.countries[entity.text]["flag"])
14461451
spans.append(entity)
1447-
# Set custom attribute on each token of the entity
1448-
# Can be extended with other data returned by the API, like
1449-
# currencies, country code, flag, calling code etc.
1450-
for token in entity:
1451-
token._.set("is_country", True)
1452-
token._.set("country_capital", self.countries[entity.text]["capital"])
1453-
token._.set("country_latlng", self.countries[entity.text]["latlng"])
1454-
token._.set("country_flag", self.countries[entity.text]["flag"])
1455-
# Iterate over all spans and merge them into one token
1456-
with doc.retokenize() as retokenizer:
1457-
for span in spans:
1458-
retokenizer.merge(span)
14591452
# Overwrite doc.ents and add entity – be careful not to replace!
14601453
doc.ents = list(doc.ents) + spans
14611454
return doc # don't forget to return the Doc!
14621455
1463-
def has_country(self, tokens):
1464-
"""Getter for Doc and Span attributes. Since the getter is only called
1465-
when we access the attribute, we can refer to the Token's 'is_country'
1456+
def has_country(self, doc):
1457+
"""Getter for Doc attributes. Since the getter is only called
1458+
when we access the attribute, we can refer to the Span's 'is_country'
14661459
attribute here, which is already set in the processing step."""
1467-
return any([t._.get("is_country") for t in tokens])
1460+
return any([entity._.get("is_country") for entity in doc.ents])
14681461
14691462
nlp = English()
14701463
nlp.add_pipe("rest_countries", config={"label": "GPE"})
14711464
doc = nlp("Some text about Colombia and the Czech Republic")
14721465
print("Pipeline", nlp.pipe_names) # pipeline contains component name
14731466
print("Doc has countries", doc._.has_country) # Doc contains countries
1474-
for token in doc:
1475-
if token._.is_country:
1476-
print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
1477-
print("Entities", [(e.text, e.label_) for e in doc.ents])
1467+
for ent in doc.ents:
1468+
if ent._.is_country:
1469+
print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
14781470
```
14791471
14801472
In this case, all data can be fetched on initialization in one request. However,

0 commit comments

Comments
 (0)