@@ -1403,9 +1403,9 @@ especially useful if you want to pass in a string instead of calling
1403
1403
1404
1404
This example shows the implementation of a pipeline component that fetches
1405
1405
country metadata via the [REST Countries API](https://restcountries.eu), sets
1406
- entity annotations for countries, merges entities into one token and sets custom
1407
- attributes on the `Doc`, `Span` and `Token` – for example, the capital,
1408
- latitude / longitude coordinates and even the country flag.
1406
+ entity annotations for countries and sets custom attributes on the `Doc` and
1407
+ `Span` – for example, the capital, latitude / longitude coordinates and even the
1408
+ country flag.
1409
1409
1410
1410
```python
1411
1411
### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
1427
1427
# Set up the PhraseMatcher with Doc patterns for each country name
1428
1428
self .matcher = PhraseMatcher(nlp.vocab)
1429
1429
self .matcher.add(" COUNTRIES" , [nlp.make_doc(c) for c in self .countries.keys()])
1430
- # Register attribute on the Token . We'll be overwriting this based on
1430
+ # Register attributes on the Span. We'll be overwriting these based on
1431
1431
# the matches, so we're only setting a default value, not a getter.
1432
- Token .set_extension(" is_country" , default = False )
1433
- Token .set_extension(" country_capital" , default = False )
1434
- Token .set_extension(" country_latlng" , default = False )
1435
- Token .set_extension(" country_flag" , default = False )
1436
- # Register attributes on Doc and Span via a getter that checks if one of
1437
- # the contained tokens is set to is_country == True.
1432
+ Span .set_extension(" is_country" , default = None )
1433
+ Span .set_extension(" country_capital" , default = None )
1434
+ Span .set_extension(" country_latlng" , default = None )
1435
+ Span .set_extension(" country_flag" , default = None )
1436
+ # Register attribute on Doc via a getter that checks if the Doc
1437
+ # contains a country entity
1438
1438
Doc.set_extension(" has_country" , getter = self .has_country)
1439
- Span.set_extension(" has_country" , getter = self .has_country)
1440
1439
1441
1440
def __call__(self, doc):
    """Annotate country mentions in ``doc`` as entities and return the Doc.

    Every phrase-matcher hit becomes a ``Span`` labelled with ``self.label``.
    Overlapping hits are reduced to the longest match, the country data
    stored in ``self.countries`` is copied onto the span's custom
    attributes, and the spans are appended to the existing ``doc.ents``.

    doc: The Doc to process.
    RETURNS: The same Doc, with country entities added.
    """
    # Local import so the method does not depend on the file's import block.
    from spacy.util import filter_spans

    candidates = [
        Span(doc, start, end, label=self.label)
        for _, start, end in self.matcher(doc)
    ]
    # doc.ents rejects overlapping spans, and country names can be nested
    # inside one another (e.g. "Guinea" within "Papua New Guinea"), so keep
    # only the longest non-overlapping matches.
    spans = filter_spans(candidates)
    for entity in spans:
        # Look the country record up once instead of once per attribute.
        country = self.countries[entity.text]
        # Set custom attributes on entity. Can be extended with other data
        # returned by the API, like currencies, country code, calling code etc.
        entity._.set("is_country", True)
        entity._.set("country_capital", country["capital"])
        entity._.set("country_latlng", country["latlng"])
        entity._.set("country_flag", country["flag"])
    # Overwrite doc.ents and add entity – be careful not to replace!
    doc.ents = list(doc.ents) + spans
    return doc  # don't forget to return the Doc!
1462
1455
1463
def has_country(self, doc):
    """Getter for the Doc's 'has_country' attribute.

    Since the getter is only called when we access the attribute, we can
    refer to the Span's 'is_country' attribute here, which is already set
    in the processing step.

    doc: The Doc whose entities are inspected.
    RETURNS: True if at least one entity in doc.ents has a truthy
        'is_country' value, otherwise False.
    """
    # A generator lets any() stop at the first country entity instead of
    # building a full list first.
    return any(ent._.get("is_country") for ent in doc.ents)
1468
1461
1469
1462
# Build a blank English pipeline, add the component under its registered
# name and run it over a sample text.
nlp = English()
nlp.add_pipe("rest_countries", config={"label": "GPE"})
doc = nlp("Some text about Colombia and the Czech Republic")
print("Pipeline", nlp.pipe_names)  # pipeline contains component name
print("Doc has countries", doc._.has_country)  # Doc contains countries
# The country data lives on the entity spans, so iterate over doc.ents.
for ent in doc.ents:
    if ent._.is_country:
        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
1478
1470
```
1479
1471
1480
1472
In this case, all data can be fetched on initialization in one request. However,
0 commit comments