@@ -21,26 +21,14 @@ def load_sentences_json(path, tag_scheme):
 
             json_data = json.loads(line)
             entities = json_data['entities']
-            sentence = [[t] for t in json_data['tokens']]
-            curr = 0
-            for e in entities:
-                name = e['name']
-                end = e['end']
-                begin = e['begin']
-
-                while curr < begin:
-                    sentence[curr].append(Constants.ENTITY_OTHER_TAG)
-                    curr += 1
+            sentence = [[t, Constants.ENTITY_OTHER_TAG] for t in json_data['tokens']]
 
-                sentence[curr].append(Constants.ENTITY_BEGIN + name)
-                curr += 1
-                while curr <= end:
-                    sentence[curr].append(Constants.ENTITY_INSIDE + name)
-                    curr += 1
+            for e in entities:
+                name, end, begin = e['name'], e['end'], e['begin']
 
-            while curr < len(sentence):
-                sentence[curr].append('O')
-                curr += 1
+                sentence[begin][1] = Constants.ENTITY_BEGIN + name
+                for i in range(begin + 1, end + 1):
+                    sentence[i][1] = Constants.ENTITY_INSIDE + name
 
             sentences.append(sentence)
 
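The rewrite above replaces the cursor walk with a two-pass scheme: every token is initialized with the outside tag, then each entity span overwrites its own slice with begin/inside tags. A minimal standalone sketch of that logic, assuming Constants.ENTITY_BEGIN = 'B-', Constants.ENTITY_INSIDE = 'I-', Constants.ENTITY_OTHER_TAG = 'O', and inclusive 'end' offsets (none of these values appear in this diff, and tag_sentence is a hypothetical name):

# Hypothetical stand-ins for the Constants module; the real values
# are not part of this commit.
ENTITY_OTHER_TAG = 'O'
ENTITY_BEGIN = 'B-'
ENTITY_INSIDE = 'I-'

def tag_sentence(tokens, entities):
    # Pass 1: every token starts tagged as outside any entity.
    sentence = [[t, ENTITY_OTHER_TAG] for t in tokens]
    # Pass 2: each entity overwrites its span; 'end' is inclusive,
    # matching range(begin + 1, end + 1) in the new code.
    for e in entities:
        name, end, begin = e['name'], e['end'], e['begin']
        sentence[begin][1] = ENTITY_BEGIN + name
        for i in range(begin + 1, end + 1):
            sentence[i][1] = ENTITY_INSIDE + name
    return sentence

tokens = ['John', 'Smith', 'lives', 'in', 'Paris']
entities = [{'name': 'PER', 'begin': 0, 'end': 1},
            {'name': 'LOC', 'begin': 4, 'end': 4}]
print(tag_sentence(tokens, entities))
# [['John', 'B-PER'], ['Smith', 'I-PER'], ['lives', 'O'],
#  ['in', 'O'], ['Paris', 'B-LOC']]

A side benefit visible in the diff itself: the old version tagged trailing tokens with a literal 'O' while earlier tokens got Constants.ENTITY_OTHER_TAG; initializing every slot up front makes the tagging uniform and removes that inconsistency.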
@@ -82,7 +70,7 @@ def prepare_dataset(sentences, vocab, config):
     return data
 
 def get_chunks(seq):
-    col_names = ['name', 'end', 'begin']
+    col_names = ['name', 'begin', 'end']
     chunks = []
     chunk_type, chunk_start = None, None
     for i, tok in enumerate(seq):
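The column reorder matches the tuple order a BIO chunker naturally produces: judging from the chunk_type/chunk_start bookkeeping visible above, get_chunks emits one (name, begin, end) triple per entity, so 'begin' must precede 'end' in the labels. A sketch of typical BIO chunk extraction under that assumption (get_chunks_sketch is a hypothetical stand-in, not necessarily this repo's exact implementation):

def get_chunks_sketch(seq):
    # seq is a list of BIO tags, e.g. ['B-PER', 'I-PER', 'O', 'B-LOC'].
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        if tok == 'O':
            # An outside tag closes any open chunk.
            if chunk_type is not None:
                chunks.append((chunk_type, chunk_start, i - 1))
                chunk_type, chunk_start = None, None
        elif tok.startswith('B-') or tok[2:] != chunk_type:
            # A begin tag (or a type change) closes the open chunk
            # and starts a new one.
            if chunk_type is not None:
                chunks.append((chunk_type, chunk_start, i - 1))
            chunk_type, chunk_start = tok[2:], i
    if chunk_type is not None:
        chunks.append((chunk_type, chunk_start, len(seq) - 1))
    return chunks

print(get_chunks_sketch(['B-PER', 'I-PER', 'O', 'B-LOC']))
# [('PER', 0, 1), ('LOC', 3, 3)] -> order matches ['name', 'begin', 'end']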