Skip to content

Commit d7cdedd

Browse files
committed
refactoring of motif submodule
1 parent febfa33 commit d7cdedd

File tree

1 file changed

+57
-19
lines changed

1 file changed

+57
-19
lines changed

tsfresh/feature_extraction/motifs.py

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
from __future__ import division, absolute_import, print_function
1313
import numpy as np
14+
import pandas as pd
1415
from six.moves import range
16+
import six
1517

1618

1719
def distance(x, y, type="euclid"):
@@ -20,11 +22,11 @@ def distance(x, y, type="euclid"):
2022
function to use another metric
2123
2224
:param x: first vector
23-
:rtype x: iterable
25+
:type x: iterable
2426
:param y: second vector
25-
:rtype y: iterable
27+
:type y: iterable
2628
:param type: how to calculate the distance
27-
:rtype type: str
29+
:type type: str
2830
2931
:return: the calculated distance between both vectors
3032
"""
@@ -57,6 +59,8 @@ def _sliding_window(data, pattern_length):
5759
# todo: I removed the +1 here, because the last window was not returned, can you verfy that?
5860
dimensions = (data.shape[-1] - pattern_length, pattern_length)
5961
steplen = (data.strides[-1],) + data.strides
62+
63+
# TODO: can we somehow remove the dependence on `as_strided`? even its own docstrings warns not to use it...
6064
return np.lib.stride_tricks.as_strided(data, shape=dimensions, strides=steplen)
6165

6266

@@ -72,9 +76,16 @@ def _match_scores(data, pattern):
7276

7377

7478
def _best_n_matches(data, sample, count=1):
79+
"""
80+
81+
:param data:
82+
:param sample:
83+
:param count:
84+
:return:
85+
"""
7586
match_scores = np.absolute(_sliding_window(data, sample))
7687
top_list = []
77-
for i in range(count):
88+
for _ in range(count):
7889
top_spot = np.argmax(match_scores)
7990
match_scores[top_spot:top_spot + len(sample)] = np.zeros((len(sample, )))
8091
top_list.append(top_spot)
@@ -86,11 +97,15 @@ def _candidates_top_uniques(length, candidates, count):
8697
The candidate filter if statement first makes sure this isn't a reverse match (A-->B, don't also return B-->A),
8798
then takes the top item of any overlapping motifs (first ones in are the best ones, then exclude lower ones),
8899
then eliminates any matches that contain parts of the better ones above
100+
89101
:param candidates:
90102
:param count:
91103
:return:
92104
"""
105+
106+
# sort candidates by distance
93107
candidates.sort(key=lambda result: result[2])
108+
94109
top_uniques = []
95110
for candidate in candidates:
96111
if candidate[0] not in [y for x in top_uniques for y in range(x[1], x[1] + length)] \
@@ -102,38 +117,61 @@ def _candidates_top_uniques(length, candidates, count):
102117
return top_uniques[0:count]
103118

104119

105-
def find_motifs(length, data, motif_count):
120+
def find_motifs(data, motif_length, motif_count):
106121
"""
122+
Goes over the data iterable and searches for motifs in it. The result is a list of tuples holding the start_point
123+
of each motif, the best match point, and its distance
107124
108-
:param length: length of the motifs to look for
109125
:param data: times series data to match motifs in
126+
:type data: iterable
127+
:param motif_length: length of the motifs to look for
128+
:type motif_length: int
110129
:param motif_count: how many motifs to return
111-
:return: tuples of length 3 holding start_point of each motif, best match point, and distance metric
130+
:type motif_count: int
131+
132+
:return: tuples of length 3 holding start_point of each motif, best match point, and distance
133+
:return type: list
112134
"""
113-
if length * 8 > len(data):
135+
136+
if motif_length * 8 > len(data):
114137
raise ValueError("Motif size too large for dataset.")
138+
115139
candidates = []
116-
for start in range(len(data) - (3 * length)):
117-
match_pattern = data[start:start + length]
118-
pattern_scores = _match_scores(data[start + length:-length], match_pattern)
140+
141+
for start in range(len(data) - (3 * motif_length)):
142+
143+
pattern = data[start:start + motif_length]
144+
# todo: why you are not matching backwards? You only look for matches of the pattern starting at start
145+
pattern_scores = _match_scores(data[start + motif_length:-motif_length], pattern)
146+
119147
candidates.append((start,
120-
np.argmin(pattern_scores) + start + length,
148+
np.argmin(pattern_scores) + start + motif_length,
121149
np.min(pattern_scores)))
122-
return _candidates_top_uniques(length, candidates, motif_count)
150+
151+
return _candidates_top_uniques(motif_length, candidates, motif_count)
123152

124153

125154
def count_motifs(data, motif, dist=10):
126155
"""
127-
It's interesting that this can return values for distance measures better than the "best" found during motif
128-
finding. I think this is backmatching,
129156
130157
:param data: time series data to search
158+
:type data: iterable
131159
:param motif: motif in tuple format (start, best match, score)
160+
:type motif: list of tuples
132161
:param dist: Count any segment found to be this close
162+
:type dist: numeric
163+
133164
:return: returns an integer count
165+
:return type: int
134166
"""
135-
length = len(motif)
136-
match_pattern = data[motif[0]:motif[0] + length]
137-
pattern_scores = _match_scores(data, match_pattern)
138-
pattern_scores[motif[0] - length:motif[0] + length] = np.inf
167+
168+
# Todo: turn off the backmatching, see following comment
169+
# It's interesting that this can return values for distance measures better than the "best" found during motif
170+
# finding. I think this is backmatching,
171+
172+
l = len(motif)
173+
pattern = data[motif[0]:motif[0] + l]
174+
175+
pattern_scores = _match_scores(data, pattern)
176+
pattern_scores[motif[0] - l:motif[0] + l] = np.inf
139177
return np.sum(pattern_scores < dist)

0 commit comments

Comments
 (0)