11
11
12
12
from __future__ import division , absolute_import , print_function
13
13
import numpy as np
14
+ import pandas as pd
14
15
from six .moves import range
16
+ import six
15
17
16
18
17
19
def distance (x , y , type = "euclid" ):
@@ -20,11 +22,11 @@ def distance(x, y, type="euclid"):
20
22
function to use another metric
21
23
22
24
:param x: first vector
23
- :rtype x: iterable
25
+ :type x: iterable
24
26
:param y: second vector
25
- :rtype y: iterable
27
+ :type y: iterable
26
28
:param type: how to calculate the distance
27
- :rtype type: str
29
+ :type type: str
28
30
29
31
:return: the calculated distance between both vectors
30
32
"""
@@ -57,6 +59,8 @@ def _sliding_window(data, pattern_length):
57
59
# todo: I removed the +1 here, because the last window was not returned, can you verfy that?
58
60
dimensions = (data .shape [- 1 ] - pattern_length , pattern_length )
59
61
steplen = (data .strides [- 1 ],) + data .strides
62
+
63
+ # TODO: can we somehow remove the dependence on `as_strided`? even its own docstrings warns not to use it...
60
64
return np .lib .stride_tricks .as_strided (data , shape = dimensions , strides = steplen )
61
65
62
66
@@ -72,9 +76,16 @@ def _match_scores(data, pattern):
72
76
73
77
74
78
def _best_n_matches (data , sample , count = 1 ):
79
+ """
80
+
81
+ :param data:
82
+ :param sample:
83
+ :param count:
84
+ :return:
85
+ """
75
86
match_scores = np .absolute (_sliding_window (data , sample ))
76
87
top_list = []
77
- for i in range (count ):
88
+ for _ in range (count ):
78
89
top_spot = np .argmax (match_scores )
79
90
match_scores [top_spot :top_spot + len (sample )] = np .zeros ((len (sample , )))
80
91
top_list .append (top_spot )
@@ -86,11 +97,15 @@ def _candidates_top_uniques(length, candidates, count):
86
97
The candidate filter if statement first makes sure this isn't a reverse match (A-->B, don't also return B-->A),
87
98
then takes the top item of any overlapping motifs (first ones in are the best ones, then exclude lower ones),
88
99
then eliminates any matches that contain parts of the better ones above
100
+
89
101
:param candidates:
90
102
:param count:
91
103
:return:
92
104
"""
105
+
106
+ # sort candidates by distance
93
107
candidates .sort (key = lambda result : result [2 ])
108
+
94
109
top_uniques = []
95
110
for candidate in candidates :
96
111
if candidate [0 ] not in [y for x in top_uniques for y in range (x [1 ], x [1 ] + length )] \
@@ -102,38 +117,61 @@ def _candidates_top_uniques(length, candidates, count):
102
117
return top_uniques [0 :count ]
103
118
104
119
105
- def find_motifs (length , data , motif_count ):
120
+ def find_motifs (data , motif_length , motif_count ):
106
121
"""
122
+ Goes over the data iterable and searches for motifs in it. The result is a list of tuples holding the start_point
123
+ of each motif, the best match point, and its distance
107
124
108
- :param length: length of the motifs to look for
109
125
:param data: times series data to match motifs in
126
+ :type data: iterable
127
+ :param motif_length: length of the motifs to look for
128
+ :type motif_length: int
110
129
:param motif_count: how many motifs to return
111
- :return: tuples of length 3 holding start_point of each motif, best match point, and distance metric
130
+ :type motif_count: int
131
+
132
+ :return: tuples of length 3 holding start_point of each motif, best match point, and distance
133
+ :return type: list
112
134
"""
113
- if length * 8 > len (data ):
135
+
136
+ if motif_length * 8 > len (data ):
114
137
raise ValueError ("Motif size too large for dataset." )
138
+
115
139
candidates = []
116
- for start in range (len (data ) - (3 * length )):
117
- match_pattern = data [start :start + length ]
118
- pattern_scores = _match_scores (data [start + length :- length ], match_pattern )
140
+
141
+ for start in range (len (data ) - (3 * motif_length )):
142
+
143
+ pattern = data [start :start + motif_length ]
144
+ # todo: why you are not matching backwards? You only look for matches of the pattern starting at start
145
+ pattern_scores = _match_scores (data [start + motif_length :- motif_length ], pattern )
146
+
119
147
candidates .append ((start ,
120
- np .argmin (pattern_scores ) + start + length ,
148
+ np .argmin (pattern_scores ) + start + motif_length ,
121
149
np .min (pattern_scores )))
122
- return _candidates_top_uniques (length , candidates , motif_count )
150
+
151
+ return _candidates_top_uniques (motif_length , candidates , motif_count )
123
152
124
153
125
154
def count_motifs (data , motif , dist = 10 ):
126
155
"""
127
- It's interesting that this can return values for distance measures better than the "best" found during motif
128
- finding. I think this is backmatching,
129
156
130
157
:param data: time series data to search
158
+ :type data: iterable
131
159
:param motif: motif in tuple format (start, best match, score)
160
+ :type motif: list of tuples
132
161
:param dist: Count any segment found to be this close
162
+ :type dist: numeric
163
+
133
164
:return: returns an integer count
165
+ :return type: int
134
166
"""
135
- length = len (motif )
136
- match_pattern = data [motif [0 ]:motif [0 ] + length ]
137
- pattern_scores = _match_scores (data , match_pattern )
138
- pattern_scores [motif [0 ] - length :motif [0 ] + length ] = np .inf
167
+
168
+ # Todo: turn off the backmatching, see following comment
169
+ # It's interesting that this can return values for distance measures better than the "best" found during motif
170
+ # finding. I think this is backmatching,
171
+
172
+ l = len (motif )
173
+ pattern = data [motif [0 ]:motif [0 ] + l ]
174
+
175
+ pattern_scores = _match_scores (data , pattern )
176
+ pattern_scores [motif [0 ] - l :motif [0 ] + l ] = np .inf
139
177
return np .sum (pattern_scores < dist )
0 commit comments