Skip to content

Commit ef22cbf

Browse files
committed
add full linear suffix array
1 parent 14a45d1 commit ef22cbf

File tree

3 files changed

+151
-109
lines changed

3 files changed

+151
-109
lines changed

string-utility/SuffixArray.cc

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#include "../data-structure/RangeMinimumQuery.cc"
2+
#include <cstring>
3+
#include <vector>
4+
#include <algorithm>
5+
6+
// Two Efficient Algorithms for Linear Suffix Array Construction
7+
#define pushS(x) sa[--cur[(int)s[x]]] = x
8+
#define pushL(x) sa[cur[(int)s[x]]++] = x
9+
class SuffixArray {
10+
public:
11+
std::vector<int> sa;
12+
std::vector<int> rank;
13+
std::vector<int> lcp;
14+
SchieberVishkinRMQ<int> lcpRMQ;
15+
16+
template <class T> void build(const T *s, int n);
17+
template <class T> void build(const T *s, int n, int m);
18+
19+
int size() const {
20+
return sa.size() - 1;
21+
}
22+
int computeLCP(int i, int j) const {
23+
if (i == j) return size() - i;
24+
int x = rank[i], y = rank[j];
25+
if (x > y) std::swap(x, y);
26+
return lcp[lcpRMQ.query(x + 1, y)];
27+
}
28+
29+
private:
30+
using SLTypes = std::vector<bool>;
31+
int *buffer, *freq, *cur;
32+
int len;
33+
34+
void buildRankTable();
35+
void buildLCPArrayRMQ();
36+
template <class T> void computeLCPArray(const T *s);
37+
template <class T> void countFrequence(const T *s, int n, int m);
38+
template <class T> void induce(const T *s, int *sa, int m, const SLTypes &t);
39+
template <class T> void sa_is(const T *s, int *sa, int n, int m);
40+
};
41+
42+
template <class T>
43+
void SuffixArray::build(const T *s, int n) {
44+
this->len = n;
45+
int m = *std::max_element(s, s + n) + 1;
46+
build(s, n, m);
47+
buildRankTable();
48+
computeLCPArray(s);
49+
buildLCPArrayRMQ();
50+
}
51+
52+
template <class T>
53+
void SuffixArray::build(const T *s, int n, int m) {
54+
sa.resize(n + 1);
55+
if (n == 0) sa[0] = 0;
56+
else {
57+
// memory usage: sa + buffer + types
58+
// = 4 * (n + max(m * 2, n)) + 2 * n / 8 + O(1) bytes
59+
std::vector<int> buffer((std::max(m, (n + 1) / 2) + 1) * 2);
60+
this->buffer = &buffer[0];
61+
sa_is<T>(s, &sa[0], n + 1, m);
62+
}
63+
}
64+
65+
void SuffixArray::buildRankTable() {
66+
int n = size() + 1;
67+
rank.resize(n);
68+
for (int i = 0; i < n; ++i) rank[sa[i]] = i;
69+
}
70+
71+
void SuffixArray::buildLCPArrayRMQ() {
72+
lcpRMQ.build(&lcp[0], size() + 1);
73+
}
74+
75+
template <class T>
76+
void SuffixArray::computeLCPArray(const T *s) {
77+
const int n = size() + 1;
78+
lcp.resize(n);
79+
for (int i = 0, h = lcp[0] = 0; i < n; ++i) {
80+
int j = sa[rank[i] - 1];
81+
while (i + h < n && j + h < n && s[i + h] == s[j + h]) ++h;
82+
if (lcp[rank[i]] = h) --h;
83+
}
84+
}
85+
86+
template <class T>
87+
void SuffixArray::countFrequence(const T *s, int n, int m) {
88+
memset(freq, 0, sizeof(int) * m);
89+
for (int i = 0; i < n; ++i) ++freq[(int)s[i]];
90+
for (int i = 1; i < m; ++i) freq[i] += freq[i - 1];
91+
memcpy(cur, freq, sizeof(int) * m);
92+
}
93+
94+
template <class T>
95+
void SuffixArray::induce(const T *s, int *sa, int m, const SLTypes &t) {
96+
const int n = t.size();
97+
memcpy(cur + 1, freq, sizeof(int) * (m - 1));
98+
for (int i = 0; i < n; ++i) {
99+
int p = sa[i] - 1;
100+
if (p >= 0 && t[p]) pushL(p);
101+
}
102+
memcpy(cur, freq, sizeof(int) * m);
103+
for (int i = n - 1; i > 0; --i) {
104+
int p = sa[i] - 1;
105+
if (p >= 0 && !t[p]) pushS(p);
106+
}
107+
}
108+
109+
template <class T>
110+
void SuffixArray::sa_is(const T *s, int *sa, int n, int m) {
111+
SLTypes t(n); memset(sa, 0, sizeof(int) * n);
112+
for (int i = n - 2; ~i; --i) {
113+
t[i] = (s[i] == s[i + 1]) ? t[i + 1] : s[i] > s[i + 1];
114+
}
115+
freq = buffer, cur = buffer + m;
116+
countFrequence(s, n, m);
117+
for (int i = 1; i < n; ++i) if (t[i - 1] > t[i]) pushS(i);
118+
induce(s, sa, m, t);
119+
int n1 = 0, order = 0;
120+
for (int i = 0, p; i < n; ++i) {
121+
if ((p = sa[i]) && t[p - 1] > t[p]) sa[n1++] = p;
122+
}
123+
int *s1 = sa + n1;
124+
memset(s1, -1, sizeof(int) * (n - n1));
125+
s1[(sa[0] - 1) / 2] = order++;
126+
for (int i = 1; i < n1; ++i) {
127+
bool diff = true;
128+
for (int x = sa[i - 1], y = sa[i]; ; ++x, ++y) {
129+
if (s[x] != s[y] || t[x] != t[y]) break;
130+
else if (t[x] > t[x + 1] && t[y] > t[y + 1]) {
131+
diff = (s[x + 1] != s[y + 1]); break;
132+
}
133+
}
134+
s1[(sa[i] - 1) / 2] = (order += diff) - 1;
135+
}
136+
for (int i = 0, x = 0; i < n - n1; ++i) {
137+
if (~s1[i]) s1[x++] = s1[i];
138+
}
139+
if (order != n1) {
140+
sa_is<int>(s1, sa, n1, order);
141+
for (int i = 0; i < n1; ++i) s1[sa[i]] = i;
142+
}
143+
for (int i = 1, j = 0; i < n; ++i) {
144+
if (t[i - 1] > t[i]) sa[s1[j++]] = -i;
145+
}
146+
memset(s1, 0, sizeof(int) * (n - n1));
147+
freq = buffer, cur = buffer + m;
148+
countFrequence(s, n, m);
149+
for (int i = n1 - 1; ~i; --i) pushS(-sa[i]);
150+
induce(s, sa, m, t);
151+
}

string-utility/sa-is.cc

Lines changed: 0 additions & 62 deletions
This file was deleted.

string-utility/sa-prefix-doubling.cc

Lines changed: 0 additions & 47 deletions
This file was deleted.

0 commit comments

Comments
 (0)