Skip to content

Commit 514a92d

Browse files
committed
Tree-sitter extractors: use fresh IDs for locations
Since locations for any given source file are never referenced in any TRAP files besides the one for that particular source file, it's not necessary to use global IDs. Using fresh IDs will reduce the size of the ID pool (both on disk and in memory) and the speed of multi-threaded TRAP import. The one exception is the empty location, which still uses a global ID.
1 parent 06334ee commit 514a92d

File tree

2 files changed

+105
-45
lines changed

2 files changed

+105
-45
lines changed

shared/tree-sitter-extractor/src/extractor/mod.rs

+83-45
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,16 @@ fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
4343

4444
pub fn populate_empty_location(writer: &mut trap::Writer) {
4545
let file_label = populate_empty_file(writer);
46-
location(writer, file_label, 0, 0, 0, 0);
46+
global_location(
47+
writer,
48+
file_label,
49+
trap::Location {
50+
start_line: 0,
51+
start_column: 0,
52+
end_line: 0,
53+
end_column: 0,
54+
},
55+
);
4756
}
4857

4958
pub fn populate_parent_folders(
@@ -85,28 +94,54 @@ pub fn populate_parent_folders(
8594
}
8695
}
8796

88-
fn location(
97+
/** Get the label for the given location, defining it a global ID if it doesn't exist yet. */
98+
fn global_location(
8999
writer: &mut trap::Writer,
90100
file_label: trap::Label,
91-
start_line: usize,
92-
start_column: usize,
93-
end_line: usize,
94-
end_column: usize,
101+
location: trap::Location,
95102
) -> trap::Label {
96103
let (loc_label, fresh) = writer.global_id(&format!(
97104
"loc,{{{}}},{},{},{},{}",
98-
file_label, start_line, start_column, end_line, end_column
105+
file_label,
106+
location.start_line,
107+
location.start_column,
108+
location.end_line,
109+
location.end_column
99110
));
100111
if fresh {
101112
writer.add_tuple(
102113
"locations_default",
103114
vec![
104115
trap::Arg::Label(loc_label),
105116
trap::Arg::Label(file_label),
106-
trap::Arg::Int(start_line),
107-
trap::Arg::Int(start_column),
108-
trap::Arg::Int(end_line),
109-
trap::Arg::Int(end_column),
117+
trap::Arg::Int(location.start_line),
118+
trap::Arg::Int(location.start_column),
119+
trap::Arg::Int(location.end_line),
120+
trap::Arg::Int(location.end_column),
121+
],
122+
);
123+
}
124+
loc_label
125+
}
126+
127+
/** Get the label for the given location, creating it as a fresh ID if we haven't seen the location
128+
* yet for this file. */
129+
fn location_label(
130+
writer: &mut trap::Writer,
131+
file_label: trap::Label,
132+
location: trap::Location,
133+
) -> trap::Label {
134+
let (loc_label, fresh) = writer.location_label(location);
135+
if fresh {
136+
writer.add_tuple(
137+
"locations_default",
138+
vec![
139+
trap::Arg::Label(loc_label),
140+
trap::Arg::Label(file_label),
141+
trap::Arg::Int(location.start_line),
142+
trap::Arg::Int(location.start_column),
143+
trap::Arg::Int(location.end_line),
144+
trap::Arg::Int(location.end_column),
110145
],
111146
);
112147
}
@@ -245,26 +280,25 @@ impl<'a> Visitor<'a> {
245280
node: Node,
246281
status_page: bool,
247282
) {
248-
let (start_line, start_column, end_line, end_column) = location_for(self, node);
249-
let loc = location(
250-
self.trap_writer,
251-
self.file_label,
252-
start_line,
253-
start_column,
254-
end_line,
255-
end_column,
256-
);
283+
let loc = location_for(self, node);
284+
let loc_label = location_label(self.trap_writer, self.file_label, loc);
257285
let mut mesg = self.diagnostics_writer.new_entry(
258286
"parse-error",
259287
"Could not process some files due to syntax errors",
260288
);
261289
mesg.severity(diagnostics::Severity::Warning)
262-
.location(self.path, start_line, start_column, end_line, end_column)
290+
.location(
291+
self.path,
292+
loc.start_line,
293+
loc.start_column,
294+
loc.end_line,
295+
loc.end_column,
296+
)
263297
.message(message, args);
264298
if status_page {
265299
mesg.status_page();
266300
}
267-
self.record_parse_error(loc, &mesg);
301+
self.record_parse_error(loc_label, &mesg);
268302
}
269303

270304
fn enter_node(&mut self, node: Node) -> bool {
@@ -298,15 +332,8 @@ impl<'a> Visitor<'a> {
298332
return;
299333
}
300334
let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
301-
let (start_line, start_column, end_line, end_column) = location_for(self, node);
302-
let loc = location(
303-
self.trap_writer,
304-
self.file_label,
305-
start_line,
306-
start_column,
307-
end_line,
308-
end_column,
309-
);
335+
let loc = location_for(self, node);
336+
let loc_label = location_label(self.trap_writer, self.file_label, loc);
310337
let table = self
311338
.schema
312339
.get(&TypeName {
@@ -333,7 +360,7 @@ impl<'a> Visitor<'a> {
333360
trap::Arg::Label(id),
334361
trap::Arg::Label(parent_id),
335362
trap::Arg::Int(parent_index),
336-
trap::Arg::Label(loc),
363+
trap::Arg::Label(loc_label),
337364
],
338365
);
339366
self.trap_writer.add_tuple(
@@ -356,7 +383,7 @@ impl<'a> Visitor<'a> {
356383
trap::Arg::Label(id),
357384
trap::Arg::Label(parent_id),
358385
trap::Arg::Int(parent_index),
359-
trap::Arg::Label(loc),
386+
trap::Arg::Label(loc_label),
360387
],
361388
);
362389
let mut all_args = vec![trap::Arg::Label(id)];
@@ -366,14 +393,20 @@ impl<'a> Visitor<'a> {
366393
}
367394
_ => {
368395
self.record_parse_error(
369-
loc,
396+
loc_label,
370397
self.diagnostics_writer
371398
.new_entry(
372399
"parse-error",
373400
"Could not process some files due to syntax errors",
374401
)
375402
.severity(diagnostics::Severity::Warning)
376-
.location(self.path, start_line, start_column, end_line, end_column)
403+
.location(
404+
self.path,
405+
loc.start_line,
406+
loc.start_column,
407+
loc.end_line,
408+
loc.end_column,
409+
)
377410
.message(
378411
"Unknown table type: {}",
379412
&[diagnostics::MessageArg::Code(node.kind())],
@@ -555,7 +588,7 @@ fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg {
555588
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
556589
// The first is the location and label definition, and the second is the
557590
// 'Located' entry.
558-
fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize) {
591+
fn location_for(visitor: &mut Visitor, n: Node) -> trap::Location {
559592
// Tree-sitter row, column values are 0-based while CodeQL starts
560593
// counting at 1. In addition Tree-sitter's row and column for the
561594
// end position are exclusive while CodeQL's end positions are inclusive.
@@ -565,16 +598,16 @@ fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize)
565598
// the end column is 0 (start of a line). In such cases the end position must be
566599
// set to the end of the previous line.
567600
let start_line = n.start_position().row + 1;
568-
let start_col = n.start_position().column + 1;
601+
let start_column = n.start_position().column + 1;
569602
let mut end_line = n.end_position().row + 1;
570-
let mut end_col = n.end_position().column;
571-
if start_line > end_line || start_line == end_line && start_col > end_col {
603+
let mut end_column = n.end_position().column;
604+
if start_line > end_line || start_line == end_line && start_column > end_column {
572605
// the range is empty, clip it to sensible values
573606
end_line = start_line;
574-
end_col = start_col - 1;
575-
} else if end_col == 0 {
607+
end_column = start_column - 1;
608+
} else if end_column == 0 {
576609
let source = visitor.source;
577-
// end_col = 0 means that we are at the start of a line
610+
// end_column = 0 means that we are at the start of a line
578611
// unfortunately 0 is invalid as column number, therefore
579612
// we should update the end location to be the end of the
580613
// previous line
@@ -591,10 +624,10 @@ fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize)
591624
);
592625
}
593626
end_line -= 1;
594-
end_col = 1;
627+
end_column = 1;
595628
while index > 0 && source[index - 1] != b'\n' {
596629
index -= 1;
597-
end_col += 1;
630+
end_column += 1;
598631
}
599632
} else {
600633
visitor.diagnostics_writer.write(
@@ -612,7 +645,12 @@ fn location_for(visitor: &mut Visitor, n: Node) -> (usize, usize, usize, usize)
612645
);
613646
}
614647
}
615-
(start_line, start_col, end_line, end_col)
648+
trap::Location {
649+
start_line,
650+
start_column,
651+
end_line,
652+
end_column,
653+
}
616654
}
617655

618656
fn traverse(tree: &Tree, visitor: &mut Visitor) {

shared/tree-sitter-extractor/src/trap.rs

+22
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,23 @@ use std::path::Path;
55

66
use flate2::write::GzEncoder;
77

8+
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)]
9+
pub struct Location {
10+
pub start_line: usize,
11+
pub start_column: usize,
12+
pub end_line: usize,
13+
pub end_column: usize,
14+
}
15+
816
pub struct Writer {
917
/// The accumulated trap entries
1018
trap_output: Vec<Entry>,
1119
/// A counter for generating fresh labels
1220
counter: u32,
1321
/// cache of global keys
1422
global_keys: std::collections::HashMap<String, Label>,
23+
/// Labels for locations, which don't use global keys
24+
location_labels: std::collections::HashMap<Location, Label>,
1525
}
1626

1727
impl Writer {
@@ -20,6 +30,7 @@ impl Writer {
2030
counter: 0,
2131
trap_output: Vec::new(),
2232
global_keys: std::collections::HashMap::new(),
33+
location_labels: std::collections::HashMap::new(),
2334
}
2435
}
2536

@@ -50,6 +61,17 @@ impl Writer {
5061
(label, true)
5162
}
5263

64+
/// Gets the label for the given location. The first call for a given location will define it as
65+
/// a fresh (star) ID.
66+
pub fn location_label(&mut self, loc: Location) -> (Label, bool) {
67+
if let Some(label) = self.location_labels.get(&loc) {
68+
return (*label, false);
69+
}
70+
let label = self.fresh_id();
71+
self.location_labels.insert(loc, label);
72+
(label, true)
73+
}
74+
5375
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
5476
self.trap_output
5577
.push(Entry::GenericTuple(table_name.to_owned(), args))

0 commit comments

Comments
 (0)