Skip to content

Commit 6b02a90

Browse files
committed
Introduce the concept of an iterstore
This lets us serialize iterators as JSON to a file. It lets us take snapshots of transformed docs so we can have many readers of the same transformed content, without having to load all docs into memory at once.
1 parent a87500c commit 6b02a90

File tree

7 files changed

+184
-42
lines changed

7 files changed

+184
-42
lines changed

lettersmith/bin/site.py

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,49 +18,71 @@
1818
from lettersmith import taxonomy
1919
from lettersmith import jinjatools
2020
from lettersmith import jsontools
21+
from lettersmith import iterstore
2122
from lettersmith.data import load_data_files
22-
from lettersmith.file import copy, copy_all
23+
from lettersmith.file import copy_all
2324

2425

2526
def main():
2627
parser = lettersmith_argparser(
2728
description="""Generates a blog-aware site with Lettersmith""")
2829
args = parser.parse_args()
2930
config = read_config(args.config)
30-
input_path = config["input_path"]
31+
input_path = Path(config["input_path"])
3132
output_path = config["output_path"]
3233
theme_path = config["theme_path"]
3334
base_url = config["base_url"]
35+
build_drafts = config["build_drafts"]
3436

35-
data = load_data_files(config["data_path"])
37+
with tempfile.TemporaryDirectory(suffix="_lettersmith") as cache_path:
38+
data = load_data_files(config["data_path"])
3639

37-
md_paths = tuple(Path(input_path).glob("**/*.md"))
38-
docs = Docs.load(md_paths, relative_to=input_path)
39-
docs = docs if config["build_drafts"] else Docs.remove_drafts(docs)
40+
md_paths = (
41+
x for x in input_path.glob("**/*.md")
42+
if pathtools.should_pub(x, build_drafts))
4043

41-
docs = (Doc.decorate_smart_items(doc) for doc in docs)
42-
docs = templatetools.map_templates(docs)
43-
docs = (wikilink.uplift_wikilinks(doc) for doc in docs)
44-
docs = map_permalink(docs, config["permalink_templates"])
45-
docs = markdowntools.map_markdown(docs)
46-
docs = absolutize.map_absolutize(docs, base=base_url)
47-
docs = (Doc.decorate_summary(doc) for doc in docs)
44+
md_docs = Docs.load(md_paths, relative_to=input_path)
45+
46+
json_paths = (
47+
x for x in input_path.glob("**/*.json")
48+
if pathtools.should_pub(x, build_drafts))
49+
50+
json_docs = Docs.load_json(json_paths, relative_to=input_path)
51+
52+
yaml_paths = (
53+
x for x in input_path.glob("**/*.yaml")
54+
if pathtools.should_pub(x, build_drafts))
55+
56+
yaml_docs = Docs.load_yaml(yaml_paths, relative_to=input_path)
57+
58+
docs = chain(md_docs, json_docs, yaml_docs)
59+
60+
docs = (Doc.change_ext(doc, ".html") for doc in docs)
61+
docs = (Doc.decorate_smart_items(doc) for doc in docs)
62+
docs = templatetools.map_templates(docs)
63+
docs = (wikilink.uplift_wikilinks(doc) for doc in docs)
64+
docs = map_permalink(docs, config["permalink_templates"])
65+
docs = markdowntools.map_markdown(docs)
66+
docs = absolutize.map_absolutize(docs, base=base_url)
4867

49-
with tempfile.TemporaryDirectory(suffix="_lettersmith") as cache_path:
5068
doc_cache_path = PurePath(cache_path, "docs.txt")
51-
jsontools.write_chunks(doc_cache_path, docs)
69+
# Store current state of docs to disk.
70+
# The returned store is an iterable that reads them back out from disk each time it is iterated.
71+
# This allows you to iterate over the docs more than once.
72+
docs_store = iterstore.store(docs, doc_cache_path)
5273

53-
stub_docs = jsontools.load_chunks(doc_cache_path)
54-
stub_docs = tuple(Doc.rm_content(doc) for doc in stub_docs)
74+
stub_docs = tuple(Doc.rm_content(doc) for doc in docs_store)
75+
76+
index = Docs.reduce_index(stub_docs)
5577
wikilink_index = wikilink.index_wikilinks(stub_docs, base=base_url)
5678
backlink_index = wikilink.index_backlinks(stub_docs)
57-
index = Docs.reduce_index(stub_docs)
58-
taxonomy_index = taxonomy.index_by_taxonomy(stub_docs,
59-
config["taxonomies"])
79+
taxonomy_index = taxonomy.index_by_taxonomy(
80+
stub_docs, config["taxonomies"])
81+
6082
paging_docs = paging.gen_paging(stub_docs, **config["paging"])
6183

62-
docs = jsontools.load_chunks(doc_cache_path)
63-
docs = wikilink.map_wikilinks(docs, wikilink_index)
84+
docs = wikilink.map_wikilinks(docs_store, wikilink_index)
85+
docs = (Doc.decorate_summary(doc) for doc in docs)
6486

6587
docs = chain(docs, paging_docs)
6688

@@ -69,24 +91,21 @@ def main():
6991
"index": index,
7092
"taxonomy_index": taxonomy_index,
7193
"backlink_index": backlink_index,
94+
"wikilink_index": wikilink_index,
7295
"site": config["site"],
7396
"data": data,
7497
"base_url": base_url,
7598
"now": datetime.now()
7699
}
77100

78-
docs = jinjatools.map_jinja(docs, context=context, theme_path=theme_path)
101+
docs = jinjatools.map_jinja(
102+
docs, context=context, theme_path=theme_path)
79103
stats = Docs.write(docs, output_path=output_path)
80104

81-
# Copy static files from project dir (if any)
82-
try:
83-
copy_all(config["static_paths"], output_path)
84-
except CalledProcessError:
85-
pass
86-
87-
# Copy static files from theme (if any)
88105
try:
89-
copy(PurePath(theme_path, "static"), output_path)
106+
static_paths = config.get("static_paths", [])
107+
static_paths.append(PurePath(theme_path, "static"))
108+
copy_all(static_paths, output_path)
90109
except CalledProcessError:
91110
pass
92111

lettersmith/data.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@ def _smart_read_data_file(file_path):
1616
1717
* .json
1818
* .yaml
19-
* .lson: Line-delimeted json. Each line is a discrete JSON blob.
20-
This function will also attempt to read .txt as line-delimeted JSON.
21-
See https://en.wikipedia.org/wiki/JSON_Streaming#Line_delimited_JSON.
2219
"""
2320
ext = Path(file_path).suffix
2421
with open(file_path, "r") as f:

lettersmith/doc.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,28 @@
11
from os import path
22
from pathlib import PurePath
3+
import json
34

45
import frontmatter
56

67
from lettersmith.date import parse_iso_8601, read_file_times, EPOCH
78
from lettersmith.file import write_file_deep
9+
from lettersmith import yamltools
810
from lettersmith.stringtools import truncate, strip_html
911
from lettersmith import path as pathtools
1012
from lettersmith.util import put, merge, unset, pick
1113

1214

13-
def load(pathlike, relative_to=""):
15+
def load_raw(pathlike, relative_to=""):
1416
"""
1517
Loads a basic doc dictionary from a file path. This dictionary
16-
contains content string, the meta (headmatter) of the doc and some
17-
basic information about the file.
18+
contains content string, and some basic information about the file.
19+
Typically, you decorate the doc later with meta and other fields.
1820
1921
Returns a dictionary.
2022
"""
2123
file_created_time, file_modified_time = read_file_times(pathlike)
2224
with open(pathlike) as f:
23-
raw = f.read()
24-
meta, content = frontmatter.parse(raw)
25+
content = f.read()
2526
input_path = PurePath(pathlike)
2627
simple_path = input_path.relative_to(relative_to)
2728
output_path = pathtools.to_nice_path(simple_path)
@@ -32,11 +33,79 @@ def load(pathlike, relative_to=""):
3233
"input_path": str(input_path),
3334
"simple_path": str(simple_path),
3435
"output_path": str(output_path),
35-
"meta": meta,
3636
"content": content
3737
}
3838

3939

40+
def load(pathlike, relative_to=""):
41+
"""
42+
Loads a doc dictionary with optional headmatter from a file path.
43+
This dictionary contains content string, meta from the headmatter,
44+
and some basic information about the file.
45+
46+
Returns a dictionary.
47+
"""
48+
return parse_doc_frontmatter(load_raw(pathlike, relative_to))
49+
50+
51+
def load_yaml(pathlike, relative_to=""):
52+
"""
53+
Loads a doc dictionary from a YAML file.
54+
This dictionary contains an empty content string, meta from the file,
55+
and some basic information about the file.
56+
57+
Returns a dictionary.
58+
"""
59+
return parse_doc_yaml(load_raw(pathlike, relative_to))
60+
61+
62+
def load_json(pathlike, relative_to=""):
63+
"""
64+
Loads a doc dictionary from a JSON file.
65+
This dictionary contains an empty content string, meta from the file,
66+
and some basic information about the file.
67+
68+
Returns a dictionary.
69+
"""
70+
return parse_doc_json(load_raw(pathlike, relative_to))
71+
72+
73+
def parse_doc_frontmatter(doc):
74+
"""
75+
Split headmatter from doc content. Sets headmatter meta as doc meta.
76+
Sets content as content.
77+
78+
If no meta is present, sets an empty dict as meta.
79+
"""
80+
meta, content = frontmatter.parse(doc["content"])
81+
return merge(doc, {
82+
"meta": meta,
83+
"content": content
84+
})
85+
86+
87+
def parse_doc_yaml(doc):
88+
"""
89+
Parse doc content as YAML and set the result as doc meta. Sets content to an empty string.
90+
"""
91+
meta = yamltools.loads(doc["content"])
92+
return merge(doc, {
93+
"meta": meta,
94+
"content": ""
95+
})
96+
97+
98+
def parse_doc_json(doc):
99+
"""
100+
Parse doc content as JSON and set the result as doc meta. Sets content to an empty string.
101+
"""
102+
meta = json.loads(doc["content"])
103+
return merge(doc, {
104+
"meta": meta,
105+
"content": ""
106+
})
107+
108+
40109
def rm_content(doc):
41110
"""
42111
Remove the content field.

lettersmith/docs.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
def load(file_paths, relative_to=""):
99
"""
10-
Given an iterable of fle paths, create an iterable of loaded docs.
10+
Given an iterable of file paths, create an iterable of loaded docs.
1111
Ignores special files.
1212
"""
1313
return (
@@ -17,6 +17,30 @@ def load(file_paths, relative_to=""):
1717
)
1818

1919

20+
def load_json(file_paths, relative_to=""):
21+
"""
22+
Given an iterable of JSON file paths, create an iterable of loaded docs.
23+
Ignores special files.
24+
"""
25+
return (
26+
Doc.load_json(x, relative_to=relative_to)
27+
for x in file_paths
28+
if is_doc_file(x)
29+
)
30+
31+
32+
def load_yaml(file_paths, relative_to=""):
33+
"""
34+
Given an iterable of YAML file paths, create an iterable of loaded docs.
35+
Ignores special files.
36+
"""
37+
return (
38+
Doc.load_yaml(x, relative_to=relative_to)
39+
for x in file_paths
40+
if is_doc_file(x)
41+
)
42+
43+
2044
def remove_drafts(docs):
2145
return (doc for doc in docs if not is_draft(doc["simple_path"]))
2246

lettersmith/iterstore.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from lettersmith import jsontools
2+
3+
4+
class IterStore:
5+
"""
6+
Consume an iterable, serializing it and saving it to disk.
7+
You can read the stored iter back out via the `__iter__` method.
8+
This allows you to use it as an iterable that may be consumed
9+
multiple times from disk.
10+
"""
11+
def __init__(self, iterable, file_path):
12+
self.file_path = file_path
13+
jsontools.write_chunks(iterable, file_path)
14+
15+
def __iter__(self):
16+
return jsontools.load_chunks(self.file_path)
17+
18+
19+
def store(iterable, file_path):
20+
"""
21+
Store the items of `iterable` at `file_path` and return an iterable that reads them back.
22+
23+
Right now this is just a proxy for IterStore initialization.
24+
"""
25+
return IterStore(iterable, file_path)

lettersmith/jsontools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def load_chunks(file_path, object_hook=decode_object_hook):
9999
yield json.loads(chunk, object_hook=object_hook)
100100

101101

102-
def write_chunks(file_path, iterable, default=encode_default):
102+
def write_chunks(iterable, file_path, default=encode_default):
103103
"""
104104
Dump an iterable of JSON blobs into a file, line-by-line.
105105

lettersmith/path.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def is_draft(pathlike):
119119
return PurePath(pathlike).name.startswith("_")
120120

121121

122+
def should_pub(pathlike, build_drafts=False):
123+
"""
124+
Should you publish this? This function is just an ergonomic shortcut
125+
for filtering out drafts based on build_drafts setting.
126+
"""
127+
return build_drafts or not is_draft(pathlike)
128+
129+
122130
def is_dotfile(pathlike):
123131
return PurePath(pathlike).name.startswith(".")
124132

0 commit comments

Comments
 (0)