Introduce the concept of an iterstore

gordonbrander · gordonbrander · commit 6b02a90d0fc3 · 2018-02-17T16:47:25.000-08:00
This lets us serialize iterators as JSON to a file. It lets us take
snapshots of transformed docs so we can have many readers of the same
transformed content, without having to load all docs into memory at
once.
diff --git a/lettersmith/bin/site.py b/lettersmith/bin/site.py
@@ -18,49 +18,71 @@
 from lettersmith import taxonomy
 from lettersmith import jinjatools
 from lettersmith import jsontools
+from lettersmith import iterstore
 from lettersmith.data import load_data_files
-from lettersmith.file import copy, copy_all
+from lettersmith.file import copy_all
 
 
 def main():
     parser = lettersmith_argparser(
         description="""Generates a blog-aware site with Lettersmith""")
     args = parser.parse_args()
     config = read_config(args.config)
-    input_path = config["input_path"]
+    input_path = Path(config["input_path"])
     output_path = config["output_path"]
     theme_path = config["theme_path"]
     base_url = config["base_url"]
+    build_drafts = config["build_drafts"]
 
-    data = load_data_files(config["data_path"])
+    with tempfile.TemporaryDirectory(suffix="_lettersmith") as cache_path:
+        data = load_data_files(config["data_path"])
 
-    md_paths = tuple(Path(input_path).glob("**/*.md"))
-    docs = Docs.load(md_paths, relative_to=input_path)
-    docs = docs if config["build_drafts"] else Docs.remove_drafts(docs)
+        md_paths = (
+            x for x in input_path.glob("**/*.md")
+            if pathtools.should_pub(x, build_drafts))
 
-    docs = (Doc.decorate_smart_items(doc) for doc in docs)
-    docs = templatetools.map_templates(docs)
-    docs = (wikilink.uplift_wikilinks(doc) for doc in docs)
-    docs = map_permalink(docs, config["permalink_templates"])
-    docs = markdowntools.map_markdown(docs)
-    docs = absolutize.map_absolutize(docs, base=base_url)
-    docs = (Doc.decorate_summary(doc) for doc in docs)
+        md_docs = Docs.load(md_paths, relative_to=input_path)
+
+        json_paths = (
+            x for x in input_path.glob("**/*.json")
+            if pathtools.should_pub(x, build_drafts))
+
+        json_docs = Docs.load_json(json_paths, relative_to=input_path)
+
+        yaml_paths = (
+            x for x in input_path.glob("**/*.yaml")
+            if pathtools.should_pub(x, build_drafts))
+
+        yaml_docs = Docs.load_yaml(yaml_paths, relative_to=input_path)
+
+        docs = chain(md_docs, json_docs, yaml_docs)
+
+        docs = (Doc.change_ext(doc, ".html") for doc in docs)
+        docs = (Doc.decorate_smart_items(doc) for doc in docs)
+        docs = templatetools.map_templates(docs)
+        docs = (wikilink.uplift_wikilinks(doc) for doc in docs)
+        docs = map_permalink(docs, config["permalink_templates"])
+        docs = markdowntools.map_markdown(docs)
+        docs = absolutize.map_absolutize(docs, base=base_url)
 
-    with tempfile.TemporaryDirectory(suffix="_lettersmith") as cache_path:
         doc_cache_path = PurePath(cache_path, "docs.txt")
-        jsontools.write_chunks(doc_cache_path, docs)
+        # Store current state of docs to disk.
+        # The returned IterStore is an iterable that reads them back from disk,
+        # so the stored docs can be iterated over more than once.
+        docs_store = iterstore.store(docs, doc_cache_path)
 
-        stub_docs = jsontools.load_chunks(doc_cache_path)
-        stub_docs = tuple(Doc.rm_content(doc) for doc in stub_docs)
+        stub_docs = tuple(Doc.rm_content(doc) for doc in docs_store)
+
+        index = Docs.reduce_index(stub_docs)
         wikilink_index = wikilink.index_wikilinks(stub_docs, base=base_url)
         backlink_index = wikilink.index_backlinks(stub_docs)
-        index = Docs.reduce_index(stub_docs)
-        taxonomy_index = taxonomy.index_by_taxonomy(stub_docs,
-            config["taxonomies"])
+        taxonomy_index = taxonomy.index_by_taxonomy(
+            stub_docs, config["taxonomies"])
+
         paging_docs = paging.gen_paging(stub_docs, **config["paging"])
 
-        docs = jsontools.load_chunks(doc_cache_path)
-        docs = wikilink.map_wikilinks(docs, wikilink_index)
+        docs = wikilink.map_wikilinks(docs_store, wikilink_index)
+        docs = (Doc.decorate_summary(doc) for doc in docs)
 
         docs = chain(docs, paging_docs)
 
@@ -69,24 +91,21 @@ def main():
             "index": index,
             "taxonomy_index": taxonomy_index,
             "backlink_index": backlink_index,
+            "wikilink_index": wikilink_index,
             "site": config["site"],
             "data": data,
             "base_url": base_url,
             "now": datetime.now()
         }
 
-        docs = jinjatools.map_jinja(docs, context=context, theme_path=theme_path)
+        docs = jinjatools.map_jinja(
+            docs, context=context, theme_path=theme_path)
         stats = Docs.write(docs, output_path=output_path)
 
-    # Copy static files from project dir (if any)
-    try:
-        copy_all(config["static_paths"], output_path)
-    except CalledProcessError:
-        pass
-
-    # Copy static files from theme (if any)
     try:
-        copy(PurePath(theme_path, "static"), output_path)
+        static_paths = config.get("static_paths", [])
+        static_paths.append(PurePath(theme_path, "static"))
+        copy_all(static_paths, output_path)
     except CalledProcessError:
         pass
 
diff --git a/lettersmith/data.py b/lettersmith/data.py
@@ -16,9 +16,6 @@ def _smart_read_data_file(file_path):
 
     * .json
     * .yaml
-    * .lson: Line-delimeted json. Each line is a discrete JSON blob.
-      This function will also attempt to read .txt as line-delimeted JSON.
-      See https://en.wikipedia.org/wiki/JSON_Streaming#Line_delimited_JSON.
     """
     ext = Path(file_path).suffix
     with open(file_path, "r") as f:
diff --git a/lettersmith/doc.py b/lettersmith/doc.py
@@ -1,27 +1,28 @@
 from os import path
 from pathlib import PurePath
+import json
 
 import frontmatter
 
 from lettersmith.date import parse_iso_8601, read_file_times, EPOCH
 from lettersmith.file import write_file_deep
+from lettersmith import yamltools
 from lettersmith.stringtools import truncate, strip_html
 from lettersmith import path as pathtools
 from lettersmith.util import put, merge, unset, pick
 
 
-def load(pathlike, relative_to=""):
+def load_raw(pathlike, relative_to=""):
     """
     Loads a basic doc dictionary from a file path. This dictionary
-    contains content string, the meta (headmatter) of the doc and some
-    basic information about the file.
+    contains content string, and some basic information about the file.
+    Typically, you decorate the doc later with meta and other fields.
 
     Returns a dictionary.
     """
     file_created_time, file_modified_time = read_file_times(pathlike)
     with open(pathlike) as f:
-        raw = f.read()
-        meta, content = frontmatter.parse(raw)
+        content = f.read()
         input_path = PurePath(pathlike)
         simple_path = input_path.relative_to(relative_to)
         output_path = pathtools.to_nice_path(simple_path)
@@ -32,11 +33,79 @@ def load(pathlike, relative_to=""):
             "input_path": str(input_path),
             "simple_path": str(simple_path),
             "output_path": str(output_path),
-            "meta": meta,
             "content": content
         }
 
 
+def load(pathlike, relative_to=""):
+    """
+    Loads a doc dictionary with optional headmatter from a file path.
+    This dictionary contains content string, meta from the headmatter,
+    and some basic information about the file.
+
+    Returns a dictionary.
+    """
+    return parse_doc_frontmatter(load_raw(pathlike, relative_to))
+
+
+def load_yaml(pathlike, relative_to=""):
+    """
+    Loads a doc dictionary from a YAML file.
+    This dictionary contains an empty content string, meta from the file,
+    and some basic information about the file.
+
+    Returns a dictionary.
+    """
+    return parse_doc_yaml(load_raw(pathlike, relative_to))
+
+
+def load_json(pathlike, relative_to=""):
+    """
+    Loads a doc dictionary from a JSON file.
+    This dictionary contains an empty content string, meta from the file,
+    and some basic information about the file.
+
+    Returns a dictionary.
+    """
+    return parse_doc_json(load_raw(pathlike, relative_to))
+
+
+def parse_doc_frontmatter(doc):
+    """
+    Split headmatter from doc content. Sets headmatter meta as doc meta.
+    Sets content as content.
+
+    If no meta is present, sets an empty dict as meta.
+    """
+    meta, content = frontmatter.parse(doc["content"])
+    return merge(doc, {
+        "meta": meta,
+        "content": content
+    })
+
+
+def parse_doc_yaml(doc):
+    """
+    Load doc content as YAML data
+    """
+    meta = yamltools.loads(doc["content"])
+    return merge(doc, {
+        "meta": meta,
+        "content": ""
+    })
+
+
+def parse_doc_json(doc):
+    """
+    Load doc content as JSON data
+    """
+    meta = json.loads(doc["content"])
+    return merge(doc, {
+        "meta": meta,
+        "content": ""
+    })
+
+
 def rm_content(doc):
     """
     Remove the content field.
diff --git a/lettersmith/docs.py b/lettersmith/docs.py
@@ -7,7 +7,7 @@
 
 def load(file_paths, relative_to=""):
     """
-    Given an iterable of fle paths, create an iterable of loaded docs.
+    Given an iterable of file paths, create an iterable of loaded docs.
     Ignores special files.
     """
     return (
@@ -17,6 +17,30 @@ def load(file_paths, relative_to=""):
     )
 
 
+def load_json(file_paths, relative_to=""):
+    """
+    Given an iterable of JSON file paths, create an iterable of loaded docs.
+    Ignores special files.
+    """
+    return (
+        Doc.load_json(x, relative_to=relative_to)
+        for x in file_paths
+        if is_doc_file(x)
+    )
+
+
+def load_yaml(file_paths, relative_to=""):
+    """
+    Given an iterable of YAML file paths, create an iterable of loaded docs.
+    Ignores special files.
+    """
+    return (
+        Doc.load_yaml(x, relative_to=relative_to)
+        for x in file_paths
+        if is_doc_file(x)
+    )
+
+
 def remove_drafts(docs):
     return (doc for doc in docs if not is_draft(doc["simple_path"]))
 
diff --git a/lettersmith/iterstore.py b/lettersmith/iterstore.py
@@ -0,0 +1,25 @@
+from lettersmith import jsontools
+
+
+class IterStore:
+    """
+    Consume an iterable, serializing it and saving it to disk.
+    You can read the stored items back out via the `__iter__` method.
+    This allows you to use it as an iterable that may be iterated
+    multiple times, re-reading the items from disk each time.
+    """
+    def __init__(self, iterable, file_path):
+        self.file_path = file_path
+        jsontools.write_chunks(iterable, file_path)
+
+    def __iter__(self):
+        return jsontools.load_chunks(self.file_path)
+
+
+def store(iterable, file_path):
+    """
+    Store the iterator at `file_path`.
+
+    Right now this is just a proxy for IterStore initialization.
+    """
+    return IterStore(iterable, file_path)
diff --git a/lettersmith/jsontools.py b/lettersmith/jsontools.py
@@ -99,7 +99,7 @@ def load_chunks(file_path, object_hook=decode_object_hook):
             yield json.loads(chunk, object_hook=object_hook)
 
 
-def write_chunks(file_path, iterable, default=encode_default):
+def write_chunks(iterable, file_path, default=encode_default):
     """
     Dump an iterable of JSON blobs into a file, line-by-line.
 
diff --git a/lettersmith/path.py b/lettersmith/path.py
@@ -119,6 +119,14 @@ def is_draft(pathlike):
     return PurePath(pathlike).name.startswith("_")
 
 
+def should_pub(pathlike, build_drafts=False):
+    """
+    Should you publish this? This function is just an ergonomic shortcut
+    for filtering out drafts based on build_drafts setting.
+    """
+    return build_drafts or not is_draft(pathlike)
+
+
 def is_dotfile(pathlike):
     return PurePath(pathlike).name.startswith(".")