Skip to content

Commit 561c2da

Browse files
authored
structure refactoring, new script for listing recent products (#2)
* structure refactoring, new script for listing recent products, fixed S3 failing with missing reduced measurement data, better error handling in register_stac
1 parent c202b58 commit 561c2da

File tree

8 files changed

+336
-118
lines changed

8 files changed

+336
-118
lines changed

README.md

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,36 @@
1-
# DhusPytools
2-
Python scripts for Sentinel Data Hub
1+
# Dependencies
2+
Install dependent libraries with `pip install -r requirements.txt`.
33

4-
## [register_stac.py](./register_stac)
5-
Fetches data from Sentinel Data Hub and generates STAC metadata. Pushes the data to a catalogue.
4+
# Generate new list
5+
The **gen_new_list.py** script generates a list of new product IDs added to the DHuS database since the last run
6+
of this script and saves them. If there is no stored timestamp from a previous run, it generates the list of IDs from the last 31 days instead. The list of IDs is then
7+
stored in a file.
8+
9+
### Configuration
10+
**Authentication**: Basic auth is resolved automatically by the Requests library by reading a **~/.netrc** file. Make sure
11+
to set up the correct entry (DHuS URL) there.
12+
13+
**Configuration file**: The **sentinel_config.yml** contains the URL to the Sentinel data hub host.
14+
15+
**Command line arguments**: The command line options supersede the configuration file settings. Run help to list all
16+
configurable parameters: `./gen_new_list.py -h`
17+
18+
# Register Sentinel to STAC catalogue
19+
The **register_stac.py** script fetches Sentinel 1, 2, 3 and 5P metadata from a data hub
20+
and transforms it into a [STAC](https://stacspec.org/en) format. It can publish the results to a STAC catalogue.
21+
The transformation is done by imported [stactools](https://github.com/stac-utils/stactools) modules.
22+
23+
### Configuration
24+
There are several ways to configure the script's behaviour:
25+
26+
**Preconfigured mappings**: The **stac_collections.py** file contains general constants that should be updated
27+
by developers if necessary. You can modify the options if needed.
28+
29+
**Configuration file**: The **sentinel_config.yml** contains the URLs to the Sentinel data hub host and the STAC catalogue
30+
host, log file prefixes and path to a location to save the data.
31+
32+
**Command line arguments**: The command line options supersede the configuration file settings. Run help to list all
33+
configurable parameters: `./register_stac.py -h`
34+
35+
**Authentication**: Basic auth is resolved automatically by the Requests library by reading a **~/.netrc** file. Make sure
36+
to set up the correct entries (Sentinel and STAC host URL) there.

gen_new_list.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#!/usr/bin/python3
2+
3+
import argparse
4+
import os
5+
from datetime import datetime, timedelta
6+
7+
import requests
8+
import yaml
9+
10+
# When True, print_debug() writes progress messages; main() sets it from the --debug flag.
DEBUG = False
# strftime pattern for timestamps (ISO-like, with a microseconds fraction).
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
# YAML configuration file opened by read_configuration().
CONFIG_FILE = "sentinel_config.yml"
# File (inside the configured local directory) holding the product IDs of the last run.
LIST_FILENAME = "gen_new_list_processed.txt"
# File (inside the configured local directory) holding the timestamp of the last run.
TIMESTAMP_FILENAME = "gen_new_list_timestamp.txt"
15+
16+
17+
def parse_arguments(argv=None):
    """
    Parse command line arguments.

    argv is an optional list of argument strings; when None, argparse reads
    sys.argv[1:], which is what the script entry point relies on.

    Returns an argparse.Namespace with the attributes dryRun (bool),
    fromTimestamp (datetime or None), sentinelHost (str or None) and debug (bool).
    """
    # Adjacent string literals are concatenated verbatim, so each fragment
    # that continues a sentence needs its own separating space.
    parser = argparse.ArgumentParser(
        description='Generates a list of Sentinel products recently published at DHuS endpoint. '
                    'Example usage: ./gen_new_list.py')
    parser.add_argument('-r',
                        '--dryRun',
                        required=False,
                        action='store_true',
                        help='Do not store results')
    parser.add_argument('-f',
                        "--fromTimestamp",
                        required=False,
                        type=lambda d: datetime.strptime(d, '%Y-%m-%d'),
                        help="Alternative start date (YYYY-MM-DD) to use instead of stored timestamp.")
    parser.add_argument('-e',
                        '--sentinelHost',
                        required=False,
                        help='URL of server to fetch Sentinel data from, for example https://dhr1.cesnet.cz/. '
                             'Overwrites SENTINEL_HOST configuration option.')
    parser.add_argument('-d',
                        '--debug',
                        required=False,
                        action="store_true",
                        help='Enable extended progress messages.')

    return parser.parse_args(argv)
47+
48+
49+
def print_debug(msg):
    """
    Write msg to the console, but only while the module-level DEBUG flag is truthy.
    """
    if not DEBUG:
        return
    print(msg)
55+
56+
57+
def read_configuration():
    """
    Read the YAML configuration file named by CONFIG_FILE.

    Returns the parsed content. An empty configuration file parses to None,
    so an empty dict is returned in that case to keep the callers'
    config.get(...) lookups working.
    """
    with open(CONFIG_FILE, "r") as f:
        return yaml.safe_load(f) or {}
63+
64+
65+
def get_timestamp(local_dir):
    """
    Read the timestamp of the last script run from TIMESTAMP_FILENAME inside local_dir.

    If the file does not exist, is empty or does not match DATE_FORMAT, falls
    back to the moment 31 days before now. The returned string always carries
    exactly 3 fractional-second digits (milliseconds).
    """
    # strftime's %f yields 6 digits (microseconds); dropping the last 3 leaves milliseconds.
    fallback_timestamp = (datetime.now() - timedelta(days=31)).strftime(DATE_FORMAT)[:-3]
    timestamp_filepath = os.path.join(local_dir, TIMESTAMP_FILENAME)
    if not os.path.isfile(timestamp_filepath) or not os.path.getsize(timestamp_filepath):
        return fallback_timestamp

    with open(timestamp_filepath, "r") as f:
        content = f.read().strip()

    # Actually parse the stored value: slicing alone never raises, so without
    # this step a malformed file would be passed on unnoticed.
    try:
        parsed = datetime.strptime(content, DATE_FORMAT)
    except ValueError:
        print("Timestamp file exists but is formatted incorrectly")
        return fallback_timestamp

    # Re-format instead of slicing the raw text so the result has 3 decimal
    # places regardless of how many fraction digits were stored.
    timestamp = parsed.strftime(DATE_FORMAT)[:-3]
    print_debug(f"Using stored timestamp {timestamp}")
    return timestamp
83+
84+
85+
def create_missing_dir(dir_path):
    """
    Ensure dir_path exists, creating it together with any missing parent directories.
    Nothing is done when the path is already present.
    """
    if os.path.exists(dir_path):
        return
    os.makedirs(dir_path, exist_ok=True)
91+
92+
93+
def fetch_products(host_url, timestamp, page_size=100, timeout=120):
    """
    Fetch the IDs of all products created at or after the given timestamp.

    host_url is the base URL of the DHuS server; a trailing slash is tolerated.
    timestamp is the string placed inside the OData datetime'...' literal.
    page_size is the number of products requested per page ($top).
    timeout is the number of seconds passed to requests.get for each request.

    Returns a list of product ID strings.
    Raises Exception when the server answers with a non-OK status.
    """
    # Build the URL once; strip a trailing slash so that a host given as
    # "https://host/" does not produce "https://host//odata/...".
    url = f"{host_url.rstrip('/')}/odata/v1/Products"
    result = []

    while True:
        params = {
            '$format': 'text/csv',
            '$select': 'Id',
            '$skip': len(result),
            '$top': page_size,
            '$filter': f"CreationDate ge datetime'{timestamp}'"
        }
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get(url, params=params, timeout=timeout)

        if not response.ok:
            raise Exception(f"Request to fetch products file failed with {response.status_code}.\n{response.text}")

        # The first line of the CSV response is the header, not a product ID.
        product_ids = [line.strip() for line in response.text.splitlines()[1:] if line.strip()]
        result.extend(product_ids)

        # A page shorter than requested means there is nothing left to fetch.
        if len(product_ids) < page_size:
            break

    print_debug(f"Fetched {len(result)} products.")
    return result
121+
122+
123+
def load_cached_products(local_dir):
    """
    Load the product IDs stored by the previous run from LIST_FILENAME inside local_dir.

    Returns a list with one ID per entry, without line terminators, or an
    empty list when the file does not exist.
    """
    filepath = os.path.join(local_dir, LIST_FILENAME)
    if not os.path.exists(filepath):
        return []
    with open(filepath, "r") as f:
        # Strip the line endings: IDs that keep their trailing newline never
        # compare equal to freshly fetched IDs. Blank lines are skipped.
        return [line.strip() for line in f if line.strip()]
133+
134+
135+
def store_new_timestamp(local_dir, new_timestamp):
    """
    Replace the content of the timestamp file in local_dir with new_timestamp.
    """
    target = os.path.join(local_dir, TIMESTAMP_FILENAME)
    with open(target, 'w') as out:
        out.write(new_timestamp)
142+
143+
144+
def store_new_list(local_dir, missing_products):
    """
    Replace the content of the product list file in local_dir with the given
    product ids, written one per line.
    """
    target = os.path.join(local_dir, LIST_FILENAME)
    with open(target, 'w') as out:
        out.write("\n".join(missing_products))
151+
152+
153+
def main():
    """
    Script entry point: fetch the IDs of products published since the last run
    (or since --fromTimestamp), drop the IDs already stored by the previous run
    and, unless --dryRun is given, store the new timestamp and the new ID list.

    Raises Exception when the Sentinel host or the local directory is not configured.
    """
    global DEBUG

    args = parse_arguments()
    # Guard against a configuration that parsed to nothing (e.g. an empty file).
    config = read_configuration() or {}

    DEBUG = args.debug
    sentinel_host = args.sentinelHost or config.get("SENTINEL_HOST")
    if not sentinel_host:
        raise Exception("SENTINEL_HOST is not defined and sentinelHost parameter not passed!")
    local_dir = config.get("LOCAL_DIR")
    if not local_dir:
        # Fail with a clear message instead of a TypeError from os.path.join(None, ...).
        raise Exception("LOCAL_DIR is not defined in the configuration file!")

    if args.fromTimestamp:
        # The argument is parsed into a datetime; format it explicitly, because
        # str(datetime) separates date and time with a space instead of "T".
        # Trimmed to 3 fraction digits like the stored timestamp.
        timestamp = args.fromTimestamp.strftime(DATE_FORMAT)[:-3]
    else:
        timestamp = get_timestamp(local_dir)

    # Taken before fetching so products published during the fetch are picked up next run.
    new_timestamp = datetime.now().strftime(DATE_FORMAT)
    fetched_products = fetch_products(sentinel_host, timestamp)
    # Compare IDs without surrounding whitespace or line terminators.
    stored_products = {product.strip() for product in load_cached_products(local_dir)}
    # Sorted so the stored list is deterministic between runs.
    missing_products = sorted(set(fetched_products) - stored_products)
    print_debug(f"There are {len(missing_products)} unprocessed products.")

    if not args.dryRun:
        # The output directory may not exist yet on the first run.
        create_missing_dir(local_dir)
        store_new_timestamp(local_dir, new_timestamp)
        store_new_list(local_dir, missing_products)
175+
176+
177+
# Run the generator only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)