Coverage for pds_crawler/load/strategy.py: 91%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2# pds-crawler - ETL to index PDS data to pdssp
3# Copyright (C) 2023 - CNES (Jean-Christophe Malapert for Pôle Surfaces Planétaires)
4# This file is part of pds-crawler <https://github.com/pdssp/pds_crawler>
5# SPDX-License-Identifier: LGPL-3.0-or-later
6"""
7Module Name:
8 strategy
10Description:
11 pds_to_stac provides a storage strategy for data whose "id" contains ":" and that avoids
12 placing a large number of items in a single directory.
14Classes:
15 LargeDataVolumeStrategy:
16 Specific strategy for organizing the STAC catalogs and items.
18.. uml::
20 class LargeDataVolumeStrategy {
21 - _remove_filename_if_needed(parent_dir: str, filename: str) -> str
22 - _fix_parent_directory(parent_dir: str) -> str
23 - _hash_storage(key) -> str
24 + get_strategy() -> CustomLayoutStrategy
25 }
26 class CustomLayoutStrategy {
27 + catalog_func
28 + collection_func
29 + item_func
30 + __init__(catalog_func, collection_func, item_func)
31 }
32 LargeDataVolumeStrategy --> CustomLayoutStrategy
34Author:
35 Jean-Christophe Malapert
36"""
37import os
38from typing import Callable
39from typing import List
41import pystac
42from pystac.layout import CustomLayoutStrategy
43from pystac.utils import join_path_or_url
44from pystac.utils import JoinType
class LargeDataVolumeStrategy:
    """Custom layout strategy for organizing the STAC catalogs and items for
    large collections.

    Catalog and collection directories are named after the last
    ":"-separated segment of their id, and items are spread over a fixed
    number of numbered sub-directories so that no single directory ends up
    holding every item of a collection.
    """

    # Default number of bucket directories used to spread the items.
    DEFAULT_NUM_DIRS = 1000

    def __init__(self, num_dirs: int = DEFAULT_NUM_DIRS) -> None:
        """Creates the strategy.

        Args:
            num_dirs (int, optional): Number of bucket directories the items
                are distributed over. Defaults to 1000.

        Raises:
            ValueError: If ``num_dirs`` is lower than 1.
        """
        if num_dirs < 1:
            raise ValueError("num_dirs must be a positive integer")
        self._num_dirs = num_dirs

    def _remove_filename_if_needed(
        self, parent_dir: str, filename: str
    ) -> str:
        """Removes every occurrence of ``filename`` from ``parent_dir``.

        Args:
            parent_dir (str): Directory (or href) that may contain the filename
            filename (str): Filename to strip, e.g. "catalog.json"

        Returns:
            str: ``parent_dir`` without ``filename``
        """
        # str.replace is a no-op when the filename is absent.
        return parent_dir.replace(filename, "")

    def _fix_parent_directory(self, parent_dir: str) -> str:
        """Shortens the last path segment of a directory containing "urn:".

        When "urn:" appears in ``parent_dir``, one trailing "/" is dropped and
        the last path segment is reduced to the text after its last ":".
        Otherwise ``parent_dir`` is returned unchanged.

        Args:
            parent_dir (str): Directory to fix

        Returns:
            str: The fixed directory
        """
        if "urn:" not in parent_dir:
            return parent_dir

        if parent_dir.endswith("/"):
            parent_dir = parent_dir[:-1]
        *base, last_segment = parent_dir.split("/")
        directory = last_segment.split(":")[-1]
        return os.path.join("/".join(base), directory)

    def _hash_storage(self, key) -> str:
        """Computes the bucket directory name for an item identifier.

        The built-in ``hash()`` is salted per interpreter process for strings,
        so it would send the same item to a different directory on every run.
        A CRC-32 of the UTF-8 encoded key is used instead: it is stable across
        processes and platforms.

        Args:
            key: Item identifier (converted with ``str``)

        Returns:
            str: Index of the bucket directory, between 0 and num_dirs - 1
        """
        import zlib  # local import: keeps this block self-contained

        checksum = zlib.crc32(str(key).encode("utf-8"))
        return str(checksum % self._num_dirs)

    def _container_path(
        self, stac_id, parent_dir: str, is_root: bool, filename: str
    ) -> str:
        """Builds the href of a catalog or collection file.

        Args:
            stac_id: Identifier of the catalog or collection
            parent_dir (str): Parent directory given by pystac
            is_root (bool): True when the object is the root of the tree
            filename (str): "catalog.json" or "collection.json"

        Returns:
            str: The href of the file
        """
        parent_dir = self._remove_filename_if_needed(parent_dir, filename)
        if is_root:
            # need to fix the parent directory when root
            parent_dir = self._fix_parent_directory(parent_dir)
            return join_path_or_url(JoinType.URL, parent_dir, filename)

        # Non-root objects get their own directory, named after the last
        # ":"-separated segment of their id.
        new_id = str(stac_id).split(":")[-1]
        return join_path_or_url(JoinType.URL, parent_dir, new_id, filename)

    def get_strategy(self) -> "CustomLayoutStrategy":
        """Creates a strategy to define the directory names in the STAC catalog
        and its children.

        Returns:
            CustomLayoutStrategy: A custom strategy for the name of the directories
        """

        def catalog_func(
            cat: pystac.Catalog, parent_dir: str, is_root: bool
        ) -> str:
            return self._container_path(
                cat.id, parent_dir, is_root, "catalog.json"
            )

        def collection_func(
            col: pystac.Collection, parent_dir: str, is_root: bool
        ) -> str:
            return self._container_path(
                col.id, parent_dir, is_root, "collection.json"
            )

        def item_func(item: pystac.Item, parent_dir: str) -> str:
            # <parent_dir>/<bucket index>/<item id>.json
            dir_index: str = self._hash_storage(item.id)
            return join_path_or_url(
                JoinType.URL,
                parent_dir,
                dir_index,
                ".".join((str(item.id), "json")),
            )

        return CustomLayoutStrategy(
            catalog_func=catalog_func,
            collection_func=collection_func,
            item_func=item_func,
        )