Coverage for pds_crawler/load/strategy.py: 91%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

57 statements  

1# -*- coding: utf-8 -*- 

2# pds-crawler - ETL to index PDS data to pdssp 

3# Copyright (C) 2023 - CNES (Jean-Christophe Malapert for Pôle Surfaces Planétaires) 

4# This file is part of pds-crawler <https://github.com/pdssp/pds_crawler> 

5# SPDX-License-Identifier: LGPL-3.0-or-later 

6""" 

7Module Name: 

8 strategy 

9 

10Description: 

11 the pds_to_stac provides a storage strategy for storing data whose "id" contains ":" and for avoiding 

12 a large number of items in one single directory. 

13 

14Classes: 

15 LargeDataVolumeStrategy: 

16 Specific strategy for organizing the STAC catalogs and items. 

17 

18.. uml:: 

19 

20 class LargeDataVolumeStrategy { 

21 - _remove_filename_if_needed(parent_dir: str, filename: str) -> str 

22 - _fix_parent_directory(parent_dir: str) -> str 

23 - _hash_storage(key) -> str 

24 + get_strategy() -> CustomLayoutStrategy 

25 } 

26 class CustomLayoutStrategy { 

27 + catalog_func 

28 + collection_func 

29 + item_func 

30 + __init__(catalog_func, collection_func, item_func) 

31 } 

32 LargeDataVolumeStrategy --> CustomLayoutStrategy 

33 

34Author: 

35 Jean-Christophe Malapert 

36""" 

37import os 

38from typing import Callable 

39from typing import List 

40 

41import pystac 

42from pystac.layout import CustomLayoutStrategy 

43from pystac.utils import join_path_or_url 

44from pystac.utils import JoinType 

45 

46 

class LargeDataVolumeStrategy:
    """Custom layout strategy for organizing the STAC catalogs and items for large items
    in a collection.

    Catalog and collection directories are named after the last
    ":"-separated part of their id, and items are spread over a fixed
    number of numbered sub-directories of their parent directory.
    """

    # Number of sub-directories the items of one parent are spread over.
    _NUM_DIRS = 1000

    def __init__(self) -> None:
        pass

    def _remove_filename_if_needed(
        self, parent_dir: str, filename: str
    ) -> str:
        """Strip a trailing ``filename`` from ``parent_dir``.

        Only a trailing occurrence is removed, so a directory whose name
        merely contains ``filename`` is left untouched.

        Args:
            parent_dir (str): directory path, possibly ending with ``filename``
            filename (str): file name to strip, e.g. "catalog.json"

        Returns:
            str: ``parent_dir`` without the trailing ``filename``
        """
        if filename and parent_dir.endswith(filename):
            parent_dir = parent_dir[: -len(filename)]
        return parent_dir

    def _fix_parent_directory(self, parent_dir: str) -> str:
        """Shorten the last path component to the text after its last ":".

        The path is only rewritten when it contains "urn:"; otherwise it is
        returned unchanged. One trailing "/" is dropped before rewriting.

        Args:
            parent_dir (str): directory path to fix

        Returns:
            str: the fixed directory path
        """
        if "urn:" not in parent_dir:
            return parent_dir
        if parent_dir.endswith("/"):
            parent_dir = parent_dir[:-1]
        # Rebuild with "/" explicitly: the result is later joined as a URL,
        # and keeping the separator found by rpartition preserves a leading
        # "/" when the directory sits directly under the root.
        head, sep, last = parent_dir.rpartition("/")
        directory = last.split(":")[-1]
        return f"{head}{sep}{directory}"

    def _hash_storage(self, key) -> str:
        """Map ``key`` to the name of one of the numbered sub-directories.

        A CRC-32 checksum is used instead of the builtin ``hash()`` because
        string hashes are salted per interpreter process, which would send
        the same key to a different directory on every run.

        Args:
            key: identifier to distribute; it is converted with ``str()``

        Returns:
            str: directory index in ``[0, _NUM_DIRS)`` as a string
        """
        # Local import: keeps this fix self-contained in the class.
        import zlib

        checksum = zlib.crc32(str(key).encode("utf-8"))
        return str(checksum % self._NUM_DIRS)

    def _container_href(
        self, container_id, parent_dir: str, is_root: bool, filename: str
    ) -> str:
        """Build the href of a catalog or collection file.

        Args:
            container_id: id of the catalog or collection
            parent_dir (str): directory proposed for the parent
            is_root (bool): True when the object is the root of the tree
            filename (str): "catalog.json" or "collection.json"

        Returns:
            str: href of the file to write
        """
        parent_dir = self._remove_filename_if_needed(parent_dir, filename)
        if is_root:
            # need to fix the parent_directory when root
            parent_dir = self._fix_parent_directory(parent_dir)
            return join_path_or_url(JoinType.URL, parent_dir, filename)
        # Non-root objects get their own directory, named after the last
        # ":"-separated part of their id.
        new_id = str(container_id).split(":")[-1]
        return join_path_or_url(JoinType.URL, parent_dir, new_id, filename)

    def get_strategy(self) -> "CustomLayoutStrategy":
        """Creates a strategy to define the directories name in STAC catalog and children

        Returns:
            CustomLayoutStrategy: A custom strategy for the name of the directories
        """

        def catalog_func(
            cat: pystac.Catalog, parent_dir: str, is_root: bool
        ) -> str:
            return self._container_href(
                cat.id, parent_dir, is_root, "catalog.json"
            )

        def collection_func(
            col: pystac.Collection, parent_dir: str, is_root: bool
        ) -> str:
            return self._container_href(
                col.id, parent_dir, is_root, "collection.json"
            )

        def item_func(item: pystac.Item, parent_dir: str) -> str:
            # <parent_dir>/<bucket>/<item id>.json
            dir_index: str = self._hash_storage(item.id)
            return join_path_or_url(
                JoinType.URL,
                parent_dir,
                dir_index,
                ".".join((str(item.id), "json")),
            )

        return CustomLayoutStrategy(
            catalog_func=catalog_func,
            collection_func=collection_func,
            item_func=item_func,
        )