Source code for pymarine.utils.file_and_directory

"""
Collection of functions dealing with files and directories on the file system
"""

import errno
import logging
import os

import pandas as pd

from pymarine.utils.misc import (
    clear_path,
    get_regex_pattern,
    get_time_stamp_from_string,
)

# Template for the debug messages in this module: a label, left-aligned and padded
# to 30 characters, followed by " : " and the value.
MSG_FORMAT = "{:30s} : {}"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def get_path_depth(path_name):
    r"""
    Count the number of directory levels in a path or file name

    Parameters
    ----------
    path_name : str
        Path name to get the depth from. If it refers to an existing file, the
        file name itself is not counted; only its directory part is.

    Returns
    -------
    int
        Number of times the last component can be stripped from the path before
        an empty path, the current directory "." or the file system root is
        reached.

    Examples
    --------

    The examples below use Windows style path names

    >>> get_path_depth("C:\Anaconda")
    1
    >>> get_path_depth("C:\Anaconda\share")
    2
    >>> get_path_depth("C:\Anaconda\share\pywafo")
    3
    >>> get_path_depth(".\imaginary\path\subdir\share")
    4
    """

    # os.path.isfile is only True for an existing regular file, so no separate
    # existence check is required
    if os.path.isfile(path_name):
        remaining = os.path.dirname(path_name)
    else:
        remaining = path_name

    depth = 0
    while remaining not in ("", "."):
        parent = os.path.dirname(remaining)
        if parent == remaining:
            # Stripping a component from the root of a full path name (e.g. 'C:\')
            # returns the root itself: we can not go up any further
            break
        remaining = parent
        depth += 1

    return depth
def scan_base_directory(
    walk_dir=".",
    supplied_file_list=None,
    file_has_string_pattern="",
    file_has_not_string_pattern="",
    dir_has_string_pattern="",
    dir_has_not_string_pattern="",
    start_date_time=None,
    end_date_time=None,
    time_zone=None,
    time_stamp_year_first=True,
    time_stamp_day_first=False,
    extension=None,
    max_depth=None,
    sort_file_base_names=False,
):
    """Recursively scan the directory `walk_dir` and get all files underneath obeying
    the search strings and/or date/time ranges

    Parameters
    ----------
    walk_dir : str, optional
        The base directory to start the import. Default = "."
    supplied_file_list: list, optional
        In case walk dir is not given we can explicitly pass a file list to analyze.
        The names in this list are filtered instead of the files found on disk and no
        subdirectories are visited. Note that `walk_dir` still has to be an existing
        directory (the default "." is), as the list is processed in the first step of
        the directory walk. Default = None
    dir_has_string_pattern : str, optional
        Requires the directory name to have this pattern (Default value = "").
        This selection is only made on the first directory level below the walk_dir.
        If a valid pattern is given, the files located directly in `walk_dir` are
        excluded
    dir_has_not_string_pattern : str, optional
        Requires the directory name NOT to have this pattern (Default value = "").
        This selection is only made on the first directory level below the walk_dir
    file_has_string_pattern : str, optional
        Requires the file base name (without extension) to have this pattern
        (Default value = "", i.e., matches all)
    file_has_not_string_pattern : str, optional
        Requires the file base name (without extension) NOT to have this pattern
        (Default value = "")
    extension : str or None, optional
        Extension of the file to match, including the leading dot (e.g. ".py").
        If None, also matches. Default = None
    max_depth : int, optional
        Sets a maximum depth to which the search is carried out. Default = None, which
        does not limit the search depth. For deep file structures setting a limit to
        the search depth speeds up the search, as directories below this depth are not
        visited.
    sort_file_base_names: bool, optional
        If True, sort the resulting file list alphabetically based on the file base
        name. Default = False
    start_date_time: DateTime or str or None, optional
        If given, get the date time from the current file name and only add the files
        with a date/time equal to or larger than the *start_date_time*. A string is
        converted to a time stamp first. Files without a recognisable date/time in
        their name are not filtered. Default is None
    end_date_time: DateTime or str or None, optional
        If given, get the date time from the current file name and only add the files
        with a date/time smaller than the *end_date_time*. A string is converted to a
        time stamp first. Files without a recognisable date/time in their name are not
        filtered. Default is None
    time_zone:str or None, optional
        If given add this time zone to the file stamp. The start and end time should
        also have a time zone
    time_stamp_year_first: bool, optional
        Passed to the datetime parser. If true, the year is first in the date/time
        string. Default = True
    time_stamp_day_first: bool, optional
        Passed to the datetime parser. If true, the day is first in the date/time
        string. Default = False

    Returns
    -------
    list
        All the file names found below the input directory `walk_dir` obeying all the
        search strings

    Examples
    --------

    Find all the python files under the share directory in the Anaconda installation
    folder

    >>> scan_dir = "C:\\Anaconda\\share"
    >>> file_list = scan_base_directory(scan_dir, extension='.py')

    Find all the python files under the share directory in the Anaconda installation
    folder belonging to the pywafo directory

    >>> file_list = scan_base_directory(scan_dir, extension='.py',
    ...                                 dir_has_string_pattern="wafo")

    Note that wafo matches on the directory 'pywafo', which is the first directory level
    below the scan directory. However, if we would match on '^wafo' the returned list
    would be empty as the directory has to *start* with wafo.

    To get all the files with "test" in the name with a directory depth smaller than
    three do:

    >>> file_list = scan_base_directory(scan_dir, extension='.py',
    ...                                 dir_has_string_pattern="wafo",
    ...                                 file_has_string_pattern="test", max_depth=3)

    Test the date/time boundaries. First create a file list from 28 sep 2017 00:00 to
    03:00 with a 30 minute interval and convert it to a string list

    >>> file_names = ["AMS_{}.mdf".format(dt.strftime("%y%m%dT%H%M%S")) for dt in
    ...    pd.date_range("20170928T000000", "20170928T030000", freq="30min")]
    >>> for file_name in file_names:
    ...     print(file_name)
    AMS_170928T000000.mdf
    AMS_170928T003000.mdf
    AMS_170928T010000.mdf
    AMS_170928T013000.mdf
    AMS_170928T020000.mdf
    AMS_170928T023000.mdf
    AMS_170928T030000.mdf

    Use the scan_base_directory to get the files within a specific date/time range

    >>> file_selection = scan_base_directory(supplied_file_list=file_names,
    ...  start_date_time="20170928T010000", end_date_time="20170928T023000")
    >>> for file_name in file_selection:
    ...     print(file_name)
    AMS_170928T010000.mdf
    AMS_170928T013000.mdf
    AMS_170928T020000.mdf

    Note that the selected range runs from 1 am up to and including 2 am; the
    end_date_time of 2.30 am is not included

    """

    # get the regular expression for the has_pattern and has_not_pattern of the files
    # and directories. A pattern is None in case the search string was empty or invalid
    file_has_string = get_regex_pattern(file_has_string_pattern)
    file_has_not_string = get_regex_pattern(file_has_not_string_pattern)
    dir_has_string = get_regex_pattern(dir_has_string_pattern)
    dir_has_not_string = get_regex_pattern(dir_has_not_string_pattern)
    logger.debug(MSG_FORMAT.format("file_has_string", file_has_string))
    logger.debug(MSG_FORMAT.format("file_has_not_string", file_has_not_string))
    logger.debug(MSG_FORMAT.format("dir_has_string", dir_has_string))
    logger.debug(MSG_FORMAT.format("dir_has_not_string", dir_has_not_string))

    # use os.walk to recursively walk over all the file and directories
    file_list = []
    logger.debug(f"Scanning directory {walk_dir}")
    for root, subdirs, files in os.walk(walk_dir):
        if supplied_file_list is not None:
            # a file list was passed explicitly: filter those names instead of the
            # files on disk and stop the walk after this first round
            root = "."
            subdirs[:] = []
            files = supplied_file_list

        logger.debug(f"root={root} sub={subdirs} files={files}")
        logger.debug(MSG_FORMAT.format("root", root))
        logger.debug(MSG_FORMAT.format("sub dirs", subdirs))
        logger.debug(MSG_FORMAT.format("files", files))

        # get the relative path towards the top directory (walk_dir)
        relative_path = os.path.relpath(root, walk_dir)
        depth = get_path_depth(relative_path)

        top_directory = root == walk_dir

        # Based on the first directory list we are going to make a choice of
        # directories to process
        if top_directory:
            include_dirs = [
                subdir
                for subdir in subdirs
                if (dir_has_string is None or dir_has_string.search(subdir))
                and (
                    dir_has_not_string is None
                    or not dir_has_not_string.search(subdir)
                )
            ]
            # Overrule the subdirectory list of os.walk:
            # http://stackoverflow.com/questions/19859840/
            # excluding-directories-in-os-walk
            logger.debug(f"Overruling subdirs with {include_dirs}")
            subdirs[:] = include_dirs

        if max_depth is not None and depth >= max_depth:
            # Files below this level have a depth larger than max_depth and are
            # never added, so there is no need to descend any further
            subdirs[:] = []

        for filename in files:
            filebase, ext = os.path.splitext(filename)

            if extension is not None and extension != ext:
                continue

            # If has_string is None, the search pattern was either empty or invalid
            # (which happens during typing the regex in the edit_box). In this case,
            # always add the file. If not None, only add the file if the search
            # pattern is in the file name
            add_file = file_has_string is None or bool(
                file_has_string.search(filebase)
            )

            # Do not add the file in case the has_not pattern has been set and the
            # file name contains this pattern
            if (
                add_file
                and file_has_not_string is not None
                and file_has_not_string.search(filebase)
            ):
                add_file = False

            if add_file and (start_date_time is not None or end_date_time is not None):
                # We have supplied a start time or an end time. See if we can get a
                # date time from the file name
                file_time_stamp = get_time_stamp_from_string(
                    string_with_date_time=filebase,
                    yearfirst=time_stamp_year_first,
                    dayfirst=time_stamp_day_first,
                    timezone=time_zone,
                )

                if file_time_stamp is not None:
                    # we found a file time stamp. Compare it with the start time
                    if start_date_time is not None:
                        if isinstance(start_date_time, str):
                            # The start time was supplied as a string. Convert it
                            # once; the next files reuse the converted value
                            start_date_time = get_time_stamp_from_string(
                                string_with_date_time=start_date_time,
                                yearfirst=time_stamp_year_first,
                                dayfirst=time_stamp_day_first,
                                timezone=time_zone,
                            )
                        if file_time_stamp < start_date_time:
                            # the file time stamp is smaller, so don't add it
                            add_file = False

                    # if an end time is supplied, also compare it with the end time
                    if end_date_time is not None:
                        if isinstance(end_date_time, str):
                            end_date_time = get_time_stamp_from_string(
                                string_with_date_time=end_date_time,
                                yearfirst=time_stamp_year_first,
                                dayfirst=time_stamp_day_first,
                                timezone=time_zone,
                            )
                        if file_time_stamp >= end_date_time:
                            # the file time stamp is equal or larger, so don't add it
                            add_file = False

            if dir_has_string is not None and top_directory:
                # in case we have specified a directory name with a string search,
                # exclude the files of the top directory
                add_file = False

            if max_depth is not None and depth > max_depth:
                add_file = False

            if add_file:
                # create the full file name relative to the selected scan directory
                file_name_to_add = os.path.join(walk_dir, relative_path, filebase)
                logger.debug(f"Adding file {filebase}")
                file_list.append(clear_path(file_name_to_add + ext))

    # Sort on the file name. Only the file base is used as sort key, because if the
    # files are in different directories, the first file is not necessarily the oldest.
    # sorted() always returns a list, which keeps the return type the same as for the
    # unsorted case
    if sort_file_base_names:
        file_list = sorted(file_list, key=os.path.basename)

    return file_list
def make_directory(directory):
    """Create a directory, silently accepting that it may already be there.

    Parameters
    ----------
    directory : str
        Name of the directory to create. Missing parent directories are created as
        well

    Notes
    -----
    The directory is created without checking first whether it exists. If the
    creation fails because the directory is already present, this is only reported
    at debug level and the function returns normally.

    Raises
    ------
    OSError
        Re-raised for every failure other than `EEXIST` on an existing directory,
        i.e., when the directory could not be created for another reason than it
        already being present. It could be that the file system is full or that we
        do not have write permission. A warning is logged before re-raising.
    """

    try:
        os.makedirs(directory)
    except OSError as exc:
        # an OSError was raised: find out whether it is the harmless case of a
        # directory that is already present
        already_present = exc.errno == errno.EEXIST and os.path.isdir(directory)
        if not already_present:
            # something else was wrong. Report it and pass the error on
            logger.warning(
                "Failed to create the directory {} because raised:\n{}".format(
                    directory, exc
                )
            )
            raise
        # the output directory already exists, that is ok so just continue
        logger.debug(
            "Directory {} already exists. No problem, we just continue".format(
                directory
            )
        )
    else:
        logger.debug(f"Created directory : {directory}")