"""
Collection of functions dealing with files and directories on the file system
"""
import errno
import logging
import os
import pandas as pd
from pymarine.utils.misc import (
clear_path,
get_regex_pattern,
get_time_stamp_from_string,
)
MSG_FORMAT = "{:30s} : {}"
logger = logging.getLogger(__name__)
[docs]
def get_path_depth(path_name):
    r"""
    Count the number of directory levels contained in a path.

    Parameters
    ----------
    path_name : str
        Path of a directory or a file. If it points to an existing file, the
        file name is dropped first and only its directory part is counted.

    Returns
    -------
    int
        Number of times the path can be split before an empty string, ``"."``
        or the file system root is left over.

    Examples
    --------
    The examples use Windows style separators and therefore assume a Windows
    platform.

    >>> get_path_depth("C:\Anaconda")
    1
    >>> get_path_depth("C:\Anaconda\share")
    2
    >>> get_path_depth("C:\Anaconda\share\pywafo")
    3
    >>> get_path_depth(".\imaginary\path\subdir\share")
    4
    """
    # os.path.isfile is only True for paths that exist, so no separate
    # existence check is required
    cursor = os.path.dirname(path_name) if os.path.isfile(path_name) else path_name

    level = 0
    while cursor not in ("", "."):
        parent = os.path.split(cursor)[0]
        if parent == cursor:
            # Splitting no longer shortens the path: we arrived at the root
            # of an absolute path (e.g. 'C:\' or '/'), so stop counting
            break
        cursor = parent
        level += 1
    return level
[docs]
def _name_passes_filters(name, has_pattern, has_not_pattern):
    """Return True if *name* matches *has_pattern* and not *has_not_pattern*.

    Both patterns are objects with a ``search`` method or None; a None pattern
    switches the corresponding check off.
    """
    if has_pattern is not None and not has_pattern.search(name):
        return False
    if has_not_pattern is not None and has_not_pattern.search(name):
        return False
    return True


def scan_base_directory(
    walk_dir=".",
    supplied_file_list=None,
    file_has_string_pattern="",
    file_has_not_string_pattern="",
    dir_has_string_pattern="",
    dir_has_not_string_pattern="",
    start_date_time=None,
    end_date_time=None,
    time_zone=None,
    time_stamp_year_first=True,
    time_stamp_day_first=False,
    extension=None,
    max_depth=None,
    sort_file_base_names=False,
):
    """Recursively scan the directory `walk_dir` and get all files underneath obeying
    the search strings and/or date/time ranges

    Parameters
    ----------
    walk_dir : str, optional
        The base directory to start the import. Default = "."
    supplied_file_list: list, optional
        If given, the file system is not scanned at all; the filters are applied
        to this list of file names instead. Default = None
    dir_has_string_pattern : str, optional
        Requires the directory name to have this pattern (Default value = "").
        This selection is only made on the first directory level below the walk_dir.
        When this filter is active, files located directly in `walk_dir` itself are
        skipped
    dir_has_not_string_pattern : str, optional
        Requires the directory name NOT to have this pattern (Default value = "").
        This selection is only made on the first directory level below the walk_dir
    file_has_string_pattern : str, optional
        Requires the file base name (without extension) to have this pattern
        (Default value = "", i.e., matches all)
    file_has_not_string_pattern : str, optional
        Requires the file base name NOT to have this pattern (Default value = "")
    extension : str or None, optional
        Extension of the file to match, including the leading dot (e.g. ".py").
        If None, all extensions match. Default = None
    max_depth : int, optional
        Sets a maximum depth to which the search is carried out. Default = None, which
        does not limit the search depth. Directories deeper than this limit are not
        visited, so for deep file structures setting a limit speeds up the search.
    sort_file_base_names: bool, optional
        If True, sort the resulting file list alphabetically based on the file base
        name. Default = False
    start_date_time: DateTime or str or None, optional
        If given, get the date time from the current file name and only add the files
        with a date/time equal to or larger than the *start_date_time*. A string is
        parsed with the same settings as the file names. Default is None
    end_date_time: DateTime or str or None, optional
        If given, get the date time from the current file name and only add the files
        with a date/time smaller than the *end_date_time*. A string is parsed with
        the same settings as the file names. Default is None
    time_zone: str or None, optional
        If given add this time zone to the file stamp. The start and end time should
        also have a time zone
    time_stamp_year_first: bool, optional
        Passed to the datetime parser. If true, the year is first in the date/time
        string. Default = True
    time_stamp_day_first: bool, optional
        Passed to the datetime parser. If true, the day is first in the date/time
        string. Default = False

    Returns
    -------
    list
        All the file names found below the input directory `walk_dir` obeying all the
        search strings

    Notes
    -----
    Files for which no date/time can be extracted from the name are not rejected by
    the *start_date_time* / *end_date_time* selection.

    Examples
    --------
    Find all the python files under the share directory in the Anaconda installation
    folder

    >>> scan_dir = "C:\\Anaconda\\share"
    >>> file_list = scan_base_directory(scan_dir, extension='.py')

    Find all the python files under the share directory in the Anaconda installation
    folder belonging to the pywafo directory

    >>> file_list = scan_base_directory(scan_dir, extension='.py',
    ...                                 dir_has_string_pattern="wafo")

    Note that wafo matches on the directory 'pywafo', which is the first directory level
    below the scan directory. However, if we would match on '^wafo' the returned list
    would be empty as the directory has to *start* with wafo.

    To get all the files with "test" in the name with a directory depth of at most
    three do:

    >>> file_list = scan_base_directory(scan_dir, extension='.py',
    ...                                 dir_has_string_pattern="wafo",
    ...                                 file_has_string_pattern="test", max_depth=3)

    Test the date/time boundaries. First create a file list from 28 Sep 2017 00:00 to
    03:00 with a 30-minute interval and convert it to a string list

    >>> file_names = ["AMS_{}.mdf".format(dt.strftime("%y%m%dT%H%M%S")) for dt in
    ...    pd.date_range("20170928T000000", "20170928T030000", freq="30min")]
    >>> for file_name in file_names:
    ...     print(file_name)
    AMS_170928T000000.mdf
    AMS_170928T003000.mdf
    AMS_170928T010000.mdf
    AMS_170928T013000.mdf
    AMS_170928T020000.mdf
    AMS_170928T023000.mdf
    AMS_170928T030000.mdf

    Use the scan_base_directory to get the files within a specific date/time range

    >>> file_selection = scan_base_directory(supplied_file_list=file_names,
    ...  start_date_time="20170928T010000", end_date_time="20170928T023000")
    >>> for file_name in file_selection:
    ...     print(file_name)
    AMS_170928T010000.mdf
    AMS_170928T013000.mdf
    AMS_170928T020000.mdf

    Note that the selected range runs from 1 am until 2 am; the end_date_time of
    2.30 am is not included
    """
    # get the regular expression for the has_pattern and has_not_pattern of the files
    # and directories. A value of None means that the filter is not applied
    file_has_string = get_regex_pattern(file_has_string_pattern)
    file_has_not_string = get_regex_pattern(file_has_not_string_pattern)
    dir_has_string = get_regex_pattern(dir_has_string_pattern)
    dir_has_not_string = get_regex_pattern(dir_has_not_string_pattern)
    logger.debug(MSG_FORMAT.format("file_has_string", file_has_string))
    logger.debug(MSG_FORMAT.format("file_has_not_string", file_has_not_string))
    logger.debug(MSG_FORMAT.format("dir_has_string", dir_has_string))
    logger.debug(MSG_FORMAT.format("dir_has_not_string", dir_has_not_string))

    file_list = []
    logger.debug(f"Scanning directory {walk_dir}")

    if supplied_file_list is not None:
        # Filter the supplied names only. Do not touch the file system, so that the
        # result does not depend on walk_dir being an existing directory
        walker = [(".", [], supplied_file_list)]
    else:
        # use os.walk to recursively walk over all the files and directories
        walker = os.walk(walk_dir)

    filter_on_time = start_date_time is not None or end_date_time is not None

    for root, subdirs, files in walker:
        logger.debug(f"root={root} sub={subdirs} files={files}")
        logger.debug(MSG_FORMAT.format("root", root))
        logger.debug(MSG_FORMAT.format("sub dirs", subdirs))
        logger.debug(MSG_FORMAT.format("files", files))

        # get the relative path towards the top directory (walk_dir)
        relative_path = os.path.relpath(root, walk_dir)
        depth = get_path_depth(relative_path)
        top_directory = root == walk_dir

        # Based on the first directory list we make a choice of directories to process
        if top_directory:
            include_dirs = [
                subdir
                for subdir in subdirs
                if _name_passes_filters(subdir, dir_has_string, dir_has_not_string)
            ]
            # Overrule the subdirectory list of os.walk:
            # http://stackoverflow.com/questions/19859840/
            # excluding-directories-in-os-walk
            logger.debug(f"Overruling subdirs with {include_dirs}")
            subdirs[:] = include_dirs

        if max_depth is not None and depth >= max_depth:
            # everything below this directory is deeper than max_depth and would be
            # rejected anyway, so do not descend any further
            subdirs[:] = []

        if dir_has_string is not None and top_directory:
            # in case we have specified a directory name with a string search,
            # exclude the files of the top directory itself
            continue
        if max_depth is not None and depth > max_depth:
            continue

        for filename in files:
            filebase, ext = os.path.splitext(filename)
            if extension is not None and extension != ext:
                continue

            # A None pattern is never used to reject: the file is only dropped if
            # the has pattern is missing from, or the has_not pattern is present in,
            # the file base name
            if not _name_passes_filters(
                filebase, file_has_string, file_has_not_string
            ):
                continue

            if filter_on_time:
                # We have supplied a start time or an end time. See if we can get a
                # date time from the file name
                file_time_stamp = get_time_stamp_from_string(
                    string_with_date_time=filebase,
                    yearfirst=time_stamp_year_first,
                    dayfirst=time_stamp_day_first,
                    timezone=time_zone,
                )
                if file_time_stamp is not None:
                    in_range = True
                    if start_date_time is not None:
                        if isinstance(start_date_time, str):
                            # the start time was supplied as a string: convert it
                            # once and reuse the result for all following files
                            start_date_time = get_time_stamp_from_string(
                                string_with_date_time=start_date_time,
                                yearfirst=time_stamp_year_first,
                                dayfirst=time_stamp_day_first,
                                timezone=time_zone,
                            )
                        if file_time_stamp < start_date_time:
                            # the file time stamp is before the start
                            in_range = False
                    if end_date_time is not None:
                        if isinstance(end_date_time, str):
                            end_date_time = get_time_stamp_from_string(
                                string_with_date_time=end_date_time,
                                yearfirst=time_stamp_year_first,
                                dayfirst=time_stamp_day_first,
                                timezone=time_zone,
                            )
                        if file_time_stamp >= end_date_time:
                            # the end of the range is exclusive
                            in_range = False
                    if not in_range:
                        continue

            # create the full file name relative to the selected scan directory
            file_name_to_add = os.path.join(walk_dir, relative_path, filebase)
            logger.debug(f"Adding file {filebase}")
            file_list.append(clear_path(file_name_to_add + ext))

    # Sort on the file name. First split the file base from the path, because if the
    # files are in different directories, the first file is not necessarily the oldest.
    # The sort is stable, so files with equal base names keep their scan order
    if sort_file_base_names:
        file_list.sort(key=lambda file_name: os.path.split(file_name)[1])
    return file_list
[docs]
def make_directory(directory):
    """Create a directory, including missing parents, unless it is already there.

    Parameters
    ----------
    directory : str
        Name of the directory to create

    Notes
    -----
    The directory is created without checking first if it exists. An already
    existing directory is not treated as an error; the function just returns.

    Raises
    ------
    OSError
        Re-raised when the creation failed for any reason other than the directory
        already being present, i.e. when the error is not an `EEXIST` error or when
        the existing path is not a directory. Other possible causes are, for
        instance, a full file system or missing write permission.
    """
    try:
        os.makedirs(directory)
    except OSError as exc:
        # Only an EEXIST error on something that really is a directory is harmless
        already_present = exc.errno == errno.EEXIST and os.path.isdir(directory)
        if not already_present:
            # something else was wrong: report it and pass the error on
            logger.warning(
                "Failed to create the directory {} because raised:\n{}".format(
                    directory, exc
                )
            )
            raise
        logger.debug(
            "Directory {} already exists. No problem, we just continue".format(
                directory
            )
        )
    else:
        logger.debug(f"Created directory : {directory}")