Checking if two files are logically equal
For some use cases it is useful to be able to determine whether two files are logically equal, i.e. they have the same extension and the same contents, even though their metadata may differ.
One approach to checking whether two files are logically equal in Python, with support for ignoring metadata in CSV and Parquet files, is given here and shown below.
import filecmp
from pathlib import Path
from typing import TypeAlias
import polars as pl
# A file location, accepted either as a string or as a pathlib.Path object.
FilePath: TypeAlias = str | Path
def ensure_existing_file_path(file_path: FilePath) -> Path:
    """Return ``file_path`` as a ``Path`` after checking it is an existing file.

    Args:
        file_path: Location of the file, as a string or a ``Path``.

    Returns:
        The location as a ``Path``; a ``Path`` argument is returned unchanged.

    Raises:
        FileNotFoundError: If the location does not exist or is not a file
            (for example, it is a directory).
    """
    # Only strings need converting; anything else is used as given.
    path = file_path if not isinstance(file_path, str) else Path(file_path)
    if path.exists() and path.is_file():
        return path
    raise FileNotFoundError(f"{path} should point to an existing file and it does not.")
def files_are_logically_equal(file1: FilePath, file2: FilePath) -> bool:
path1 = ensure_existing_file_path(file1)
path2 = ensure_existing_file_path(file2)
if path1.samefile(path2):
return True
if path1.stat().st_size == 0 and path2.stat().st_size == 0:
return True
ext1 = path1.suffix.lower()
ext2 = path2.suffix.lower()
if ext1 != ext2:
return False
match ext1:
case '.csv':
return pl.scan_csv(path1).collect(engine="streaming").equals(pl.scan_csv(path2).collect(engine="streaming"))
case '.parquet':
return pl.scan_parquet(path1).collect(engine="streaming").equals(pl.scan_parquet(path2).collect(engine="streaming"))
case _:
return filecmp.cmp(path1, path2, shallow=False)