#!/usr/bin/env bash
#
# TESSERA WORLD USB drive inventory and benchmark.
#
# Walks the mount at $MOUNT, writes a plain-text report (directory structure,
# file-type census, per-format file listings, GeoTIFF metadata, and a few
# read/write speed benchmarks) to $OUTFILE, and echoes the same report to
# stdout. Finishes by printing the report's line count.
#
# Requirements: GNU find/coreutils (find -printf, sort -h, du --apparent-size,
# dd status=progress) and the Python interpreter at $PYTHON. rasterio and numpy
# are optional; the steps that need them print an ERROR line and are skipped
# when they cannot be imported.
#
# Exit status: non-zero if the output directory cannot be created or the mount
# cannot be listed; otherwise the status of the final `wc`.

# Deliberately no `set -e`: the report is best-effort, and one failing probe
# must not abort the remaining phases.
set -u
set -o pipefail

MOUNT="/opt/data/TESSERA_WORLD"
OUTDIR="/home/otivm/OTIVM/data"
OUTFILE="$OUTDIR/tessera_world_usb_inventory.txt"
PYTHON="/home/otivm/pipeline-venv/bin/python3"

mkdir -p "$OUTDIR" || {
  echo "ERROR: cannot create output directory $OUTDIR" >&2
  exit 1
}

#######################################
# Print "<human-readable size><TAB><path>" for each regular file under $MOUNT
# that matches the given find predicates.
# Uses find -print0 | xargs -0 | du rather than parsing `ls` output, so paths
# containing spaces are reported intact.
# Globals:   MOUNT (read)
# Arguments: find(1) predicates, e.g. -name "*.nc"
# Outputs:   one line per matching file on stdout
#######################################
list_sized() {
  # -l counts hard-linked files every time they appear; --apparent-size
  # reports file length rather than allocated blocks.
  find "$MOUNT" -type f "$@" -print0 \
    | xargs -0 -r du -h -l --apparent-size --
}

#######################################
# Emit the full inventory and benchmark report on stdout.
# Globals:   MOUNT, OUTFILE, PYTHON (read)
# Returns:   1 if the mount cannot be listed, otherwise the status of the
#            last command.
#######################################
generate_report() {
  local probe probe_existed largest

  echo "TESSERA WORLD USB Drive Inventory and Benchmark"
  echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "Mount: $MOUNT"
  echo ""

  echo "=== PHASE 0: MOUNT CHECK ==="
  echo "--- Step 1: Mount contents ---"
  # Nothing below is meaningful without a readable mount; stop here.
  ls "$MOUNT" || return 1
  echo ""

  echo "--- Step 2: Mount details ---"
  df -h "$MOUNT"
  # Match on the last path component so label-based mount points still match.
  mount | grep -F -- "${MOUNT##*/}" || true
  echo ""

  echo "--- Step 3: Read-only status ---"
  probe="$MOUNT/test_write_attempt"
  probe_existed=0
  if [[ -e "$probe" ]]; then
    probe_existed=1
  fi
  if touch "$probe" 2>&1; then
    echo "WARNING: mount is WRITABLE (write probe succeeded)"
    # Do not leave the probe on the drive, but never delete a file that was
    # already there before this script ran.
    if (( ! probe_existed )); then
      rm -f -- "$probe"
    fi
  else
    echo "CONFIRMED READ-ONLY"
  fi
  echo ""

  echo "=== PHASE 1: TOP-LEVEL STRUCTURE ==="
  echo "--- Step 4: Top-level directory listing ---"
  ls -lah "$MOUNT/"
  echo ""

  echo "--- Step 5: Recursive directory tree, directories only ---"
  find "$MOUNT" -type d | sort
  echo ""

  echo "--- Step 6: File count by directory ---"
  # %h is the directory part of each file's path.
  find "$MOUNT" -type f -printf '%h\n' | sort | uniq -c | sort -rn
  echo ""

  echo "=== PHASE 2: FILE TYPE CENSUS ==="
  echo "--- Step 7: File extensions and counts ---"
  # Work on the basename (%f) so dots in directory names are ignored; files
  # without any dot are grouped under "(none)" instead of their full path.
  find "$MOUNT" -type f -printf '%f\n' \
    | sed -e '/\./!s|.*|(none)|' -e 's|.*\.||' \
    | sort | uniq -c | sort -rn
  echo ""

  echo "--- Step 8: Total size by top-level subdirectory ---"
  # stderr is dropped so an unmatched glob (no subdirectories) stays quiet.
  du -sh "$MOUNT"/*/ 2>/dev/null | sort -rh
  echo ""

  echo "--- Step 9: Total size of mount ---"
  du -sh "$MOUNT/"
  echo ""

  echo "=== PHASE 3: PER-DATASET INVENTORY ==="
  echo "--- Step 10: GeoTIFF files ---"
  list_sized \( -name "*.tif" -o -name "*.tiff" \) | sort -rh
  echo ""

  echo "--- Step 11: HDF5 files ---"
  list_sized \( -name "*.h5" -o -name "*.hdf5" \)
  echo ""

  echo "--- Step 12: NetCDF files ---"
  list_sized -name "*.nc"
  echo ""

  echo "--- Step 13: CSV and TSV files ---"
  list_sized \( -name "*.csv" -o -name "*.tsv" \)
  echo ""

  echo "--- Step 14: SQLite files ---"
  list_sized \( -name "*.sqlite" -o -name "*.sqlite3" -o -name "*.db" \)
  echo ""

  echo "--- Step 15: Shapefile components ---"
  list_sized \
    \( -name "*.shp" -o -name "*.dbf" -o -name "*.shx" -o -name "*.prj" \) \
    | sort
  echo ""

  echo "--- Step 16: Other file types ---"
  # Everything not already listed in Steps 10-15.
  list_sized \
    ! -name "*.tif" ! -name "*.tiff" ! -name "*.h5" ! -name "*.hdf5" \
    ! -name "*.nc" ! -name "*.csv" ! -name "*.tsv" \
    ! -name "*.sqlite" ! -name "*.sqlite3" ! -name "*.db" \
    ! -name "*.shp" ! -name "*.dbf" ! -name "*.shx" ! -name "*.prj" \
    | sort
  echo ""

  echo "=== PHASE 4: GEOTIFF METADATA ==="
  # The mount path is passed as argv[1] so the shell constant stays the single
  # source of truth.
  "$PYTHON" - "$MOUNT" <<'PYEOF'
import glob
import os
import sys

try:
    import rasterio
except Exception as e:
    print(f"ERROR: rasterio unavailable: {e}")
    raise SystemExit(0)

# Escape the root so glob metacharacters in the mount path are taken literally.
root = glob.escape(sys.argv[1])

files = []
for suffix in ("tif", "tiff"):
    files.extend(glob.glob(os.path.join(root, "**", f"*.{suffix}"), recursive=True))
files.sort()

if not files:
    print("No GeoTIFF files found.")
else:
    for f in files:
        try:
            with rasterio.open(f) as ds:
                print("---")
                print(f"FILE: {f}")
                print(f"SIZE: {os.path.getsize(f) / 1e9:.2f} GB")
                print(f"CRS: {ds.crs}")
                print(f"RES: {ds.res}")
                print(f"BOUNDS: {ds.bounds}")
                print(f"SHAPE: {ds.width} x {ds.height} px")
                print(f"BANDS: {ds.count}")
                print(f"DTYPE: {ds.dtypes}")
                print(f"NODATA: {ds.nodata}")
        except Exception as e:
            # One unreadable file must not stop the rest of the survey.
            print(f"ERROR reading {f}: {e}")
PYEOF
  echo ""

  echo "=== PHASE 5: READ SPEED BENCHMARKS ==="
  echo "--- Step 18: Raw sequential read speed of largest GeoTIFF ---"
  # Size in bytes, a TAB, then the path; cut -f2- keeps paths with spaces whole.
  largest=$(find "$MOUNT" -type f \( -name "*.tif" -o -name "*.tiff" \) \
    -printf '%s\t%p\n' | sort -rn | head -n 1 | cut -f2-)
  echo "Benchmarking: $largest"
  if [[ -n "${largest:-}" ]]; then
    # dd reports progress on stderr; fold it into the report and keep the tail.
    dd if="$largest" of=/dev/null bs=1M status=progress 2>&1 | tail -3
  else
    echo "No GeoTIFF found for sequential benchmark."
  fi
  echo ""

  echo "--- Step 19: Random point sample speed, 2401 points from first GeoTIFF ---"
  "$PYTHON" - "$MOUNT" <<'PYEOF'
import glob
import os
import sys
import time

try:
    import rasterio
    import numpy as np
except Exception as e:
    print(f"ERROR: rasterio/numpy unavailable: {e}")
    raise SystemExit(0)

# NOTE(review): 2401 == 7**4; presumably the number of fine cells per "H5 hex"
# referred to in the output below - confirm against the pipeline design.
N_POINTS = 2401

root = glob.escape(sys.argv[1])
files = glob.glob(os.path.join(root, "**", "*.tif"), recursive=True)
files += glob.glob(os.path.join(root, "**", "*.tiff"), recursive=True)

if not files:
    print("No GeoTIFF found for benchmark.")
else:
    f = sorted(files)[0]
    print(f"Benchmarking random point reads from: {f}")
    try:
        with rasterio.open(f) as ds:
            bounds = ds.bounds
            # Fixed seed so repeated runs sample the same points.
            rng = np.random.default_rng(42)
            xs = rng.uniform(bounds.left, bounds.right, N_POINTS)
            ys = rng.uniform(bounds.bottom, bounds.top, N_POINTS)
            coords = list(zip(xs, ys))
            t0 = time.perf_counter()
            # sample() is lazy; list() forces every read inside the timed span.
            list(ds.sample(coords))
            t1 = time.perf_counter()
    except Exception as e:
        print(f"ERROR reading {f}: {e}")
    else:
        elapsed = t1 - t0
        print(f"2401 point samples: {elapsed:.3f}s")
        print(f"Per-point: {elapsed/N_POINTS*1000:.3f}ms")
        print(f"Projected H5 time: {elapsed:.1f}s per H5 hex")
        print(f"Five H5s: {elapsed*5:.1f}s total")
PYEOF
  echo ""

  echo "--- Step 20: SQLite write speed baseline on local disk ---"
  "$PYTHON" - <<'PYEOF'
import os
import sqlite3
import tempfile
import time

N_ROWS = 2401

# A private temporary directory replaces the race-prone tempfile.mktemp() and
# guarantees the database and any WAL sidecar files are removed afterwards.
with tempfile.TemporaryDirectory() as tmpdir:
    con = sqlite3.connect(os.path.join(tmpdir, "bench.sqlite3"))
    try:
        con.execute("PRAGMA journal_mode=WAL")
        con.execute("PRAGMA synchronous=NORMAL")
        con.execute("""
            CREATE TABLE bench (
                id INTEGER PRIMARY KEY,
                h9 INTEGER,
                h7 INTEGER,
                h5 INTEGER,
                lat REAL,
                lon REAL,
                elev_cm INTEGER,
                terrain INTEGER,
                hydro INTEGER,
                geo_dep INTEGER,
                geo_flag INTEGER,
                occ_flag INTEGER,
                status INTEGER,
                run_id INTEGER,
                created_at TEXT
            )
        """)
        rows = [
            (i, i*10, i*100, i*1000, 40.0 + i*0.001, 12.0 + i*0.001,
             100, 1, 0, 255, 0, 0, 1, 1, "2026-04-26T00:00:00Z")
            for i in range(N_ROWS)
        ]
        t0 = time.perf_counter()
        con.executemany("""
            INSERT INTO bench
            (id, h9, h7, h5, lat, lon, elev_cm, terrain, hydro,
             geo_dep, geo_flag, occ_flag, status, run_id, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        """, rows)
        con.commit()
        t1 = time.perf_counter()
        elapsed = t1 - t0
        print(f"2401 row INSERT+COMMIT: {elapsed:.3f}s")
        print(f"Per-row: {elapsed/N_ROWS*1000:.3f}ms")
    finally:
        con.close()
PYEOF
  echo ""

  echo "=== PHASE 6: REPORT LOCATION ==="
  echo "Inventory written to $OUTFILE"
}

generate_report | tee "$OUTFILE"
# The report runs in a pipeline subshell, so its failure has to be picked up
# from PIPESTATUS; otherwise a failed mount check would still exit 0.
report_status=${PIPESTATUS[0]}

echo ""
wc -l "$OUTFILE"

if (( report_status != 0 )); then
  exit "$report_status"
fi