#!/usr/bin/env bash
# Inventory and read-speed benchmark of the TESSERA WORLD data mount.
# The report is streamed to the terminal and saved to $OUTFILE.

# -u: referencing an unset variable is an error.
# pipefail: a pipeline reports failure if any of its stages fails.
set -uo pipefail

# Fixed locations used throughout the script.
MOUNT="/opt/data/TESSERA_WORLD"                # data mount being inventoried
OUTDIR="/home/otivm/OTIVM/data"                # directory receiving the report
OUTFILE="${OUTDIR}/tessera_world_usb_inventory.txt"
PYTHON="/home/otivm/pipeline-venv/bin/python3" # interpreter for embedded Python

# Make sure the report directory exists before anything is written to it.
mkdir -p "$OUTDIR"

# Everything inside this brace group is the report body; the group's stdout
# is piped to tee at the closing brace.
{
echo "TESSERA WORLD USB Drive Inventory and Benchmark"
echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "Mount: $MOUNT"
echo ""

echo "=== PHASE 0: MOUNT CHECK ==="
echo "--- Step 1: Mount contents ---"
# Stop producing the report if the mount point cannot be listed.
ls "$MOUNT" || exit 1
echo ""

echo "--- Step 2: Mount details ---"
df -h "$MOUNT"
# grep exits non-zero when nothing matches; that is not an error here.
mount | grep TESSERA_WORLD || true
echo ""

echo "--- Step 3: Read-only status ---"
# Probe writability with a PID-suffixed name so an existing file on the drive
# is never touched, and remove the probe again if the write succeeds so the
# inventory itself leaves nothing behind on the drive.
write_probe="$MOUNT/.write_probe.$$"
if touch "$write_probe" 2>&1; then
  rm -f -- "$write_probe"
  echo "WARNING: MOUNT IS WRITABLE (write probe created and removed)"
else
  echo "CONFIRMED READ-ONLY"
fi
echo ""

# Phase 1: directory layout of the mount.
echo "=== PHASE 1: TOP-LEVEL STRUCTURE ==="
echo "--- Step 4: Top-level directory listing ---"
ls -l -a -h "$MOUNT/"
echo

echo "--- Step 5: Recursive directory tree, directories only ---"
find "$MOUNT" -type d \
  | sort
echo

echo "--- Step 6: File count by directory ---"
# Strip the last path component so each file is reduced to its directory,
# then count identical directories and list the busiest first.
find "$MOUNT" -type f \
  | sed -e 's|/[^/]*$||' \
  | sort \
  | uniq -c \
  | sort -r -n
echo

# Phase 2: what kinds of files are on the mount and how much space they use.
echo "=== PHASE 2: FILE TYPE CENSUS ==="
echo "--- Step 7: File extensions and counts ---"
# Work on the file name only: drop the directory part first so dots in
# directory names cannot be mistaken for an extension separator, group names
# without any dot under "(no extension)", then keep the text after the last
# dot.
find "$MOUNT" -type f \
  | sed -e 's|.*/||' -e '/\./!s|.*|(no extension)|' -e 's|.*\.||' \
  | sort | uniq -c | sort -rn
echo ""

echo "--- Step 8: Total size by top-level subdirectory ---"
# du errors (for example when the glob matches no subdirectory) are silenced.
du -sh "$MOUNT"/*/ 2>/dev/null | sort -rh
echo ""

echo "--- Step 9: Total size of mount ---"
du -sh "$MOUNT/"
echo ""

# Phase 3: per-format file listings with sizes.
echo "=== PHASE 3: PER-DATASET INVENTORY ==="

#######################################
# Print "<human-readable size><TAB><path>" for each regular file under $MOUNT
# that satisfies the given find(1) tests. Sizes come straight from find, so
# paths containing whitespace are reported intact.
# Globals:   MOUNT (read)
# Arguments: find(1) tests; alternatives joined with -o must be wrapped in
#            \( ... \) by the caller
# Outputs:   one line per matching file on stdout
#######################################
list_sized_files() {
  find "$MOUNT" -type f "$@" -printf '%s\t%p\n' \
    | numfmt --delimiter=$'\t' --field=1 --to=iec
}

echo "--- Step 10: GeoTIFF files ---"
list_sized_files \( -name "*.tif" -o -name "*.tiff" \) | sort -rh
echo ""

echo "--- Step 11: HDF5 files ---"
list_sized_files \( -name "*.h5" -o -name "*.hdf5" \)
echo ""

echo "--- Step 12: NetCDF files ---"
list_sized_files -name "*.nc"
echo ""

echo "--- Step 13: CSV and TSV files ---"
list_sized_files \( -name "*.csv" -o -name "*.tsv" \)
echo ""

echo "--- Step 14: SQLite files ---"
list_sized_files \( -name "*.sqlite" -o -name "*.sqlite3" -o -name "*.db" \)
echo ""

echo "--- Step 15: Shapefile components ---"
list_sized_files \
  \( -name "*.shp" -o -name "*.dbf" -o -name "*.shx" -o -name "*.prj" \) \
  | sort
echo ""

echo "--- Step 16: Other file types ---"
# Everything that none of the listings above matched.
list_sized_files \
  ! -name "*.tif" ! -name "*.tiff" ! -name "*.h5" ! -name "*.hdf5" \
  ! -name "*.nc" ! -name "*.csv" ! -name "*.tsv" \
  ! -name "*.sqlite" ! -name "*.sqlite3" ! -name "*.db" \
  ! -name "*.shp" ! -name "*.dbf" ! -name "*.shx" ! -name "*.prj" \
  | sort
echo ""

# Phase 4: per-file raster metadata, read with rasterio.
echo "=== PHASE 4: GEOTIFF METADATA ==="
# The mount point is handed to the embedded script as its first argument so
# the search root always matches $MOUNT.
"$PYTHON" - "$MOUNT" <<'PYEOF'
import glob
import os
import sys

try:
    import rasterio
except Exception as e:
    # Report the problem in the inventory but do not fail the whole run.
    print(f"ERROR: rasterio unavailable: {e}")
    raise SystemExit(0)

# Escape the root so glob metacharacters in the mount path are taken literally.
root = glob.escape(sys.argv[1])

files = []
for ext in ("tif", "tiff"):
    files.extend(glob.glob(os.path.join(root, "**", f"*.{ext}"), recursive=True))
files.sort()

if not files:
    print("No GeoTIFF files found.")
else:
    for f in files:
        # A file that cannot be opened is reported and skipped.
        try:
            with rasterio.open(f) as ds:
                print("---")
                print(f"FILE:     {f}")
                print(f"SIZE:     {os.path.getsize(f) / 1e9:.2f} GB")
                print(f"CRS:      {ds.crs}")
                print(f"RES:      {ds.res}")
                print(f"BOUNDS:   {ds.bounds}")
                print(f"SHAPE:    {ds.width} x {ds.height} px")
                print(f"BANDS:    {ds.count}")
                print(f"DTYPE:    {ds.dtypes}")
                print(f"NODATA:   {ds.nodata}")
        except Exception as e:
            print(f"ERROR reading {f}: {e}")
PYEOF
echo ""

echo "=== PHASE 5: READ SPEED BENCHMARKS ==="
echo "--- Step 18: Raw sequential read speed of largest GeoTIFF ---"

#######################################
# Print the path of the largest (by byte size) *.tif / *.tiff file under the
# given directory, or nothing when there is none. Size and path are separated
# by a TAB and the path is cut out as "everything after the first TAB", so
# paths containing spaces come back intact.
# Arguments: $1 - directory to search
# Outputs:   the selected path on stdout
#######################################
largest_geotiff() {
  find "$1" -type f \( -name "*.tif" -o -name "*.tiff" \) -printf '%s\t%p\n' \
    | sort -rn | head -n 1 | cut -f2-
}

LARGEST=$(largest_geotiff "$MOUNT")
if [ -n "${LARGEST:-}" ]; then
  echo "Benchmarking: $LARGEST"
  # Keep only the last three lines of dd's stderr (its closing statistics).
  dd if="$LARGEST" of=/dev/null bs=1M status=progress 2>&1 | tail -n 3
else
  echo "No GeoTIFF found for sequential benchmark."
fi
echo ""

echo "--- Step 19: Random point sample speed, 2401 points from first GeoTIFF ---"
# The mount point is handed to the embedded script as its first argument so
# the search root always matches $MOUNT.
"$PYTHON" - "$MOUNT" <<'PYEOF'
import glob
import os
import sys
import time

try:
    import rasterio
    import numpy as np
except Exception as e:
    # Report the problem in the inventory but do not fail the whole run.
    print(f"ERROR: rasterio/numpy unavailable: {e}")
    raise SystemExit(0)

N_POINTS = 2401  # number of random sample locations

# Escape the root so glob metacharacters in the mount path are taken literally.
root = glob.escape(sys.argv[1])

files = []
for ext in ("tif", "tiff"):
    files += glob.glob(os.path.join(root, "**", f"*.{ext}"), recursive=True)

if not files:
    print("No GeoTIFF found for benchmark.")
else:
    # "First" means first in sorted path order.
    f = sorted(files)[0]
    print(f"Benchmarking random point reads from: {f}")
    with rasterio.open(f) as ds:
        bounds = ds.bounds
        # Fixed seed: the same sample locations are drawn on every run.
        rng = np.random.default_rng(42)
        xs = rng.uniform(bounds.left, bounds.right, N_POINTS)
        ys = rng.uniform(bounds.bottom, bounds.top, N_POINTS)
        coords = list(zip(xs, ys))

        # Only the sampling itself is timed; list() drains the generator.
        t0 = time.perf_counter()
        results = list(ds.sample(coords))
        t1 = time.perf_counter()

    elapsed = t1 - t0
    print(f"{N_POINTS} point samples: {elapsed:.3f}s")
    print(f"Per-point:          {elapsed/N_POINTS*1000:.3f}ms")
    print(f"Projected H5 time:  {elapsed:.1f}s per H5 hex")
    print(f"Five H5s:           {elapsed*5:.1f}s total")
PYEOF
echo ""

echo "--- Step 20: SQLite write speed baseline on local disk ---"
"$PYTHON" - <<'PYEOF'
import os
import sqlite3
import tempfile
import time

N_ROWS = 2401  # rows inserted in the single timed batch

rows = [
    (i, i*10, i*100, i*1000,
     40.0 + i*0.001, 12.0 + i*0.001,
     100, 1, 0, 255, 0, 0,
     1, 1, "2026-04-26T00:00:00Z")
    for i in range(N_ROWS)
]

# A private temporary directory replaces tempfile.mktemp(): the database path
# cannot be claimed by another process first, and the database plus any
# journal side files are removed even when the benchmark raises.
with tempfile.TemporaryDirectory() as workdir:
    con = sqlite3.connect(os.path.join(workdir, "bench.sqlite3"))
    try:
        con.execute("PRAGMA journal_mode=WAL")
        con.execute("PRAGMA synchronous=NORMAL")
        con.execute("""
            CREATE TABLE bench (
                id INTEGER PRIMARY KEY,
                h9 INTEGER, h7 INTEGER, h5 INTEGER,
                lat REAL, lon REAL,
                elev_cm INTEGER, terrain INTEGER, hydro INTEGER,
                geo_dep INTEGER, geo_flag INTEGER, occ_flag INTEGER,
                status INTEGER, run_id INTEGER, created_at TEXT
            )
        """)

        # Timed section: bulk insert plus the commit.
        t0 = time.perf_counter()
        con.executemany("""
            INSERT INTO bench
            (id, h9, h7, h5, lat, lon, elev_cm, terrain, hydro,
             geo_dep, geo_flag, occ_flag, status, run_id, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        """, rows)
        con.commit()
        t1 = time.perf_counter()

        elapsed = t1 - t0
        print(f"{N_ROWS} row INSERT+COMMIT: {elapsed:.3f}s")
        print(f"Per-row:               {elapsed/N_ROWS*1000:.3f}ms")
    finally:
        # Close before the directory is cleaned up.
        con.close()
PYEOF
echo ""

echo "=== PHASE 6: REPORT LOCATION ==="
echo "Inventory written to $OUTFILE"
} | tee "$OUTFILE"
# The brace group runs as a pipeline stage, so an `exit` inside it ends only
# that stage. Capture the pipeline status (non-zero under pipefail when the
# report body or tee failed) so the script can report the failure itself.
report_status=$?

echo ""
# A failing line count also marks the run as failed.
wc -l "$OUTFILE" || report_status=$?
exit "$report_status"