Switching to BS5

This commit is contained in:
otivm
2026-05-03 14:31:21 +00:00
parent 31ed382c01
commit 4018f006cb
6 changed files with 6082 additions and 520 deletions

244
benchmark_tessera_world_usb.sh Executable file
View File

@@ -0,0 +1,244 @@
#!/usr/bin/env bash
# Inventory and read-speed benchmark for the TESSERA WORLD USB drive.
# Unset variables are errors, and a pipeline fails if any of its stages fails.
set -uo pipefail

# Drive mount point, report destination, and the interpreter used for the
# embedded Python steps.
MOUNT="/opt/data/TESSERA_WORLD"
OUTDIR="/home/otivm/OTIVM/data"
OUTFILE="${OUTDIR}/tessera_world_usb_inventory.txt"
PYTHON="/home/otivm/pipeline-venv/bin/python3"

mkdir -p -- "$OUTDIR"
{
# Report banner: title, UTC generation timestamp, and the mount under test,
# followed by one blank line.
generated_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
printf '%s\n' \
"TESSERA WORLD USB Drive Inventory and Benchmark" \
"Generated: ${generated_at}" \
"Mount: ${MOUNT}" \
""
echo "=== PHASE 0: MOUNT CHECK ==="
echo "--- Step 1: Mount contents ---"
# Stop the inventory when the mount point cannot be listed; every later step
# would only produce errors.
ls "$MOUNT" || exit 1
echo ""
echo "--- Step 2: Mount details ---"
df -h "$MOUNT"
# grep exits non-zero when nothing matches; that is not an error here.
mount | grep TESSERA_WORLD || true
echo ""
echo "--- Step 3: Read-only status ---"
# Probe writability with a uniquely named file so an existing file on the
# drive is never touched, and remove the probe again if it could be created.
if WRITE_PROBE=$(mktemp "$MOUNT/.write_probe.XXXXXX" 2>&1); then
    rm -f -- "$WRITE_PROBE"
    echo "WARNING: MOUNT IS WRITABLE (probe file created and removed)"
else
    # On failure the variable holds mktemp's error message.
    echo "$WRITE_PROBE"
    echo "CONFIRMED READ-ONLY"
fi
echo ""
printf '%s\n' "=== PHASE 1: TOP-LEVEL STRUCTURE ==="

printf '%s\n' "--- Step 4: Top-level directory listing ---"
ls -lah "$MOUNT/"
printf '\n'

printf '%s\n' "--- Step 5: Recursive directory tree, directories only ---"
find "$MOUNT" -type d | sort
printf '\n'

printf '%s\n' "--- Step 6: File count by directory ---"
# Drop the final path component of every file, then count how many files
# share each remaining directory path, busiest directory first.
find "$MOUNT" -type f \
| sed 's|/[^/]*$||' \
| sort \
| uniq -c \
| sort -rn
printf '\n'
echo "=== PHASE 2: FILE TYPE CENSUS ==="
echo "--- Step 7: File extensions and counts ---"
# Work on the base name only (%f) so dots in directory names are never taken
# for an extension, and bucket files without an extension together instead of
# listing each one under its full path.
find "$MOUNT" -type f -printf '%f\n' \
    | awk -F. '{
        # A lone leading dot (".name") or a trailing dot ("name.") is not an extension.
        if (NF > 1 && $NF != "" && !(NF == 2 && $1 == ""))
            print $NF
        else
            print "(no extension)"
    }' \
    | sort | uniq -c | sort -rn
echo ""
echo "--- Step 8: Total size by top-level subdirectory ---"
# The trailing slash restricts the glob to directories; errors (for example
# no subdirectories at all) are suppressed.
du -sh "$MOUNT"/*/ 2>/dev/null | sort -rh
echo ""
echo "--- Step 9: Total size of mount ---"
du -sh "$MOUNT/"
echo ""
echo "=== PHASE 3: PER-DATASET INVENTORY ==="

# Print "SIZE PATH" (human-readable IEC size) for every regular file under
# $MOUNT that matches the find(1) tests given as arguments.
# Sizes come straight from find rather than from parsing `ls -l` columns, so
# paths containing spaces are printed in full.
list_sized() {
    find "$MOUNT" -type f "$@" -printf '%s %p\n' | numfmt --to=iec --field=1
}

echo "--- Step 10: GeoTIFF files ---"
# Largest first.
list_sized \( -name "*.tif" -o -name "*.tiff" \) | sort -rh
echo ""
echo "--- Step 11: HDF5 files ---"
list_sized \( -name "*.h5" -o -name "*.hdf5" \)
echo ""
echo "--- Step 12: NetCDF files ---"
list_sized -name "*.nc"
echo ""
echo "--- Step 13: CSV and TSV files ---"
list_sized \( -name "*.csv" -o -name "*.tsv" \)
echo ""
echo "--- Step 14: SQLite files ---"
list_sized \( -name "*.sqlite" -o -name "*.sqlite3" -o -name "*.db" \)
echo ""
echo "--- Step 15: Shapefile components ---"
# Sorted by path so the components of one shapefile end up next to each other.
list_sized \( -name "*.shp" -o -name "*.dbf" -o -name "*.shx" -o -name "*.prj" \) | sort -k2
echo ""
echo "--- Step 16: Other file types ---"
# Everything not covered by steps 10-15, sorted by path.
list_sized \
    ! -name "*.tif" ! -name "*.tiff" ! -name "*.h5" ! -name "*.hdf5" \
    ! -name "*.nc" ! -name "*.csv" ! -name "*.tsv" \
    ! -name "*.sqlite" ! -name "*.sqlite3" ! -name "*.db" \
    ! -name "*.shp" ! -name "*.dbf" ! -name "*.shx" ! -name "*.prj" \
    | sort -k2
echo ""
echo "=== PHASE 4: GEOTIFF METADATA ==="
echo "--- Step 17: GeoTIFF metadata ---"
# The mount point is passed as argv[1] so the Python code always scans the
# same location as the shell steps.
"$PYTHON" - "$MOUNT" <<'PYEOF'
"""Print basic raster metadata for every GeoTIFF below the given root."""
import glob
import os
import sys

try:
    import rasterio
except Exception as e:
    # Report the problem in the inventory and let the remaining phases run.
    print(f"ERROR: rasterio unavailable: {e}")
    raise SystemExit(0)

# Escape the root so glob metacharacters in the mount path are taken literally.
root = glob.escape(sys.argv[1])
files = sorted(
    path
    for pattern in ("*.tif", "*.tiff")
    for path in glob.glob(os.path.join(root, "**", pattern), recursive=True)
)

if not files:
    print("No GeoTIFF files found.")
else:
    for path in files:
        # One unreadable file must not stop the rest of the listing.
        try:
            with rasterio.open(path) as ds:
                print("---")
                print(f"FILE: {path}")
                print(f"SIZE: {os.path.getsize(path) / 1e9:.2f} GB")
                print(f"CRS: {ds.crs}")
                print(f"RES: {ds.res}")
                print(f"BOUNDS: {ds.bounds}")
                print(f"SHAPE: {ds.width} x {ds.height} px")
                print(f"BANDS: {ds.count}")
                print(f"DTYPE: {ds.dtypes}")
                print(f"NODATA: {ds.nodata}")
        except Exception as e:
            print(f"ERROR reading {path}: {e}")
PYEOF
echo ""
echo "=== PHASE 5: READ SPEED BENCHMARKS ==="
echo "--- Step 18: Raw sequential read speed of largest GeoTIFF ---"
# Pick the largest GeoTIFF by byte size. find prints "SIZE<TAB>PATH", so the
# path is everything after the first tab and survives embedded spaces.
LARGEST=$(find "$MOUNT" -type f \( -name "*.tif" -o -name "*.tiff" \) -printf '%s\t%p\n' \
    | sort -rn | head -n 1 | cut -f2-)
if [ -n "${LARGEST:-}" ]; then
    echo "Benchmarking: $LARGEST"
    # dd writes its transfer statistics to stderr; fold them into the report.
    # NOTE(review): the result reflects the page cache if the file was read
    # recently - confirm whether a cold-cache measurement is wanted.
    dd if="$LARGEST" of=/dev/null bs=1M 2>&1
else
    echo "No GeoTIFF found for sequential benchmark."
fi
echo ""
echo "--- Step 19: Random point sample speed, 2401 points from first GeoTIFF ---"
# The mount point is passed as argv[1] so the Python code always scans the
# same location as the shell steps.
"$PYTHON" - "$MOUNT" <<'PYEOF'
"""Time random point sampling from the first GeoTIFF (by sorted path)."""
import glob
import os
import sys
import time

try:
    import rasterio
    import numpy as np
except Exception as e:
    # Report the problem in the inventory and let the remaining steps run.
    print(f"ERROR: rasterio/numpy unavailable: {e}")
    raise SystemExit(0)

# Number of points sampled; the projection lines below treat one batch of this
# size as one "H5 hex" (presumably 7**4 child cells - confirm with the pipeline).
N_POINTS = 2401

# Escape the root so glob metacharacters in the mount path are taken literally.
root = glob.escape(sys.argv[1])
files = glob.glob(os.path.join(root, "**", "*.tif"), recursive=True)
files += glob.glob(os.path.join(root, "**", "*.tiff"), recursive=True)

if not files:
    print("No GeoTIFF found for benchmark.")
else:
    path = min(files)
    print(f"Benchmarking random point reads from: {path}")
    try:
        with rasterio.open(path) as ds:
            bounds = ds.bounds
            # Fixed seed: every run samples the same coordinates.
            rng = np.random.default_rng(42)
            xs = rng.uniform(bounds.left, bounds.right, N_POINTS)
            ys = rng.uniform(bounds.bottom, bounds.top, N_POINTS)
            coords = list(zip(xs, ys))
            # Only the sampling itself is timed; list() consumes every result.
            t0 = time.perf_counter()
            list(ds.sample(coords))
            t1 = time.perf_counter()
    except Exception as e:
        print(f"ERROR reading {path}: {e}")
    else:
        elapsed = t1 - t0
        print(f"{N_POINTS} point samples: {elapsed:.3f}s")
        print(f"Per-point: {elapsed/N_POINTS*1000:.3f}ms")
        print(f"Projected H5 time: {elapsed:.1f}s per H5 hex")
        print(f"Five H5s: {elapsed*5:.1f}s total")
PYEOF
echo ""
echo "--- Step 20: SQLite write speed baseline on local disk ---"
"$PYTHON" - <<'PYEOF'
"""Time a single-transaction bulk INSERT into a throwaway SQLite database."""
import contextlib
import os
import sqlite3
import tempfile
import time

N_ROWS = 2401

rows = [
    (i, i*10, i*100, i*1000,
     40.0 + i*0.001, 12.0 + i*0.001,
     100, 1, 0, 255, 0, 0,
     1, 1, "2026-04-26T00:00:00Z")
    for i in range(N_ROWS)
]

# A private temporary directory replaces tempfile.mktemp(): the path cannot be
# claimed by another process first, and the database together with its WAL
# sidecar files is removed even when the benchmark raises.
# NOTE(review): the directory follows TMPDIR / the platform default, which may
# not be the disk the heading calls "local disk" - confirm.
with tempfile.TemporaryDirectory() as tmp_dir:
    db_path = os.path.join(tmp_dir, "bench.sqlite3")
    with contextlib.closing(sqlite3.connect(db_path)) as con:
        con.execute("PRAGMA journal_mode=WAL")
        con.execute("PRAGMA synchronous=NORMAL")
        con.execute("""
            CREATE TABLE bench (
                id INTEGER PRIMARY KEY,
                h9 INTEGER, h7 INTEGER, h5 INTEGER,
                lat REAL, lon REAL,
                elev_cm INTEGER, terrain INTEGER, hydro INTEGER,
                geo_dep INTEGER, geo_flag INTEGER, occ_flag INTEGER,
                status INTEGER, run_id INTEGER, created_at TEXT
            )
        """)
        # Only the insert and its commit are timed; table setup is excluded.
        t0 = time.perf_counter()
        con.executemany("""
            INSERT INTO bench
            (id, h9, h7, h5, lat, lon, elev_cm, terrain, hydro,
             geo_dep, geo_flag, occ_flag, status, run_id, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        """, rows)
        con.commit()
        t1 = time.perf_counter()

elapsed = t1 - t0
print(f"{N_ROWS} row INSERT+COMMIT: {elapsed:.3f}s")
print(f"Per-row: {elapsed/N_ROWS*1000:.3f}ms")
PYEOF
echo ""
# Closing section: tell the reader where the report file was written.
printf '%s\n' \
"=== PHASE 6: REPORT LOCATION ===" \
"Inventory written to ${OUTFILE}"
} | tee "$OUTFILE"
# Capture the status of the preceding `{ ... } | tee` pipeline before any
# other command overwrites $?. With pipefail set it is non-zero when the
# inventory group aborted (for example the mount could not be listed) or when
# tee could not write the report.
INVENTORY_STATUS=$?
echo ""
# Keep the previous behaviour of failing when the report cannot be counted.
wc -l "$OUTFILE" || exit
# Propagate an inventory failure instead of always ending with wc's status.
exit "$INVENTORY_STATUS"