diff --git a/.gitignore b/.gitignore
index 7386f6e..3c2c698 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,10 @@ logs/
 # Environment
 .env
 .env.local
+
+# SQLite databases — never commit
+data/*.sqlite3
+data/staging_*.sqlite3
+# WAL-mode sidecar files that SQLite creates next to each database
+data/*.sqlite3-wal
+data/*.sqlite3-shm
diff --git a/data/create_otivm_db.sql b/data/create_otivm_db.sql
new file mode 100644
index 0000000..ebc86ff
--- /dev/null
+++ b/data/create_otivm_db.sql
@@ -0,0 +1,184 @@
+-- Creates the OTIVM SQLite schema and seeds its lookup tables.
+-- Uses plain CREATE TABLE / INSERT, so re-running it against an existing
+-- database fails instead of overwriting anything.
+
+-- WAL mode is recorded in the database file and persists across connections.
+PRAGMA journal_mode = WAL;
+-- foreign_keys is per-connection: every connection that writes must issue this
+-- pragma itself, otherwise the REFERENCES clauses below are not enforced.
+PRAGMA foreign_keys = ON;
+
+-- ---------------------------------------------------------------
+-- 1. Lookup tables (written once, never modified)
+-- ---------------------------------------------------------------
+
+CREATE TABLE lifecycle_states (
+    id   INTEGER PRIMARY KEY,
+    name TEXT    NOT NULL UNIQUE
+);
+
+-- Ids are fixed on purpose: tessera_cells.status uses DEFAULT 1 (draft).
+INSERT INTO lifecycle_states (id, name) VALUES
+    (1, 'draft'),
+    (2, 'current'),
+    (3, 'superseded'),
+    (4, 'retired');
+
+CREATE TABLE confidence_grades (
+    id          INTEGER PRIMARY KEY,
+    name        TEXT    NOT NULL UNIQUE,
+    description TEXT    NOT NULL
+);
+
+INSERT INTO confidence_grades (id, name, description) VALUES
+    (1, 'measured', 'Directly observed or instrumentally measured. Published dataset with explicit methodology.'),
+    (2, 'indicated', 'Recorded in registry or survey without direct measurement. Classification may be broad.'),
+    (3, 'inferred', 'Derived from landscape position, proximity to measured cells, or modelled from adjacent data.'),
+    (4, 'no_data', 'Source dataset has no coverage for this cell. Field value is a known placeholder.');
+
+CREATE TABLE source_registry (
+    id            INTEGER PRIMARY KEY AUTOINCREMENT,
+    source_key    TEXT    NOT NULL UNIQUE,  -- e.g. 'GEBCO_2025'
+    source_name   TEXT    NOT NULL,
+    source_url    TEXT,
+    version       TEXT    NOT NULL,
+    license       TEXT,
+    citation      TEXT,
+    registered_at TEXT    NOT NULL          -- ISO 8601 UTC
+);
+
+INSERT INTO source_registry
+    (source_key, source_name, source_url, version, license, citation, registered_at)
+VALUES
+    ('GEBCO_2025',
+     'GEBCO 2025 Grid',
+     'https://www.gebco.net/data_and_products/gridded_bathymetry_data/',
+     '2025',
+     'CC-BY 4.0',
+     -- NOTE(review): this DOI appears to be the GEBCO 2020 Grid DOI; confirm against the 2025 release.
+     'GEBCO Compilation Group (2025) GEBCO 2025 Grid (doi:10.5285/a29c5465-b138-234d-e053-6c86abc0dc7f)',
+     '2026-04-26T00:00:00Z'),
+
+    ('ESA_WORLDCOVER_V200',
+     'ESA WorldCover v200',
+     'https://esa-worldcover.org/',
+     'v2.0.0',
+     'CC-BY 4.0',
+     'Zanaga et al. (2022) ESA WorldCover 10m 2021 v200 (doi:10.5281/zenodo.7254221)',
+     '2026-04-26T00:00:00Z'),
+
+    ('HYDROSHEDS_V11',
+     'HydroSHEDS v1.1',
+     'https://www.hydrosheds.org/',
+     '1.1',
+     'CC-BY 4.0',
+     'Lehner et al. (2022) HydroSHEDS v1.1 Technical Documentation. WWF US, Washington DC.',
+     '2026-04-26T00:00:00Z'),
+
+    ('USGS_MRDS',
+     'USGS Mineral Resources Data System',
+     'https://mrdata.usgs.gov/mrds/',
+     '2022-08-23',
+     'public domain',
+     'USGS (2022) Mineral Resources Data System (MRDS). U.S. Geological Survey Data Release.',
+     '2026-04-26T00:00:00Z'),
+
+    ('BGR_IGME5000',
+     'BGR IGME5000',
+     'https://www.bgr.bund.de/igme5000/',
+     '2007',
+     'Geonutz 2013',
+     'Asch K. (2005) The 1:5 Million International Geological Map of Europe and Adjacent Areas. BGR, Hannover.',
+     '2026-04-26T00:00:00Z'),
+
+    ('TESSERA3_SEED',
+     'TESSERA 3.0 seed extraction',
+     NULL,
+     '2026-04-26',
+     'internal',
+     'TheRON/tesserav3 pipeline, tessera.db SpatiaLite export, 2026-04-26.',
+     '2026-04-26T00:00:00Z');
+
+-- ---------------------------------------------------------------
+-- 2. Pipeline tracking
+-- ---------------------------------------------------------------
+
+CREATE TABLE pipeline_runs (
+    id              INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_key         TEXT    NOT NULL UNIQUE,  -- e.g. 'tessera3-seed-2026-04-26'
+    started_at      TEXT    NOT NULL,         -- ISO 8601 UTC
+    completed_at    TEXT,                     -- NULL while running
+    status          INTEGER NOT NULL REFERENCES lifecycle_states(id),
+    h5_cells        TEXT    NOT NULL,         -- JSON array of H3 res-5 integer IDs
+    fields_updated  TEXT    NOT NULL,         -- JSON array of field names
+    source_versions TEXT    NOT NULL,         -- JSON object: {source_key: version}
+    row_count       INTEGER,                  -- NULL while running
+    notes           TEXT
+);
+
+-- ---------------------------------------------------------------
+-- 3. Core cell table
+-- ---------------------------------------------------------------
+
+-- H3 cell IDs are stored as INTEGER (64-bit H3 index), not TEXT.
+-- Use h3.str_to_int() / h3.int_to_str() in Python for conversion.
+
+CREATE TABLE tessera_cells (
+    -- Identity
+    id             INTEGER PRIMARY KEY AUTOINCREMENT,
+    h9             INTEGER NOT NULL,  -- H3 res-9 index (64-bit)
+    h7             INTEGER NOT NULL,  -- H3 res-7 parent index
+    h5             INTEGER NOT NULL,  -- H3 res-5 grandparent index (waypoint)
+    lat            REAL    NOT NULL,  -- H9 centroid latitude
+    lon            REAL    NOT NULL,  -- H9 centroid longitude
+
+    -- Physical fields (RFC-TESSERA-2.0-001 byte layout preserved)
+    elev_cm        INTEGER,           -- Elevation in cm, signed 24-bit range
+    terrain        INTEGER,           -- Appendix A terrain code
+    hydro          INTEGER,           -- Hydrology code
+    geo_dep        INTEGER,           -- Geology deposit code
+    geo_flag       INTEGER,           -- Geology flag code
+    occ_flag       INTEGER,           -- RFC-TESSERA-3.0-OCC-001 Section 2 code
+
+    -- Provenance per field (source FK + confidence FK)
+    elev_src       INTEGER REFERENCES source_registry(id),
+    elev_conf      INTEGER REFERENCES confidence_grades(id),
+    terr_src       INTEGER REFERENCES source_registry(id),
+    terr_conf      INTEGER REFERENCES confidence_grades(id),
+    hydro_src      INTEGER REFERENCES source_registry(id),
+    hydro_conf     INTEGER REFERENCES confidence_grades(id),
+    gdep_src       INTEGER REFERENCES source_registry(id),
+    gdep_conf      INTEGER REFERENCES confidence_grades(id),
+    gflag_src      INTEGER REFERENCES source_registry(id),
+    gflag_conf     INTEGER REFERENCES confidence_grades(id),
+    occ_src        INTEGER REFERENCES source_registry(id),
+    occ_conf       INTEGER REFERENCES confidence_grades(id),
+
+    -- Lifecycle
+    status         INTEGER NOT NULL DEFAULT 1 REFERENCES lifecycle_states(id),  -- 1 = draft
+    run_id         INTEGER NOT NULL REFERENCES pipeline_runs(id),
+    created_at     TEXT    NOT NULL,  -- ISO 8601 UTC
+    superseded_by  INTEGER REFERENCES tessera_cells(id),
+    retired_reason TEXT
+);
+
+-- One index per H3 resolution paired with status, plus one on run_id.
+CREATE INDEX idx_cells_h9_status ON tessera_cells(h9, status);
+CREATE INDEX idx_cells_h5_status ON tessera_cells(h5, status);
+CREATE INDEX idx_cells_h7_status ON tessera_cells(h7, status);
+CREATE INDEX idx_cells_run ON tessera_cells(run_id);
+
+-- ---------------------------------------------------------------
+-- 4. H5 coverage completeness tracking
+-- ---------------------------------------------------------------
+
+CREATE TABLE h5_coverage (
+    h5           INTEGER PRIMARY KEY,         -- H3 res-5 index
+    status       INTEGER NOT NULL REFERENCES lifecycle_states(id),
+    -- 1=draft (in progress), 2=current (complete), 4=retired
+    h9_total     INTEGER NOT NULL,            -- Expected H9 count (typically 2401)
+    h9_current   INTEGER NOT NULL DEFAULT 0,
+    last_updated TEXT    NOT NULL,            -- ISO 8601 UTC
+    run_id       INTEGER NOT NULL REFERENCES pipeline_runs(id),
+    notes        TEXT
+);