diff --git a/CHANGELOG.md b/CHANGELOG.md index 79bfcb4935..8e2abf2126 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## [0.149.0] - 2023-12-23 ### Added - Added `KAMU_WORKSPACE` env var to handle custom workspace path if needed +- Added `event-bus` crate: a utility component based on the Observer design pattern, + which allows event producers and event consumers not to know about each other +- Applied `event-bus` component to inform consumers of dataset removal, dependency changes, + task completions +- Added in-memory dataset dependency graph instead of continuous rescanning of all datasets: + - the initial dependencies are computed on demand on first request + - using `petgraph` project to represent dataset dependencies in the form of a directed acyclic graph + - further events like new/removed dependency or dataset removal update the graph + - simplified GraphQL APIs and dataset removal check using the new dependency graph +- Added prototype of flow management system: + - flows are automatically launched activities, which are either dataset related or represent system process + - flows can have a schedule configuration, using time delta or CRON expressions + - flows system manages activation of flows according to the dynamically changing configuration + - flows system manages triggering of dependent dataset flows, when their inputs have events + - derived flows may have throttling settings +### Changed +- Integrated latest `dill=0.8` version, which removes the need to register simple dependency binds +- Using new `dill=0.8` to organize bindings of structs to implemented traits via declarative attributes ## [0.148.0] - 2023-12-20 ### Added diff --git a/Cargo.lock b/Cargo.lock index 404be94d2b..e6b890dfc3 
100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -159,9 +159,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.75" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" +checksum = "59d2a3357dde987206219e78ecfbbb6e8dad06cbb65292758d3270e6254f7355" [[package]] name = "approx" @@ -535,7 +535,7 @@ dependencies = [ "proc-macro2", "quote", "strum", - "syn 2.0.41", + "syn 2.0.42", "thiserror", ] @@ -571,7 +571,7 @@ checksum = "5fd55a5ba1179988837d24ab4c7cc8ed6efdeff578ede0416b4225a5fca35bd0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -593,18 +593,18 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "fdf6721fb0140e4f897002dd086c06f6c27775df19cfe1fccb21181a48fd2c98" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -1480,7 +1480,7 @@ checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" [[package]] name = "container-runtime" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "cfg-if", @@ -1762,7 +1762,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -1786,7 +1786,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -1797,7 +1797,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -2084,9 +2084,9 @@ 
dependencies = [ [[package]] name = "dill" -version = "0.7.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3ecd8df0b8dd4697b1e06e96e27b33a1486d58362b29dbbd56fbd7f170a444b" +checksum = "c8465cf5d6f0f0389d5b2e26f5b9f88d826ced639db09093abbd9b5f957566a4" dependencies = [ "dill-impl", "multimap", @@ -2095,13 +2095,13 @@ dependencies = [ [[package]] name = "dill-impl" -version = "0.7.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "805173fe24f50fa870af109652b052ea8ab275374ea6d5eaa0cbab7460b60d7e" +checksum = "4d0f263fd9b684a66c34bea8aa1e814f82f451d9101078ed465193708da174c4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -2269,12 +2269,12 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] name = "enum-variants" -version = "0.148.0" +version = "0.149.0" [[package]] name = "env_logger" @@ -2305,10 +2305,27 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "event-bus" +version = "0.149.0" +dependencies = [ + "async-trait", + "dill", + "env_logger", + "futures", + "internal-error", + "test-group", + "test-log", + "thiserror", + "tokio", + "tracing-subscriber", +] + [[package]] name = "event-sourcing" -version = "0.148.0" +version = "0.149.0" dependencies = [ + "async-stream", "async-trait", "chrono", "event-sourcing-macros", @@ -2322,10 +2339,10 @@ dependencies = [ [[package]] name = "event-sourcing-macros" -version = "0.148.0" +version = "0.149.0" dependencies = [ "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -2489,7 +2506,7 @@ checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -3013,7 +3030,7 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "internal-error" -version = "0.148.0" +version = 
"0.149.0" dependencies = [ "test-log", "thiserror", @@ -3129,7 +3146,7 @@ dependencies = [ [[package]] name = "kamu" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-recursion", "async-stream", @@ -3151,6 +3168,7 @@ dependencies = [ "digest", "dill", "env_logger", + "event-bus", "filetime", "flatbuffers", "flate2", @@ -3170,6 +3188,7 @@ dependencies = [ "mockall", "object_store", "opendatafabric", + "petgraph", "pin-project", "rand", "regex", @@ -3202,11 +3221,12 @@ dependencies = [ [[package]] name = "kamu-adapter-auth-oso" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "dill", "env_logger", + "event-bus", "kamu", "kamu-core", "opendatafabric", @@ -3222,7 +3242,7 @@ dependencies = [ [[package]] name = "kamu-adapter-flight-sql" -version = "0.148.0" +version = "0.149.0" dependencies = [ "arrow-flight", "async-trait", @@ -3246,7 +3266,7 @@ dependencies = [ [[package]] name = "kamu-adapter-graphql" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-graphql", "async-trait", @@ -3254,12 +3274,14 @@ dependencies = [ "datafusion", "dill", "env_logger", + "event-bus", "futures", "indoc 2.0.4", "internal-error", "kamu", "kamu-core", "kamu-data-utils", + "kamu-flow-system", "kamu-task-system", "mockall", "opendatafabric", @@ -3270,13 +3292,14 @@ dependencies = [ "test-log", "thiserror", "tokio", + "tokio-stream", "tracing", "tracing-subscriber", ] [[package]] name = "kamu-adapter-http" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "axum", @@ -3286,6 +3309,7 @@ dependencies = [ "container-runtime", "dill", "env_logger", + "event-bus", "flate2", "fs_extra", "futures", @@ -3319,7 +3343,7 @@ dependencies = [ [[package]] name = "kamu-adapter-oauth" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "dill", @@ -3342,7 +3366,7 @@ dependencies = [ [[package]] name = "kamu-cli" -version = "0.148.0" +version = "0.149.0" dependencies = [ "arrow-flight", "async-graphql", @@ 
-3363,6 +3387,7 @@ dependencies = [ "dirs", "duration-string", "env_logger", + "event-bus", "fs_extra", "futures", "glob", @@ -3380,6 +3405,7 @@ dependencies = [ "kamu-adapter-http", "kamu-adapter-oauth", "kamu-data-utils", + "kamu-flow-system-inmem", "kamu-task-system-inmem", "libc", "merge", @@ -3425,7 +3451,7 @@ dependencies = [ [[package]] name = "kamu-core" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-stream", "async-trait", @@ -3452,7 +3478,7 @@ dependencies = [ [[package]] name = "kamu-data-utils" -version = "0.148.0" +version = "0.149.0" dependencies = [ "arrow-digest", "async-trait", @@ -3470,9 +3496,59 @@ dependencies = [ "url", ] +[[package]] +name = "kamu-flow-system" +version = "0.149.0" +dependencies = [ + "async-trait", + "chrono", + "enum-variants", + "event-sourcing", + "internal-error", + "kamu-core", + "kamu-task-system", + "opendatafabric", + "serde", + "serde_with", + "thiserror", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "kamu-flow-system-inmem" +version = "0.149.0" +dependencies = [ + "async-stream", + "async-trait", + "chrono", + "dill", + "env_logger", + "event-bus", + "futures", + "kamu", + "kamu-core", + "kamu-flow-system", + "kamu-task-system", + "kamu-task-system-inmem", + "mockall", + "opendatafabric", + "serde", + "serde_with", + "tempfile", + "test-log", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "tracing-subscriber", + "url", +] + [[package]] name = "kamu-ingest-datafusion" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "chrono", @@ -3510,7 +3586,7 @@ dependencies = [ [[package]] name = "kamu-repo-tools" -version = "0.148.0" +version = "0.149.0" dependencies = [ "chrono", "clap", @@ -3524,7 +3600,7 @@ dependencies = [ [[package]] name = "kamu-task-system" -version = "0.148.0" +version = "0.149.0" dependencies = [ "async-trait", "chrono", @@ -3543,13 +3619,14 @@ dependencies = [ [[package]] name = "kamu-task-system-inmem" -version = "0.148.0" 
+version = "0.149.0" dependencies = [ "async-stream", "async-trait", "chrono", "dill", "env_logger", + "event-bus", "futures", "kamu-core", "kamu-task-system", @@ -3560,6 +3637,7 @@ dependencies = [ "test-log", "thiserror", "tokio", + "tokio-stream", "tracing", "tracing-subscriber", "url", @@ -3900,9 +3978,9 @@ dependencies = [ [[package]] name = "minus" -version = "5.5.2" +version = "5.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded94404aa868079ffb7ee9f419e78d81494511dc92cf5f4de2278149e8a0178" +checksum = "14b5f31d6666667f707078608f25e7615c48c2243a06b66ca0fa6c4ecb96362d" dependencies = [ "crossbeam-channel", "crossterm", @@ -4194,7 +4272,7 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "opendatafabric" -version = "0.148.0" +version = "0.149.0" dependencies = [ "arrow", "base64 0.21.5", @@ -4266,9 +4344,9 @@ dependencies = [ [[package]] name = "oso" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1284fdd195f8468d55f44113097bab0700202cdddead5722726cf2d0aa7d2b" +checksum = "39eef9c07e27299175fa31a3731166124a744e8518a5cdd312b5c83817c8e0e2" dependencies = [ "impl-trait-for-tuples", "lazy_static", @@ -4281,9 +4359,9 @@ dependencies = [ [[package]] name = "oso-derive" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9834041394623a69a49b9e863061f77ce9e63fece53c98fd665f736511e1ad2d" +checksum = "fee1a92f9b7bffca70109cc17ad19602f9239ee868f7ab0ab6ed32af47ab0041" dependencies = [ "quote", "syn 1.0.109", @@ -4460,7 +4538,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -4548,7 +4626,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -4585,9 +4663,9 @@ dependencies = [ 
[[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" [[package]] name = "platforms" @@ -4625,9 +4703,9 @@ dependencies = [ [[package]] name = "polar-core" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167066349d2cbb9b9b05e0389e6d036f5e466aadcb66f256986703a844284fd4" +checksum = "9cb37e0c341d7adea21ccd491beeedcb84ab2ac3dfbb6ff00f512c840015b756" dependencies = [ "indoc 1.0.9", "js-sys", @@ -4759,9 +4837,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8" dependencies = [ "unicode-ident", ] @@ -4786,7 +4864,7 @@ dependencies = [ "itertools 0.11.0", "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5070,7 +5148,7 @@ dependencies = [ "quote", "rust-embed-utils", "shellexpand", - "syn 2.0.41", + "syn 2.0.42", "walkdir", ] @@ -5267,7 +5345,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5293,9 +5371,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.4" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12022b835073e5b11e90a14f86838ceb1c8fb0325b72416845c487ac0fa95e80" +checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1" dependencies = [ "serde", ] @@ -5338,14 +5416,14 @@ dependencies = [ "darling", 
"proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] name = "serde_yaml" -version = "0.9.27" +version = "0.9.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cc7a1570e38322cfe4154732e5110f887ea57e22b76f4bfd32b5bdd3368666c" +checksum = "a15e0ef66bf939a7c890a0bf6d5a733c70202225f9888a89ed5c62298b019129" dependencies = [ "indexmap 2.1.0", "itoa", @@ -5661,7 +5739,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5683,9 +5761,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.41" +version = "2.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269" +checksum = "5b7d0a2c048d661a1a59fcd7355baa232f7ed34e0ee4df2eef3c1c1c0d3852d8" dependencies = [ "proc-macro2", "quote", @@ -5779,7 +5857,7 @@ dependencies = [ "proc-macro2", "quote", "sha2", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5801,7 +5879,7 @@ checksum = "7ba277e77219e9eea169e8508942db1bf5d8a41ff2db9b20aab5a5aadc9fa25d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5830,7 +5908,7 @@ checksum = "01742297787513b79cf8e29d1056ede1313e2420b7b3b15d0a768b4921f549df" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -5921,9 +5999,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.35.0" +version = "1.35.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d45b238a16291a4e1584e61820b8ae57d696cc5015c459c229ccc6990cc1c" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" dependencies = [ "backtrace", "bytes", @@ -5956,7 +6034,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ 
-6168,7 +6246,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] @@ -6223,7 +6301,7 @@ dependencies = [ [[package]] name = "tracing-perfetto" -version = "0.148.0" +version = "0.149.0" dependencies = [ "env_logger", "serde", @@ -6539,7 +6617,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", "wasm-bindgen-shared", ] @@ -6573,7 +6651,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6937,22 +7015,22 @@ checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" [[package]] name = "zerocopy" -version = "0.7.31" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.31" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.41", + "syn 2.0.42", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 65e1119cfc..ecb7c63f0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "src/utils/container-runtime", "src/utils/data-utils", "src/utils/enum-variants", + "src/utils/event-bus", "src/utils/event-sourcing", "src/utils/event-sourcing-macros", "src/utils/internal-error", @@ -13,10 +14,12 @@ members = [ "src/domain/opendatafabric", "src/domain/core", "src/domain/task-system", + "src/domain/flow-system", # Infra 
"src/infra/core", "src/infra/ingest-datafusion", "src/infra/task-system-inmem", + "src/infra/flow-system-inmem", # Adapters "src/adapter/auth-oso", "src/adapter/flight-sql", @@ -31,30 +34,33 @@ resolver = "2" [workspace.dependencies] # Utils -container-runtime = { version = "0.148.0", path = "src/utils/container-runtime", default-features = false } -enum-variants = { version = "0.148.0", path = "src/utils/enum-variants", default-features = false } -event-sourcing = { version = "0.148.0", path = "src/utils/event-sourcing", default-features = false } -event-sourcing-macros = { version = "0.148.0", path = "src/utils/event-sourcing-macros", default-features = false } -internal-error = { version = "0.148.0", path = "src/utils/internal-error", default-features = false } -kamu-data-utils = { version = "0.148.0", path = "src/utils/data-utils", default-features = false } -tracing-perfetto = { version = "0.148.0", path = "src/utils/tracing-perfetto", default-features = false } +container-runtime = { version = "0.149.0", path = "src/utils/container-runtime", default-features = false } +enum-variants = { version = "0.149.0", path = "src/utils/enum-variants", default-features = false } +event-bus = { version = "0.149.0", path = "src/utils/event-bus", default-features = false} +event-sourcing = { version = "0.149.0", path = "src/utils/event-sourcing", default-features = false } +event-sourcing-macros = { version = "0.149.0", path = "src/utils/event-sourcing-macros", default-features = false } +internal-error = { version = "0.149.0", path = "src/utils/internal-error", default-features = false } +kamu-data-utils = { version = "0.149.0", path = "src/utils/data-utils", default-features = false } +tracing-perfetto = { version = "0.149.0", path = "src/utils/tracing-perfetto", default-features = false } # Domain -opendatafabric = { version = "0.148.0", path = "src/domain/opendatafabric", default-features = false } -kamu-core = { version = "0.148.0", path = "src/domain/core", 
default-features = false } -kamu-task-system = { version = "0.148.0", path = "src/domain/task-system", default-features = false } +opendatafabric = { version = "0.149.0", path = "src/domain/opendatafabric", default-features = false } +kamu-core = { version = "0.149.0", path = "src/domain/core", default-features = false } +kamu-task-system = { version = "0.149.0", path = "src/domain/task-system", default-features = false } +kamu-flow-system = { version = "0.149.0", path = "src/domain/flow-system", default-features = false } # Infra -kamu = { version = "0.148.0", path = "src/infra/core", default-features = false } -kamu-ingest-datafusion = { version = "0.148.0", path = "src/infra/ingest-datafusion", default-features = false } -kamu-task-system-inmem = { version = "0.148.0", path = "src/infra/task-system-inmem", default-features = false } +kamu = { version = "0.149.0", path = "src/infra/core", default-features = false } +kamu-ingest-datafusion = { version = "0.149.0", path = "src/infra/ingest-datafusion", default-features = false } +kamu-task-system-inmem = { version = "0.149.0", path = "src/infra/task-system-inmem", default-features = false } +kamu-flow-system-inmem = { version = "0.149.0", path = "src/infra/flow-system-inmem", default-features = false } # Adapters -kamu-adapter-auth-oso = { version = "0.148.0", path = "src/adapter/auth-oso", default-features = false } -kamu-adapter-flight-sql = { version = "0.148.0", path = "src/adapter/flight-sql", default-features = false } -kamu-adapter-graphql = { version = "0.148.0", path = "src/adapter/graphql", default-features = false } -kamu-adapter-http = { version = "0.148.0", path = "src/adapter/http", default-features = false } -kamu-adapter-oauth = { version = "0.148.0", path = "src/adapter/oauth", defualt-features = false } +kamu-adapter-auth-oso = { version = "0.149.0", path = "src/adapter/auth-oso", default-features = false } +kamu-adapter-flight-sql = { version = "0.149.0", path = "src/adapter/flight-sql", 
default-features = false } +kamu-adapter-graphql = { version = "0.149.0", path = "src/adapter/graphql", default-features = false } +kamu-adapter-http = { version = "0.149.0", path = "src/adapter/http", default-features = false } +kamu-adapter-oauth = { version = "0.149.0", path = "src/adapter/oauth", default-features = false } [workspace.package] -version = "0.148.0" +version = "0.149.0" edition = "2021" homepage = "https://github.com/kamu-data/kamu-cli" repository = "https://github.com/kamu-data/kamu-cli" diff --git a/LICENSE.txt b/LICENSE.txt index e72bb70497..aa22827329 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -11,7 +11,7 @@ Business Source License 1.1 Licensor: Kamu Data, Inc. -Licensed Work: Kamu CLI Version 0.148.0 +Licensed Work: Kamu CLI Version 0.149.0 The Licensed Work is © 2023 Kamu Data, Inc. Additional Use Grant: You may use the Licensed Work for any purpose, @@ -24,7 +24,7 @@ Additional Use Grant: You may use the Licensed Work for any purpose, Licensed Work where data or transformations are controlled by such third parties. 
-Change Date: 2027-12-20 +Change Date: 2027-12-22 Change License: Apache License, Version 2.0 diff --git a/src/adapter/auth-oso/Cargo.toml b/src/adapter/auth-oso/Cargo.toml index f8a882fa18..db6492b85c 100644 --- a/src/adapter/auth-oso/Cargo.toml +++ b/src/adapter/auth-oso/Cargo.toml @@ -21,7 +21,7 @@ opendatafabric = { workspace = true } kamu-core = { workspace = true } async-trait = "0.1" -dill = "0.7" +dill = "0.8" # Authorization oso = "0.27" @@ -29,6 +29,7 @@ oso-derive = "0.27" [dev-dependencies] kamu = { workspace = true } +event-bus = { workspace = true } env_logger = "0.10" tempfile = "3" diff --git a/src/adapter/auth-oso/src/oso_dataset_authorizer.rs b/src/adapter/auth-oso/src/oso_dataset_authorizer.rs index edd565765f..26d6d0600b 100644 --- a/src/adapter/auth-oso/src/oso_dataset_authorizer.rs +++ b/src/adapter/auth-oso/src/oso_dataset_authorizer.rs @@ -11,7 +11,7 @@ use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; -use dill::component; +use dill::*; use kamu_core::auth::*; use kamu_core::{AccessError, CurrentAccountSubject, ErrorIntoInternal}; use opendatafabric::DatasetHandle; @@ -31,6 +31,7 @@ pub struct OsoDatasetAuthorizer { /////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn DatasetActionAuthorizer)] impl OsoDatasetAuthorizer { pub fn new( kamu_auth_oso: Arc, diff --git a/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs b/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs index b9b317998b..8fdc3502b7 100644 --- a/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs +++ b/src/adapter/auth-oso/tests/tests/test_oso_dataset_authorizer.rs @@ -11,8 +11,10 @@ use std::assert_matches::assert_matches; use std::collections::HashSet; use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::testing::MetadataFactory; -use kamu::DatasetRepositoryLocalFs; +use kamu::{DatasetRepositoryLocalFs, 
DependencyGraphServiceInMemory}; use kamu_adapter_auth_oso::{KamuAuthOso, OsoDatasetAuthorizer}; use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer, DatasetActionUnauthorizedError}; use kamu_core::{AccessError, CurrentAccountSubject, DatasetRepository}; @@ -93,7 +95,7 @@ async fn test_guest_can_read_but_not_write() { pub struct DatasetAuthorizerHarness { tempdir: TempDir, dataset_repository: Arc, - dataset_authorizer: Arc, + dataset_authorizer: Arc, } impl DatasetAuthorizerHarness { @@ -102,21 +104,24 @@ impl DatasetAuthorizerHarness { let datasets_dir = tempdir.path().join("datasets"); std::fs::create_dir(&datasets_dir).unwrap(); - let current_account_subject = Arc::new(CurrentAccountSubject::logged( - AccountName::new_unchecked(current_account_name), - )); - - let dataset_authorizer = Arc::new(OsoDatasetAuthorizer::new( - Arc::new(KamuAuthOso::new()), - current_account_subject.clone(), - )); - - let dataset_repository = Arc::new(DatasetRepositoryLocalFs::new( - datasets_dir, - current_account_subject.clone(), - dataset_authorizer.clone(), - true, - )); + let catalog = dill::CatalogBuilder::new() + .add::() + .add_value(CurrentAccountSubject::logged(AccountName::new_unchecked( + current_account_name, + ))) + .add::() + .add::() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(datasets_dir) + .with_multi_tenant(true), + ) + .bind::() + .build(); + + let dataset_repository = catalog.get_one::().unwrap(); + let dataset_authorizer = catalog.get_one::().unwrap(); Self { tempdir, diff --git a/src/adapter/graphql/Cargo.toml b/src/adapter/graphql/Cargo.toml index 7818c7b362..b4b2a8aeea 100644 --- a/src/adapter/graphql/Cargo.toml +++ b/src/adapter/graphql/Cargo.toml @@ -20,20 +20,24 @@ doctest = false [dependencies] internal-error = { workspace = true } opendatafabric = { workspace = true } + kamu-data-utils = { workspace = true } kamu-core = { workspace = true } kamu-task-system = { workspace = true } +kamu-flow-system = { workspace 
= true } +event-bus = { workspace = true } async-graphql = { version = "6", features = ["chrono", "url", "apollo_tracing"] } async-trait = { version = "0.1", default-features = false } chrono = "0.4" datafusion = "33" # TODO: Currently needed for type conversions but ideally should be encapsulated by kamu-core -dill = "0.7" +dill = "0.8" futures = "0.3" indoc = "2" serde = "1" serde_json = "1" tokio = { version = "1", default-features = false, features = [] } +tokio-stream = { version = "0.1", default-features = false } tracing = "0.1" thiserror = { version = "1", default-features = false } @@ -41,6 +45,7 @@ thiserror = { version = "1", default-features = false } [dev-dependencies] # TODO: Limit to mock or in-memory implementations only kamu = { workspace = true } +event-bus = {workspace = true } env_logger = "0.10" mockall = "0.11" diff --git a/src/adapter/graphql/src/lib.rs b/src/adapter/graphql/src/lib.rs index 1663c0825b..dfc4122c60 100644 --- a/src/adapter/graphql/src/lib.rs +++ b/src/adapter/graphql/src/lib.rs @@ -10,6 +10,7 @@ #![feature(error_generic_member_access)] #![feature(error_in_core)] #![feature(int_roundings)] +#![feature(async_closure)] pub mod extensions; pub(crate) mod mutations; diff --git a/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs b/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs index f232c4058a..6e6998b475 100644 --- a/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs +++ b/src/adapter/graphql/src/queries/datasets/dataset_metadata.rs @@ -8,7 +8,6 @@ // by the Apache License, Version 2.0. 
use chrono::prelude::*; -use futures::TryStreamExt; use kamu_core::{self as domain, MetadataChainExt, TryStreamExtExt}; use opendatafabric as odf; use opendatafabric::{AsTypedBlock, VariantOf}; @@ -93,39 +92,59 @@ impl DatasetMetadata { /// Current upstream dependencies of a dataset async fn current_upstream_dependencies(&self, ctx: &Context<'_>) -> Result> { - let dataset_repo = from_catalog::(ctx).unwrap(); + let dependency_graph_service = + from_catalog::(ctx).unwrap(); - let dataset = self.get_dataset(ctx).await?; - let summary = dataset - .get_summary(domain::GetSummaryOpts::default()) + use tokio_stream::StreamExt; + let upstream_dataset_ids: Vec<_> = dependency_graph_service + .get_upstream_dependencies(&self.dataset_handle.id) .await - .int_err()?; + .int_err()? + .collect() + .await; - let mut dependencies: Vec<_> = Vec::new(); - for input in summary.dependencies.into_iter() { - let dataset_id = input.id.unwrap().clone(); - let dataset_handle = dataset_repo - .resolve_dataset_ref(&dataset_id.as_local_ref()) + let dataset_repo = from_catalog::(ctx).unwrap(); + let mut upstream = Vec::with_capacity(upstream_dataset_ids.len()); + for upstream_dataset_id in upstream_dataset_ids { + let hdl = dataset_repo + .resolve_dataset_ref(&upstream_dataset_id.as_local_ref()) .await .int_err()?; - dependencies.push(Dataset::new( - Account::from_dataset_alias(ctx, &dataset_handle.alias), - dataset_handle, + upstream.push(Dataset::new( + Account::from_dataset_alias(ctx, &hdl.alias), + hdl, )); } - Ok(dependencies) + + Ok(upstream) } // TODO: Convert to collection /// Current downstream dependencies of a dataset async fn current_downstream_dependencies(&self, ctx: &Context<'_>) -> Result> { - let dataset_repo = from_catalog::(ctx).unwrap(); + let dependency_graph_service = + from_catalog::(ctx).unwrap(); - let downstream: Vec<_> = dataset_repo - .get_downstream_dependencies(&self.dataset_handle.as_local_ref()) - .map_ok(|hdl| Dataset::new(Account::from_dataset_alias(ctx, 
&hdl.alias), hdl)) - .try_collect() - .await?; + use tokio_stream::StreamExt; + let downstream_dataset_ids: Vec<_> = dependency_graph_service + .get_downstream_dependencies(&self.dataset_handle.id) + .await + .int_err()? + .collect() + .await; + + let dataset_repo = from_catalog::(ctx).unwrap(); + let mut downstream = Vec::with_capacity(downstream_dataset_ids.len()); + for downstream_dataset_id in downstream_dataset_ids { + let hdl = dataset_repo + .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + .await + .int_err()?; + downstream.push(Dataset::new( + Account::from_dataset_alias(ctx, &hdl.alias), + hdl, + )); + } Ok(downstream) } diff --git a/src/adapter/graphql/src/queries/tasks/tasks.rs b/src/adapter/graphql/src/queries/tasks/tasks.rs index 98fe2bf3f2..d9d99dc5bb 100644 --- a/src/adapter/graphql/src/queries/tasks/tasks.rs +++ b/src/adapter/graphql/src/queries/tasks/tasks.rs @@ -35,6 +35,7 @@ impl Tasks { /// Returns states of tasks associated with a given dataset ordered by /// creation time from newest to oldest + // TODO: reconsider performance impact async fn list_tasks_by_dataset( &self, ctx: &Context<'_>, @@ -47,8 +48,11 @@ impl Tasks { let page = page.unwrap_or(0); let per_page = per_page.unwrap_or(Self::DEFAULT_PER_PAGE); - let mut nodes: Vec<_> = task_sched + let tasks_stream = task_sched .list_tasks_by_dataset(&dataset_id) + .map_err(|e| e.int_err())?; + + let mut nodes: Vec<_> = tasks_stream .skip(page * per_page) .take(per_page + 1) // Take one extra to see if next page exists .map_ok(|t| Task::new(t)) diff --git a/src/adapter/graphql/tests/tests/test_error_handling.rs b/src/adapter/graphql/tests/tests/test_error_handling.rs index bf8d1114d3..ed0b04d741 100644 --- a/src/adapter/graphql/tests/tests/test_error_handling.rs +++ b/src/adapter/graphql/tests/tests/test_error_handling.rs @@ -7,8 +7,8 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use std::sync::Arc; - +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use kamu::*; use kamu_core::*; @@ -57,19 +57,22 @@ async fn test_malformed_argument() { async fn test_internal_error() { let tempdir = tempfile::tempdir().unwrap(); - // Note: Not creating a repo to cause an error - let dataset_repo = DatasetRepositoryLocalFs::new( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - ); - let cat = dill::CatalogBuilder::new() - .add_value(dataset_repo) + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add::() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), + ) .bind::() .build(); + // Note: Not creating a repo to cause an error + let _ = cat.get_one::().unwrap(); + let schema = kamu_adapter_graphql::schema_quiet(); let res = schema.execute(async_graphql::Request::new(indoc!( r#" diff --git a/src/adapter/graphql/tests/tests/test_gql_data.rs b/src/adapter/graphql/tests/tests/test_gql_data.rs index d886d68658..dfe38d0fca 100644 --- a/src/adapter/graphql/tests/tests/test_gql_data.rs +++ b/src/adapter/graphql/tests/tests/test_gql_data.rs @@ -13,6 +13,8 @@ use std::sync::Arc; use datafusion::arrow::array::*; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; +use dill::Component; +use event_bus::EventBus; use kamu::testing::{MetadataFactory, ParquetWriterHelper}; use kamu::*; use kamu_core::*; @@ -22,21 +24,19 @@ use opendatafabric::*; async fn create_catalog_with_local_workspace(tempdir: &Path) -> dill::Catalog { dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.join("datasets")) .with_current_account_subject(Arc::new(CurrentAccountSubject::new_test())) .with_multi_tenant(false), ) .bind::() 
.add::() - .bind::() .add::() - .bind::() - .add_value(ObjectStoreBuilderLocalFs::new()) - .bind::() + .add::() .add::() - .bind::() .build() } diff --git a/src/adapter/graphql/tests/tests/test_gql_datasets.rs b/src/adapter/graphql/tests/tests/test_gql_datasets.rs index 15df5080d4..24dd5713e7 100644 --- a/src/adapter/graphql/tests/tests/test_gql_datasets.rs +++ b/src/adapter/graphql/tests/tests/test_gql_datasets.rs @@ -8,6 +8,8 @@ // by the Apache License, Version 2.0. use async_graphql::*; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use kamu::testing::MetadataFactory; use kamu::*; @@ -375,6 +377,7 @@ async fn dataset_rename_name_collision() { #[test_log::test(tokio::test)] async fn dataset_delete_success() { let harness = GraphQLDatasetsHarness::new(); + harness.init_dependencies_graph().await; let foo_result = harness .create_root_dataset(DatasetName::new_unchecked("foo")) @@ -424,6 +427,7 @@ async fn dataset_delete_success() { #[test_log::test(tokio::test)] async fn dataset_delete_dangling_ref() { let harness = GraphQLDatasetsHarness::new(); + harness.init_dependencies_graph().await; let foo_result = harness .create_root_dataset(DatasetName::new_unchecked("foo")) @@ -529,7 +533,7 @@ async fn dataset_view_permissions() { struct GraphQLDatasetsHarness { _tempdir: tempfile::TempDir, - _base_catalog: dill::Catalog, + base_catalog: dill::Catalog, catalog_authorized: dill::Catalog, catalog_anonymous: dill::Catalog, } @@ -541,8 +545,10 @@ impl GraphQLDatasetsHarness { std::fs::create_dir(&datasets_dir).unwrap(); let base_catalog = dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(datasets_dir) .with_multi_tenant(false), ) @@ -550,19 +556,33 @@ impl GraphQLDatasetsHarness { .add_value(kamu::testing::MockAuthenticationService::built_in()) .bind::() .add::() - .bind::() .build(); let (catalog_anonymous, catalog_authorized) = authentication_catalogs(&base_catalog); 
Self { _tempdir: tempdir, - _base_catalog: base_catalog, + base_catalog, catalog_anonymous, catalog_authorized, } } + pub async fn init_dependencies_graph(&self) { + let dataset_repo = self + .catalog_authorized + .get_one::() + .unwrap(); + let dependency_graph_service = self + .base_catalog + .get_one::() + .unwrap(); + dependency_graph_service + .eager_initialization(&DependencyGraphRepositoryInMemory::new(dataset_repo)) + .await + .unwrap(); + } + pub async fn create_root_dataset(&self, name: DatasetName) -> CreateDatasetResult { let dataset_repo = self .catalog_authorized diff --git a/src/adapter/graphql/tests/tests/test_gql_metadata.rs b/src/adapter/graphql/tests/tests/test_gql_metadata.rs index d66ba1305b..c69daed618 100644 --- a/src/adapter/graphql/tests/tests/test_gql_metadata.rs +++ b/src/adapter/graphql/tests/tests/test_gql_metadata.rs @@ -9,6 +9,7 @@ use async_graphql::*; use dill::*; +use event_bus::EventBus; use indoc::indoc; use kamu::testing::MetadataFactory; use kamu::*; @@ -24,6 +25,7 @@ async fn test_current_push_sources() { let tempdir = tempfile::tempdir().unwrap(); let base_catalog = CatalogBuilder::new() + .add::() .add_builder( DatasetRepositoryLocalFs::builder() .with_root(tempdir.path().join("datasets")) @@ -33,13 +35,10 @@ async fn test_current_push_sources() { .add_builder(PushIngestServiceImpl::builder().with_run_info_dir(tempdir.path().join("run"))) .bind::() .add::() - .bind::() .add::() - .bind::() .add::() - .bind::() .add::() - .bind::() + .add::() .build(); // Init dataset with no sources diff --git a/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs b/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs index 77d18ddad3..8e059b43ba 100644 --- a/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs +++ b/src/adapter/graphql/tests/tests/test_gql_metadata_chain.rs @@ -10,6 +10,8 @@ use std::sync::Arc; use async_graphql::*; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use 
kamu::testing::MetadataFactory; use kamu::*; @@ -26,14 +28,15 @@ async fn test_metadata_chain_events() { let tempdir = tempfile::tempdir().unwrap(); let base_catalog = dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.path().join("datasets")) .with_multi_tenant(false), ) .bind::() .add::() - .bind::() .build(); // Init dataset @@ -175,14 +178,15 @@ async fn metadata_chain_append_event() { let tempdir = tempfile::tempdir().unwrap(); let base_catalog = dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.path().join("datasets")) .with_multi_tenant(false), ) .bind::() .add::() - .bind::() .build(); let (catalog_anonymous, catalog_authorized) = authentication_catalogs(&base_catalog); @@ -271,14 +275,15 @@ async fn metadata_update_readme_new() { let tempdir = tempfile::tempdir().unwrap(); let base_catalog = dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.path().join("datasets")) .with_multi_tenant(false), ) .bind::() .add::() - .bind::() .build(); let (catalog_anonymous, catalog_authorized) = authentication_catalogs(&base_catalog); diff --git a/src/adapter/graphql/tests/tests/test_gql_search.rs b/src/adapter/graphql/tests/tests/test_gql_search.rs index cff72f9ffe..3b4ba00d6e 100644 --- a/src/adapter/graphql/tests/tests/test_gql_search.rs +++ b/src/adapter/graphql/tests/tests/test_gql_search.rs @@ -8,6 +8,8 @@ // by the Apache License, Version 2.0. 
use async_graphql::*; +use dill::Component; +use event_bus::EventBus; use kamu::testing::MetadataFactory; use kamu::*; use kamu_core::*; @@ -18,11 +20,12 @@ async fn query() { let tempdir = tempfile::tempdir().unwrap(); let cat = dill::CatalogBuilder::new() + .add::() + .add::() .add_value(CurrentAccountSubject::new_test()) .add::() - .bind::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.path().join("datasets")) .with_multi_tenant(false), ) diff --git a/src/adapter/graphql/tests/tests/test_tasks.rs b/src/adapter/graphql/tests/tests/test_tasks.rs index 31d2091c47..26d92829cf 100644 --- a/src/adapter/graphql/tests/tests/test_tasks.rs +++ b/src/adapter/graphql/tests/tests/test_tasks.rs @@ -19,7 +19,7 @@ mockall::mock! { #[async_trait::async_trait] impl TaskScheduler for TaskScheduler { async fn get_task(&self, task_id: TaskID) -> Result; - fn list_tasks_by_dataset<'a>(&'a self, dataset_id: &DatasetID) -> TaskStateStream<'a>; + fn list_tasks_by_dataset<'a>(&'a self, dataset_id: &DatasetID) -> Result, ListTasksByDatasetError>; async fn create_task(&self, plan: LogicalPlan) -> Result; async fn cancel_task(&self, task_id: TaskID) -> Result; async fn take(&self) -> Result; @@ -150,7 +150,11 @@ async fn test_task_list_by_dataset() { let mut task_sched_mock = MockTaskScheduler::new(); task_sched_mock .expect_list_tasks_by_dataset() - .return_once(move |_| Box::pin(futures::stream::iter([Ok(returned_task)].into_iter()))); + .return_once(move |_| { + Ok(Box::pin(futures::stream::iter( + [Ok(returned_task)].into_iter(), + ))) + }); let cat = dill::CatalogBuilder::new() .add_value(task_sched_mock) diff --git a/src/adapter/http/Cargo.toml b/src/adapter/http/Cargo.toml index 814d28bc80..c087a3d632 100644 --- a/src/adapter/http/Cargo.toml +++ b/src/adapter/http/Cargo.toml @@ -21,13 +21,14 @@ doctest = false opendatafabric = { workspace = true } # TODO: Adapters should depend only on kamu-domain crate and be implementation-agnostic 
kamu = { workspace = true } +event-bus = { workspace = true } axum = { version = "0.6", features = ["ws", "headers"] } axum-extra = { version = "0.8", features = ["async-read-body"] } async-trait = "0.1" bytes = "1" chrono = { version = "0.4", features = ["serde"] } -dill = "0.7" +dill = "0.8" flate2 = "1" # GZip decoder futures = "0.3" http = "0.2" @@ -48,6 +49,7 @@ url = { version = "2", features = ["serde"] } [dev-dependencies] container-runtime = { workspace = true } +event-bus = { workspace = true } env_logger = "0.10" fs_extra = "1.3" # Recursive folder copy diff --git a/src/adapter/http/src/simple_protocol/handlers.rs b/src/adapter/http/src/simple_protocol/handlers.rs index d0ec3aa2a5..b7b48c14da 100644 --- a/src/adapter/http/src/simple_protocol/handlers.rs +++ b/src/adapter/http/src/simple_protocol/handlers.rs @@ -19,6 +19,7 @@ use std::str::FromStr; use std::sync::Arc; +use event_bus::EventBus; use kamu::domain::*; use opendatafabric::serde::flatbuffers::FlatbuffersMetadataBlockSerializer; use opendatafabric::serde::MetadataBlockSerializer; @@ -225,9 +226,12 @@ pub async fn dataset_push_ws_upgrade_handler( Err(err) => Err(err.api_err()), }?; + let event_bus = catalog.get_one::().unwrap(); + Ok(ws.on_upgrade(|socket| { AxumServerPushProtocolInstance::new( socket, + event_bus, dataset_repo, dataset_ref, dataset, diff --git a/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs b/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs index 3b662008c8..3e4e8e080a 100644 --- a/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs +++ b/src/adapter/http/src/smart_protocol/axum_server_push_protocol.rs @@ -10,12 +10,15 @@ use std::collections::VecDeque; use std::sync::Arc; +use event_bus::EventBus; +use kamu::domain::events::DatasetEventDependenciesUpdated; use kamu::domain::{ BlockRef, CorruptedSourceError, Dataset, DatasetRepository, ErrorIntoInternal, + GetSummaryOpts, ResultIntoInternal, }; use opendatafabric::{AsTypedBlock, 
DatasetRef, MetadataBlock, Multihash}; @@ -36,6 +39,7 @@ const MIN_UPLOAD_PROGRESS_PING_DELAY_SEC: u64 = 10; pub struct AxumServerPushProtocolInstance { socket: axum::extract::ws::WebSocket, + event_bus: Arc, dataset_repo: Arc, dataset_ref: DatasetRef, dataset: Option>, @@ -46,6 +50,7 @@ pub struct AxumServerPushProtocolInstance { impl AxumServerPushProtocolInstance { pub fn new( socket: axum::extract::ws::WebSocket, + event_bus: Arc, dataset_repo: Arc, dataset_ref: DatasetRef, dataset: Option>, @@ -54,6 +59,7 @@ impl AxumServerPushProtocolInstance { ) -> Self { Self { socket, + event_bus, dataset_repo, dataset_ref, dataset, @@ -314,12 +320,29 @@ impl AxumServerPushProtocolInstance { tracing::debug!("Push client sent a complete request. Commiting the dataset"); if new_blocks.len() > 0 { - dataset_append_metadata(self.dataset.as_ref().unwrap().as_ref(), new_blocks) + let dataset = self.dataset.as_ref().unwrap().as_ref(); + let response = dataset_append_metadata(dataset, new_blocks) .await .map_err(|e| { tracing::debug!("Appending dataset metadata failed with error: {}", e); PushServerError::Internal(e.int_err()) })?; + + // TODO: encapsulate this inside dataset/chain + if !response.new_upstream_ids.is_empty() { + let summary = dataset + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + self.event_bus + .dispatch_event(DatasetEventDependenciesUpdated { + dataset_id: summary.id.clone(), + new_upstream_ids: response.new_upstream_ids, + }) + .await + .int_err()?; + } } tracing::debug!("Sending completion confirmation"); diff --git a/src/adapter/http/src/smart_protocol/protocol_dataset_helper.rs b/src/adapter/http/src/smart_protocol/protocol_dataset_helper.rs index e7a12da2e6..cd97aa8b4b 100644 --- a/src/adapter/http/src/smart_protocol/protocol_dataset_helper.rs +++ b/src/adapter/http/src/smart_protocol/protocol_dataset_helper.rs @@ -194,16 +194,41 @@ pub async fn decode_metadata_batch( 
///////////////////////////////////////////////////////////////////////////////////////// +pub struct AppendMetadataResponse { + pub new_upstream_ids: Vec, +} + pub async fn dataset_append_metadata( dataset: &dyn Dataset, metadata: VecDeque<(Multihash, MetadataBlock)>, -) -> Result<(), AppendError> { +) -> Result { let old_head = metadata.front().unwrap().1.prev_block_hash.clone(); let new_head = metadata.back().unwrap().0.clone(); let metadata_chain = dataset.as_metadata_chain(); + + let mut new_upstream_ids: Vec = vec![]; + for (hash, block) in metadata { tracing::debug!(sequence_numer = %block.sequence_number, hash = %hash, "Appending block"); + + if let opendatafabric::MetadataEvent::SetTransform(transform) = &block.event { + // Collect only the latest upstream dataset IDs + new_upstream_ids.clear(); + for new_input in transform.inputs.iter() { + if let Some(id) = &new_input.id { + new_upstream_ids.push(id.clone()); + } else { + return Err(AppendError::InvalidBlock( + AppendValidationError::InvalidEvent(InvalidEventError::new( + block.event, + "Transform input with unresolved ID", + )), + )); + } + } + } + metadata_chain .append( block, @@ -227,7 +252,7 @@ pub async fn dataset_append_metadata( ) .await?; - Ok(()) + Ok(AppendMetadataResponse { new_upstream_ids }) } ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs b/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs index a522d8f151..ace68e5e4a 100644 --- a/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs +++ b/src/adapter/http/src/smart_protocol/ws_tungstenite_client.rs @@ -10,8 +10,10 @@ use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::Arc; -use dill::component; +use dill::*; +use event_bus::EventBus; use futures::SinkExt; +use kamu::domain::events::DatasetEventDependenciesUpdated; use kamu::domain::*; use kamu::utils::smart_transfer_protocol::{ DatasetFactoryFn, 
@@ -35,15 +37,21 @@ use crate::ws_common::{self, ReadMessageError, WriteMessageError}; ///////////////////////////////////////////////////////////////////////////////////////// pub struct WsSmartTransferProtocolClient { + event_bus: Arc, dataset_credential_resolver: Arc, } ///////////////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn SmartTransferProtocolClient)] impl WsSmartTransferProtocolClient { - pub fn new(dataset_credential_resolver: Arc) -> Self { + pub fn new( + event_bus: Arc, + dataset_credential_resolver: Arc, + ) -> Self { Self { + event_bus, dataset_credential_resolver, } } @@ -594,13 +602,26 @@ impl SmartTransferProtocolClient for WsSmartTransferProtocolClient { .await?; } - dataset_append_metadata(dst.as_ref(), new_blocks) + let response = dataset_append_metadata(dst.as_ref(), new_blocks) .await .map_err(|e| { tracing::debug!("Appending dataset metadata failed with error: {}", e); SyncError::Internal(e.int_err()) })?; + // TODO: encapsulate this inside dataset/chain + if !response.new_upstream_ids.is_empty() { + let summary = dst.get_summary(GetSummaryOpts::default()).await.int_err()?; + + self.event_bus + .dispatch_event(DatasetEventDependenciesUpdated { + dataset_id: summary.id.clone(), + new_upstream_ids: response.new_upstream_ids, + }) + .await + .int_err()?; + } + let new_dst_head = dst .as_metadata_chain() .get_ref(&BlockRef::Head) diff --git a/src/adapter/http/tests/harness/client_side_harness.rs b/src/adapter/http/tests/harness/client_side_harness.rs index 80f7913411..dc64225d9f 100644 --- a/src/adapter/http/tests/harness/client_side_harness.rs +++ b/src/adapter/http/tests/harness/client_side_harness.rs @@ -11,9 +11,9 @@ use std::path::PathBuf; use std::sync::Arc; use container_runtime::ContainerRuntime; -use dill::builder_for; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; -use kamu::utils::smart_transfer_protocol::SmartTransferProtocolClient; use kamu::*; 
use kamu_adapter_http::SmartTransferProtocolClientWs; use opendatafabric::{AccountName, DatasetID, DatasetRef, DatasetRefAny, DatasetRefRemote}; @@ -53,70 +53,61 @@ impl ClientSideHarness { let mut b = dill::CatalogBuilder::new(); + b.add::(); + + b.add::(); + b.add_value(CurrentAccountSubject::logged(AccountName::new_unchecked( CLIENT_ACCOUNT_NAME, ))); - b.add::() - .bind::(); + b.add::(); if options.authenticated_remotely { b.add::(); - b.bind::(); } else { b.add_value(kamu::testing::MockOdfServerAccessTokenResolver::empty()); b.bind::(); } b.add::(); - b.bind::(); b.add_builder( - builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(datasets_dir) .with_multi_tenant(options.multi_tenant), ) .bind::(); - b.add_builder(builder_for::().with_repos_dir(repos_dir)) + b.add_builder(RemoteRepositoryRegistryImpl::builder().with_repos_dir(repos_dir)) .bind::(); - b.add::() - .bind::(); + b.add::(); - b.add_value(EngineProvisionerNull) - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); b.add::(); - b.bind::(); b.add_builder( - builder_for::() + PollingIngestServiceImpl::builder() .with_run_info_dir(run_info_dir) .with_cache_dir(cache_dir), ) .bind::(); - b.add::() - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); - b.add::() - .bind::(); + b.add::(); b.add_value(ContainerRuntime::default()); b.add_value(kamu::utils::ipfs_wrapper::IpfsClient::default()); diff --git a/src/adapter/http/tests/harness/server_side_harness.rs b/src/adapter/http/tests/harness/server_side_harness.rs index 18b308d952..a7e4ea446f 100644 --- a/src/adapter/http/tests/harness/server_side_harness.rs +++ b/src/adapter/http/tests/harness/server_side_harness.rs @@ -80,7 +80,6 @@ pub(crate) fn create_cli_user_catalog(base_catalog: &dill::Catalog) -> dill::Cat SERVER_ACCOUNT_NAME, ))) .add::() - .bind::() .build() } @@ -92,9 +91,7 @@ pub(crate) fn 
create_web_user_catalog( ) -> dill::Catalog { let mut web_catalog_builder = dill::CatalogBuilder::new_chained(&base_catalog); if options.authorized_writes { - web_catalog_builder - .add::() - .bind::(); + web_catalog_builder.add::(); } else { let mut mock_dataset_action_authorizer = MockDatasetActionAuthorizer::new(); mock_dataset_action_authorizer diff --git a/src/adapter/http/tests/harness/server_side_local_fs_harness.rs b/src/adapter/http/tests/harness/server_side_local_fs_harness.rs index 70e635f481..99092c91fc 100644 --- a/src/adapter/http/tests/harness/server_side_local_fs_harness.rs +++ b/src/adapter/http/tests/harness/server_side_local_fs_harness.rs @@ -12,7 +12,8 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; -use dill::builder_for; +use dill::Component; +use event_bus::EventBus; use kamu::domain::{ auth, DatasetRepository, @@ -22,7 +23,7 @@ use kamu::domain::{ SystemTimeSourceStub, }; use kamu::testing::MockAuthenticationService; -use kamu::{DatasetLayout, DatasetRepositoryLocalFs}; +use kamu::{DatasetLayout, DatasetRepositoryLocalFs, DependencyGraphServiceInMemory}; use opendatafabric::{AccountName, DatasetAlias, DatasetHandle}; use tempfile::TempDir; use url::Url; @@ -62,10 +63,12 @@ impl ServerSideLocalFsHarness { let time_source = SystemTimeSourceStub::new(); base_catalog_builder + .add::() .add_value(time_source.clone()) .bind::() + .add::() .add_builder( - builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(datasets_dir) .with_multi_tenant(options.multi_tenant), ) diff --git a/src/adapter/http/tests/harness/server_side_s3_harness.rs b/src/adapter/http/tests/harness/server_side_s3_harness.rs index 189ea80ebd..be0574e344 100644 --- a/src/adapter/http/tests/harness/server_side_s3_harness.rs +++ b/src/adapter/http/tests/harness/server_side_s3_harness.rs @@ -12,9 +12,10 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::{ auth, - 
CurrentAccountSubject, DatasetRepository, InternalError, ResultIntoInternal, @@ -23,7 +24,7 @@ use kamu::domain::{ }; use kamu::testing::{LocalS3Server, MockAuthenticationService}; use kamu::utils::s3_context::S3Context; -use kamu::{DatasetLayout, DatasetRepositoryS3}; +use kamu::{DatasetLayout, DatasetRepositoryS3, DependencyGraphServiceInMemory}; use opendatafabric::{AccountName, DatasetAlias, DatasetHandle}; use url::Url; @@ -51,6 +52,7 @@ pub(crate) struct ServerSideS3Harness { impl ServerSideS3Harness { pub async fn new(options: ServerSideHarnessOptions) -> Self { let s3 = LocalS3Server::new().await; + let s3_context = S3Context::from_url(&s3.url).await; let time_source = SystemTimeSourceStub::new(); @@ -58,7 +60,13 @@ impl ServerSideS3Harness { base_catalog_builder .add_value(time_source.clone()) .bind::() - .add_value(s3_repo(&s3, options.multi_tenant).await) + .add::() + .add::() + .add_builder( + DatasetRepositoryS3::builder() + .with_s3_context(s3_context) + .with_multi_tenant(false), + ) .bind::() .add_value(server_authentication_mock()) .bind::(); @@ -145,15 +153,3 @@ impl ServerSideHarness for ServerSideS3Harness { } ///////////////////////////////////////////////////////////////////////////////////////// - -pub async fn s3_repo(s3: &LocalS3Server, multi_tenant: bool) -> DatasetRepositoryS3 { - let s3_context = S3Context::from_url(&s3.url).await; - DatasetRepositoryS3::new( - s3_context, - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - multi_tenant, - ) -} - -///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/adapter/http/tests/tests/test_data.rs b/src/adapter/http/tests/tests/test_data.rs index 53f90e6d69..12412c7148 100644 --- a/src/adapter/http/tests/tests/test_data.rs +++ b/src/adapter/http/tests/tests/test_data.rs @@ -8,6 +8,7 @@ // by the Apache License, Version 2.0. 
use chrono::{TimeZone, Utc}; +use dill::Component; use indoc::indoc; use kamu::domain::*; use kamu::testing::DatasetDataHelper; @@ -27,16 +28,12 @@ async fn test_data_push_ingest_handler() { let catalog = dill::CatalogBuilder::new() .add::() - .bind::() .add_builder( - dill::builder_for::() - .with_run_info_dir(run_info_dir.path().to_path_buf()), + PushIngestServiceImpl::builder().with_run_info_dir(run_info_dir.path().to_path_buf()), ) .bind::() .add::() - .bind::() - .add_value(ObjectStoreBuilderLocalFs::new()) - .bind::() + .add::() .build(); let server_harness = ServerSideLocalFsHarness::new(ServerSideHarnessOptions { diff --git a/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs b/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs index 1207a8825a..cde6df27f7 100644 --- a/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs +++ b/src/adapter/http/tests/tests/test_dataset_authorization_layer.rs @@ -12,6 +12,8 @@ use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::str::FromStr; +use dill::Component; +use event_bus::EventBus; use kamu::domain::auth::DatasetAction; use kamu::domain::{ AnonymousAccountReason, @@ -21,7 +23,7 @@ use kamu::domain::{ ResultIntoInternal, }; use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer}; -use kamu::DatasetRepositoryLocalFs; +use kamu::{DatasetRepositoryLocalFs, DependencyGraphServiceInMemory}; use mockall::predicate::{eq, function}; use opendatafabric::{DatasetAlias, DatasetHandle, DatasetKind, DatasetName, DatasetRef}; use url::Url; @@ -211,6 +213,8 @@ impl ServerHarness { let temp_dir = tempfile::TempDir::new().unwrap(); let mut catalog_builder = dill::CatalogBuilder::new(); + catalog_builder.add::(); + catalog_builder.add::(); catalog_builder.add_value( kamu::testing::MockAuthenticationService::resolving_token(kamu::domain::auth::DUMMY_ACCESS_TOKEN, kamu::domain::auth::AccountInfo::dummy()) ) @@ -221,7 +225,7 @@ impl ServerHarness { 
catalog_builder.add_value(current_account_subject); catalog_builder .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_multi_tenant(false) .with_root(temp_dir.path().join("datasets")), ) diff --git a/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs b/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs index 886f456aa2..d8d6e06a29 100644 --- a/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs +++ b/src/adapter/http/tests/tests/test_protocol_dataset_helpers.rs @@ -11,7 +11,7 @@ use std::assert_matches::assert_matches; use std::sync::Arc; use kamu::domain::auth::DUMMY_ACCESS_TOKEN; -use kamu::domain::{Dataset, DatasetRepositoryExt}; +use kamu::domain::Dataset; use kamu::testing::{MetadataFactory, TEST_BUCKET_NAME}; use kamu_adapter_http::smart_protocol::protocol_dataset_helper::*; use kamu_adapter_http::smart_protocol::{messages, BearerHeader}; diff --git a/src/adapter/http/tests/tests/test_routing.rs b/src/adapter/http/tests/tests/test_routing.rs index 15def0790d..f9108fb868 100644 --- a/src/adapter/http/tests/tests/test_routing.rs +++ b/src/adapter/http/tests/tests/test_routing.rs @@ -8,13 +8,13 @@ // by the Apache License, Version 2.0. 
use std::net::{IpAddr, Ipv4Addr, SocketAddr}; -use std::sync::Arc; use ::serde::Deserialize; use axum::extract::{FromRequestParts, Path}; use axum::routing::IntoMakeService; use axum::Router; -use dill::builder_for; +use dill::Component; +use event_bus::EventBus; use hyper::server::conn::AddrIncoming; use kamu::domain::*; use kamu::testing::*; @@ -38,15 +38,16 @@ async fn setup_repo() -> RepoFixture { std::fs::create_dir(&datasets_dir).unwrap(); let catalog = dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(datasets_dir) .with_multi_tenant(false), ) .bind::() .add_value(CurrentAccountSubject::new_test()) .add::() - .bind::() .build(); let dataset_repo = catalog.get_one::().unwrap(); @@ -110,9 +111,15 @@ where ///////////////////////////////////////////////////////////////////////////////////////// async fn setup_client(dataset_url: url::Url, head_expected: Multihash) { + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .build(); + let dataset = DatasetFactoryImpl::new( IpfsGateway::default(), - Arc::new(auth::DummyOdfServerAccessTokenResolver::new()), + catalog.get_one().unwrap(), + catalog.get_one().unwrap(), ) .get_dataset(&dataset_url, false) .await diff --git a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs index f50fdc7b44..164e0b9f6a 100644 --- a/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs +++ b/src/adapter/http/tests/tests/tests_pull/scenarios/scenario_existing_up_to_date_dataset.rs @@ -7,7 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use kamu::domain::*; use kamu::testing::MetadataFactory; use kamu::DatasetLayout; use opendatafabric::*; diff --git a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_up_to_date_dataset.rs b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_up_to_date_dataset.rs index 42b5ff0d0a..c95eee50b3 100644 --- a/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_up_to_date_dataset.rs +++ b/src/adapter/http/tests/tests/tests_push/scenarios/scenario_existing_up_to_date_dataset.rs @@ -7,7 +7,6 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use kamu::domain::*; use kamu::testing::MetadataFactory; use kamu::DatasetLayout; use opendatafabric::*; diff --git a/src/adapter/oauth/Cargo.toml b/src/adapter/oauth/Cargo.toml index 25cdb7d8a9..ff7e2bd64c 100644 --- a/src/adapter/oauth/Cargo.toml +++ b/src/adapter/oauth/Cargo.toml @@ -21,7 +21,7 @@ opendatafabric = { workspace = true } kamu-core = { workspace = true } async-trait = "0.1" -dill = "0.7" +dill = "0.8" http = "0.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "json"] } serde = "1" diff --git a/src/adapter/oauth/src/oauth_github.rs b/src/adapter/oauth/src/oauth_github.rs index 713f2b82d3..ee8ec78770 100644 --- a/src/adapter/oauth/src/oauth_github.rs +++ b/src/adapter/oauth/src/oauth_github.rs @@ -10,7 +10,7 @@ use std::collections::HashMap; use std::sync::Mutex; -use dill::{component, scope, Singleton}; +use dill::*; use kamu_core::auth::{AccountInfo, AccountType}; use kamu_core::{ErrorIntoInternal, InternalError, ResultIntoInternal}; use opendatafabric::{AccountName, FAKE_ACCOUNT_ID}; @@ -30,6 +30,7 @@ pub struct OAuthGithub { } #[component(pub)] +#[interface(dyn kamu_core::auth::AuthenticationProvider)] #[scope(Singleton)] impl OAuthGithub { pub fn new() -> Self { diff --git a/src/app/cli/Cargo.toml b/src/app/cli/Cargo.toml index 1f7a0e33ff..3348bcdd78 100644 --- 
a/src/app/cli/Cargo.toml +++ b/src/app/cli/Cargo.toml @@ -33,10 +33,12 @@ ftp = ["kamu/ftp"] # Kamu internal-error = { workspace = true } container-runtime = { workspace = true } +event-bus = { workspace = true } kamu-data-utils = { workspace = true } opendatafabric = { workspace = true } kamu = { workspace = true } kamu-task-system-inmem = { workspace = true } +kamu-flow-system-inmem = { workspace = true } kamu-adapter-auth-oso = { workspace = true } kamu-adapter-flight-sql = { workspace = true } kamu-adapter-graphql = { workspace = true } @@ -96,7 +98,7 @@ async-trait = "0.1" chrono = "0.4" cfg-if = "1" # Conditional compilation datafusion = "33" -dill = "0.7" +dill = "0.8" dirs = "5" fs_extra = "1.3" futures = "0.3" diff --git a/src/app/cli/src/app.rs b/src/app/cli/src/app.rs index 9ab95a42d6..accae1b852 100644 --- a/src/app/cli/src/app.rs +++ b/src/app/cli/src/app.rs @@ -13,7 +13,6 @@ use std::sync::Arc; use container_runtime::{ContainerRuntime, ContainerRuntimeConfig}; use dill::*; use kamu::domain::*; -use kamu::utils::smart_transfer_protocol::SmartTransferProtocolClient; use kamu::*; use crate::error::*; @@ -61,9 +60,19 @@ pub async fn run( // Configure application let (guards, base_catalog, cli_catalog, output_config) = { + let dependencies_graph_repository = prepare_dependencies_graph_repository( + &workspace_layout, + workspace_svc.is_multi_tenant_workspace(), + current_account.to_current_account_subject(), + ); + let mut base_catalog_builder = configure_base_catalog(&workspace_layout, workspace_svc.is_multi_tenant_workspace()); + base_catalog_builder + .add_value(dependencies_graph_repository) + .bind::(); + let output_config = configure_output_format(&matches, &workspace_svc); base_catalog_builder.add_value(output_config.clone()); @@ -145,6 +154,33 @@ pub async fn run( // Catalog ///////////////////////////////////////////////////////////////////////////////////////// +pub fn prepare_dependencies_graph_repository( + workspace_layout: &WorkspaceLayout, + 
multi_tenant_workspace: bool, + current_account_subject: CurrentAccountSubject, +) -> DependencyGraphRepositoryInMemory { + // Construct a special catalog just to create 1 object, but with a repository + // bound to CLI user. It also should be authorized to access any dataset. + + let special_catalog_for_graph = CatalogBuilder::new() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(workspace_layout.datasets_dir.clone()) + .with_multi_tenant(multi_tenant_workspace), + ) + .bind::() + .add_value(current_account_subject) + .add::() + .add::() + // Don't add it's own initializer, leave optional dependency uninitialized + .build(); + + let dataset_repo = special_catalog_for_graph.get_one().unwrap(); + + DependencyGraphRepositoryInMemory::new(dataset_repo) +} + // Public only for tests pub fn configure_base_catalog( workspace_layout: &WorkspaceLayout, @@ -155,112 +191,99 @@ pub fn configure_base_catalog( b.add::(); b.add::(); - b.bind::(); + + b.add::(); b.add_builder( - builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(workspace_layout.datasets_dir.clone()) .with_multi_tenant(multi_tenant_workspace), ); b.bind::(); b.add::(); - b.bind::(); b.add_builder( - builder_for::() - .with_repos_dir(workspace_layout.repos_dir.clone()), + RemoteRepositoryRegistryImpl::builder().with_repos_dir(workspace_layout.repos_dir.clone()), ); b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add_builder( - builder_for::() + PollingIngestServiceImpl::builder() .with_run_info_dir(workspace_layout.run_info_dir.clone()) .with_cache_dir(workspace_layout.cache_dir.clone()), ); b.bind::(); b.add_builder( - builder_for::() - .with_run_info_dir(workspace_layout.run_info_dir.clone()), + PushIngestServiceImpl::builder().with_run_info_dir(workspace_layout.run_info_dir.clone()), ); b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); 
b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); - b.add_value(ObjectStoreBuilderLocalFs::new()); - b.bind::(); + b.add::(); b.add_builder( - builder_for::() - .with_run_info_dir(workspace_layout.run_info_dir.clone()), + EngineProvisionerLocal::builder().with_run_info_dir(workspace_layout.run_info_dir.clone()), ); b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); b.add::(); - b.bind::(); + + // TODO: initialize graph dependencies when starting API server + b.add::(); + + b.add::(); + b.add::(); + b.add_value(kamu_flow_system_inmem::domain::FlowServiceRunConfig::new( + chrono::Duration::seconds(1), + )); + + b.add::(); + b.add::(); b.add::(); - b.bind::(); // No Github login possible for single-tenant workspace if multi_tenant_workspace { b.add::(); - b.bind::(); } b.add::(); - b.bind::(); b.add::(); b.add::(); - b.bind::(); b } @@ -274,10 +297,8 @@ pub fn configure_cli_catalog(base_catalog: &Catalog) -> CatalogBuilder { b.add::(); b.add::(); - b.bind::(); b.add::(); b.add::(); - b.bind::(); b } diff --git a/src/app/cli/src/explore/api_server.rs b/src/app/cli/src/explore/api_server.rs index 25aacbb655..bc78e9a99f 100644 --- a/src/app/cli/src/explore/api_server.rs +++ b/src/app/cli/src/explore/api_server.rs @@ -12,6 +12,8 @@ use std::sync::Arc; use dill::Catalog; use internal_error::*; +use kamu::domain::SystemTimeSource; +use kamu_flow_system_inmem::domain::FlowService; use kamu_task_system_inmem::domain::TaskExecutor; ///////////////////////////////////////////////////////////////////////////////////////// @@ -22,6 +24,8 @@ pub struct APIServer { axum::routing::IntoMakeService, >, task_executor: Arc, + flow_service: Arc, + time_source: Arc, } impl APIServer { @@ -35,6 +39,10 @@ impl APIServer { let task_executor = base_catalog.get_one().unwrap(); + let flow_service = base_catalog.get_one().unwrap(); + + let time_source = base_catalog.get_one().unwrap(); 
+ let gql_schema = kamu_adapter_graphql::schema(); let app = axum::Router::new() @@ -84,6 +92,8 @@ impl APIServer { Self { server, task_executor, + flow_service, + time_source, } } @@ -95,6 +105,7 @@ impl APIServer { tokio::select! { res = self.server => { res.int_err() }, res = self.task_executor.run() => { res.int_err() }, + res = self.flow_service.run(self.time_source.now()) => { res.int_err() } } } } diff --git a/src/app/cli/src/services/accounts/account_service.rs b/src/app/cli/src/services/accounts/account_service.rs index c5247ef12d..96777be5ab 100644 --- a/src/app/cli/src/services/accounts/account_service.rs +++ b/src/app/cli/src/services/accounts/account_service.rs @@ -11,7 +11,7 @@ use std::collections::HashMap; use std::sync::Arc; use clap::ArgMatches; -use dill::component; +use dill::*; use internal_error::{InternalError, ResultIntoInternal}; use kamu::domain::auth::{self, AccountInfo, AccountType}; use opendatafabric::{AccountName, FAKE_ACCOUNT_ID}; @@ -32,6 +32,7 @@ pub struct AccountService { } #[component(pub)] +#[interface(dyn auth::AuthenticationProvider)] impl AccountService { pub fn new(users_config: Arc) -> Self { let mut predefined_accounts: HashMap = HashMap::new(); diff --git a/src/app/cli/src/services/odf_server/access_token_registry_service.rs b/src/app/cli/src/services/odf_server/access_token_registry_service.rs index e372282d88..f4373ba70b 100644 --- a/src/app/cli/src/services/odf_server/access_token_registry_service.rs +++ b/src/app/cli/src/services/odf_server/access_token_registry_service.rs @@ -10,7 +10,7 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex}; -use dill::component; +use dill::*; use internal_error::{InternalError, ResultIntoInternal}; use kamu::domain::CurrentAccountSubject; use opendatafabric::serde::yaml::Manifest; @@ -32,6 +32,7 @@ pub struct AccessTokenRegistryService { } #[component(pub)] +#[interface(dyn kamu::domain::auth::OdfServerAccessTokenResolver)] impl AccessTokenRegistryService { pub fn new( storage: 
Arc, @@ -236,6 +237,7 @@ pub struct CLIAccessTokenStore { } #[component(pub)] +#[interface(dyn AccessTokenStore)] impl CLIAccessTokenStore { pub fn new(workspace_layout: &WorkspaceLayout) -> Self { let user_token_store_path = dirs::home_dir() diff --git a/src/app/cli/tests/tests/test_workspace_svc.rs b/src/app/cli/tests/tests/test_workspace_svc.rs index 185974a5b7..5de57ab0b5 100644 --- a/src/app/cli/tests/tests/test_workspace_svc.rs +++ b/src/app/cli/tests/tests/test_workspace_svc.rs @@ -11,6 +11,7 @@ use std::assert_matches::assert_matches; use std::error::Error; use std::path::Path; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::{MetadataFactory, ParquetWriterHelper}; use kamu::*; @@ -29,7 +30,12 @@ async fn init_v0_workspace(workspace_path: &Path) { let dataset_name = DatasetName::new_unchecked("foo"); let dataset_dir = datasets_dir.join(&dataset_name); - let dataset = DatasetFactoryImpl::get_local_fs(DatasetLayout::create(&dataset_dir).unwrap()); + + let catalog = dill::CatalogBuilder::new().add::().build(); + let dataset = DatasetFactoryImpl::get_local_fs( + DatasetLayout::create(&dataset_dir).unwrap(), + catalog.get_one().unwrap(), + ); // Metadata & refs dataset diff --git a/src/app/cli/tests/utils/kamu.rs b/src/app/cli/tests/utils/kamu.rs index 9e4254a682..2b4abaecce 100644 --- a/src/app/cli/tests/utils/kamu.rs +++ b/src/app/cli/tests/utils/kamu.rs @@ -12,6 +12,8 @@ use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::ParquetReaderHelper; use kamu::*; @@ -64,12 +66,20 @@ impl Kamu { } pub async fn get_last_data_slice(&self, dataset_name: &DatasetName) -> ParquetReaderHelper { - let dataset_repo = DatasetRepositoryLocalFs::new( - self.workspace_layout.datasets_dir.clone(), - Arc::new(self.current_account.to_current_account_subject()), - Arc::new(domain::auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - ); + let catalog = 
dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(self.current_account.to_current_account_subject()) + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(self.workspace_layout.datasets_dir.clone()) + .with_multi_tenant(false), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); let dataset = dataset_repo .get_dataset(&dataset_name.as_local_ref()) @@ -121,12 +131,22 @@ impl Kamu { S: Into, { let cli = kamu_cli::cli(); - let dataset_repo = Arc::new(DatasetRepositoryLocalFs::new( - self.workspace_layout.datasets_dir.clone(), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(domain::auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - )); + + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(self.current_account.to_current_account_subject()) + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(self.workspace_layout.datasets_dir.clone()) + .with_multi_tenant(false), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); + let config_service = Arc::new(kamu_cli::config::ConfigService::new(&self.workspace_layout)); let mut buf = Vec::new(); diff --git a/src/domain/core/Cargo.toml b/src/domain/core/Cargo.toml index f5e3463b26..999e2590fb 100644 --- a/src/domain/core/Cargo.toml +++ b/src/domain/core/Cargo.toml @@ -26,7 +26,7 @@ async-stream = { version = "0.3", default-features = false } async-trait = { version = "0.1", default-features = false } bytes = { version = "1", default-features = false } chrono = { version = "0.4", default-features = false } -dill = { version = "0.7", default-features = false } +dill = { version = "0.8", default-features = false } futures = { version = "0.3", default-features = false } http = { version = "0.2" } pathdiff = { version = "0.2", default-features = false } diff --git a/src/domain/core/src/auth/dataset_action_authorizer.rs b/src/domain/core/src/auth/dataset_action_authorizer.rs 
index dfb87e9889..a23262a9bf 100644 --- a/src/domain/core/src/auth/dataset_action_authorizer.rs +++ b/src/domain/core/src/auth/dataset_action_authorizer.rs @@ -10,7 +10,7 @@ use std::collections::HashSet; use std::str::FromStr; -use dill::component; +use dill::*; use internal_error::{ErrorIntoInternal, InternalError}; use opendatafabric::{DatasetHandle, DatasetRef}; use thiserror::Error; @@ -82,6 +82,7 @@ pub struct DatasetActionNotEnoughPermissionsError { /////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn DatasetActionAuthorizer)] pub struct AlwaysHappyDatasetActionAuthorizer {} impl AlwaysHappyDatasetActionAuthorizer { diff --git a/src/domain/core/src/auth/odf_server_access_token_resolver.rs b/src/domain/core/src/auth/odf_server_access_token_resolver.rs index 428e665106..2f0ff8075e 100644 --- a/src/domain/core/src/auth/odf_server_access_token_resolver.rs +++ b/src/domain/core/src/auth/odf_server_access_token_resolver.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use dill::component; +use dill::*; use url::Url; /////////////////////////////////////////////////////////////////////////////// @@ -19,6 +19,7 @@ pub trait OdfServerAccessTokenResolver: Send + Sync { /////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn OdfServerAccessTokenResolver)] pub struct DummyOdfServerAccessTokenResolver {} impl DummyOdfServerAccessTokenResolver { diff --git a/src/domain/core/src/entities/events.rs b/src/domain/core/src/entities/events.rs new file mode 100644 index 0000000000..65bac91f92 --- /dev/null +++ b/src/domain/core/src/entities/events.rs @@ -0,0 +1,43 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use opendatafabric::DatasetID; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DatasetEvent { + Created(DatasetEventCreated), + Deleted(DatasetEventDeleted), + DependenciesUpdated(DatasetEventDependenciesUpdated), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DatasetEventCreated { + pub dataset_id: DatasetID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DatasetEventDeleted { + pub dataset_id: DatasetID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DatasetEventDependenciesUpdated { + pub dataset_id: DatasetID, + pub new_upstream_ids: Vec, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/entities/mod.rs b/src/domain/core/src/entities/mod.rs index 1d0bdbbad3..4dd67aa47b 100644 --- a/src/domain/core/src/entities/mod.rs +++ b/src/domain/core/src/entities/mod.rs @@ -11,6 +11,7 @@ pub mod current_account_subject; pub mod dataset; pub mod dataset_summary; pub mod engine; +pub mod events; pub mod metadata_chain; pub mod metadata_stream; diff --git a/src/domain/core/src/repos/dataset_repository.rs b/src/domain/core/src/repos/dataset_repository.rs index 521df92d28..fc0f026d80 100644 --- a/src/domain/core/src/repos/dataset_repository.rs +++ b/src/domain/core/src/repos/dataset_repository.rs @@ -65,6 +65,12 @@ pub trait DatasetRepository: DatasetRegistry + Sync + Send { seed_block: MetadataBlockTyped, ) -> Result; 
+ async fn create_dataset_from_snapshot( + &self, + account_name: Option, + mut snapshot: DatasetSnapshot, + ) -> Result; + async fn rename_dataset( &self, dataset_ref: &DatasetRef, @@ -72,11 +78,6 @@ pub trait DatasetRepository: DatasetRegistry + Sync + Send { ) -> Result<(), RenameDatasetError>; async fn delete_dataset(&self, dataset_ref: &DatasetRef) -> Result<(), DeleteDatasetError>; - - fn get_downstream_dependencies<'s>( - &'s self, - dataset_ref: &'s DatasetRef, - ) -> DatasetHandleStream<'s>; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -106,12 +107,6 @@ pub trait DatasetRepositoryExt: DatasetRepository { seed: Seed, ) -> Result; - async fn create_dataset_from_snapshot( - &self, - account_name: Option, - mut snapshot: DatasetSnapshot, - ) -> Result; - async fn create_datasets_from_snapshots( &self, account_name: Option, @@ -172,125 +167,6 @@ where .await } - async fn create_dataset_from_snapshot( - &self, - account_name: Option, - mut snapshot: DatasetSnapshot, - ) -> Result { - // Validate / resolve events - // TODO: Move some of the validation to MetadataChain - for event in snapshot.metadata.iter_mut() { - match event { - MetadataEvent::Seed(_) => Err(InvalidSnapshotError::new( - "Seed event is generated and cannot be specified explicitly", - ) - .into()), - MetadataEvent::SetPollingSource(_) | MetadataEvent::AddPushSource(_) => { - if snapshot.kind != DatasetKind::Root { - Err(InvalidSnapshotError { - reason: format!("Event is only allowed on root datasets: {:?}", event), - } - .into()) - } else { - Ok(()) - } - } - MetadataEvent::SetTransform(e) => { - if snapshot.kind != DatasetKind::Derivative { - Err(InvalidSnapshotError::new( - "SetTransform is only allowed on derivative datasets", - ) - .into()) - } else { - resolve_transform_inputs(self, &snapshot.name, &mut e.inputs).await - } - } - MetadataEvent::SetDataSchema(_) => { - // It shouldn't be common to provide schema as part of the snapshot. 
In most - // cases it will inferred upon first ingest/transform. But no reason not to - // allow it. - Ok(()) - } - MetadataEvent::SetAttachments(_) - | MetadataEvent::SetInfo(_) - | MetadataEvent::SetLicense(_) - | MetadataEvent::SetVocab(_) => Ok(()), - MetadataEvent::AddData(_) - | MetadataEvent::ExecuteQuery(_) - | MetadataEvent::SetWatermark(_) - | MetadataEvent::DisablePollingSource(_) - | MetadataEvent::DisablePushSource(_) => Err(InvalidSnapshotError::new(format!( - "Event is not allowed to appear in a DatasetSnapshot: {:?}", - event - )) - .into()), - }?; - } - - // We are generating a key pair and deriving a dataset ID from it. - // The key pair is discarded for now, but in future can be used for - // proof of control over dataset and metadata signing. - let (_keypair, dataset_id) = DatasetID::from_new_keypair_ed25519(); - - let system_time = Utc::now(); - - let create_result = self - .create_dataset( - &DatasetAlias::new(account_name, snapshot.name), - MetadataBlockTyped { - system_time, - prev_block_hash: None, - event: Seed { - dataset_id, - dataset_kind: snapshot.kind, - }, - sequence_number: 0, - }, - ) - .await?; - - let chain = create_result.dataset.as_metadata_chain(); - let mut head = create_result.head.clone(); - let mut sequence_number = 1; - - for event in snapshot.metadata { - head = chain - .append( - MetadataBlock { - system_time, - prev_block_hash: Some(head), - event, - sequence_number, - }, - AppendOpts { - update_ref: None, - ..AppendOpts::default() - }, - ) - .await - .int_err()?; - - sequence_number += 1; - } - - chain - .set_ref( - &BlockRef::Head, - &head, - SetRefOpts { - validate_block_present: false, - check_ref_is: Some(Some(&create_result.head)), - }, - ) - .await - .int_err()?; - - Ok(CreateDatasetResult { - head, - ..create_result - }) - } - async fn create_datasets_from_snapshots( &self, account_name: Option, @@ -315,64 +191,6 @@ where ///////////////////////////////////////////////////////////////////////////////////////// 
-async fn resolve_transform_inputs( - repo: &T, - dataset_name: &DatasetName, - inputs: &mut Vec, -) -> Result<(), CreateDatasetFromSnapshotError> -where - T: DatasetRepository, - T: ?Sized, -{ - for input in inputs.iter_mut() { - if let Some(input_id) = &input.id { - // Input is referenced by ID - in this case we allow any name - match repo.resolve_dataset_ref(&input_id.as_local_ref()).await { - Ok(_) => Ok(()), - Err(GetDatasetError::NotFound(_)) => Err( - CreateDatasetFromSnapshotError::MissingInputs(MissingInputsError { - dataset_ref: dataset_name.into(), - missing_inputs: vec![input_id.as_local_ref()], - }), - ), - Err(GetDatasetError::Internal(e)) => Err(e.into()), - }?; - } else { - // When ID is not specified we try resolving it by name or a reference - - // When reference is available, it dominates - let input_local_ref = if let Some(dataset_ref) = &input.dataset_ref { - match dataset_ref.as_local_ref(|_| !repo.is_multi_tenant()) { - Ok(local_ref) => local_ref, - Err(_) => { - unimplemented!("Deriving from remote dataset is not supported yet"); - } - } - } else { - // Derive reference purely from a name assuming a default account - let input_alias = DatasetAlias::new(None, input.name.clone()); - input_alias.as_local_ref() - }; - - let hdl = match repo.resolve_dataset_ref(&input_local_ref).await { - Ok(hdl) => Ok(hdl), - Err(GetDatasetError::NotFound(_)) => Err( - CreateDatasetFromSnapshotError::MissingInputs(MissingInputsError { - dataset_ref: dataset_name.into(), - missing_inputs: vec![input_local_ref], - }), - ), - Err(GetDatasetError::Internal(e)) => Err(e.into()), - }?; - - input.id = Some(hdl.id); - } - } - Ok(()) -} - -///////////////////////////////////////////////////////////////////////////////////////// - fn sort_snapshots_in_dependency_order( mut snapshots: LinkedList, ) -> Vec { diff --git a/src/domain/core/src/services/dependency_graph_repository.rs b/src/domain/core/src/services/dependency_graph_repository.rs new file mode 100644 index 
0000000000..d1f72b30ec --- /dev/null +++ b/src/domain/core/src/services/dependency_graph_repository.rs @@ -0,0 +1,28 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::pin::Pin; + +use futures::Stream; +use internal_error::InternalError; +use opendatafabric::DatasetID; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait DependencyGraphRepository: Sync + Send { + fn list_dependencies_of_all_datasets(&self) -> DatasetDependenciesIDStream; +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub type DatasetDependenciesIDStream<'a> = + Pin> + Send + 'a>>; + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/dependency_graph_service.rs b/src/domain/core/src/services/dependency_graph_service.rs new file mode 100644 index 0000000000..cfddae763f --- /dev/null +++ b/src/domain/core/src/services/dependency_graph_service.rs @@ -0,0 +1,73 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use internal_error::InternalError; +use opendatafabric::DatasetID; +use thiserror::Error; +use tokio_stream::Stream; + +use crate::DependencyGraphRepository; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait DependencyGraphService: Sync + Send { + /// Forces initialization of graph data, if it wasn't initialized already. + /// Ignored if called multiple times + async fn eager_initialization( + &self, + repository: &dyn DependencyGraphRepository, + ) -> Result<(), InternalError>; + + /// Iterates over 1st level of dataset's downstream dependencies + async fn get_downstream_dependencies( + &self, + dataset_id: &DatasetID, + ) -> Result; + + /// Iterates over 1st level of dataset's upstream dependencies + async fn get_upstream_dependencies( + &self, + dataset_id: &DatasetID, + ) -> Result; +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub type DatasetIDStream<'a> = std::pin::Pin + Send + 'a>>; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Error, Debug)] +pub enum GetDownstreamDependenciesError { + #[error(transparent)] + Internal(InternalError), + + #[error(transparent)] + DatasetNotFound(#[from] DatasetNodeNotFoundError), +} + +#[derive(Error, Debug)] +pub enum GetUpstreamDependenciesError { + #[error(transparent)] + Internal(InternalError), + + #[error(transparent)] + DatasetNotFound(#[from] DatasetNodeNotFoundError), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Error, Debug)] +#[error("Dataset {dataset_id} not found")] +pub struct DatasetNodeNotFoundError { + pub dataset_id: DatasetID, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/core/src/services/mod.rs b/src/domain/core/src/services/mod.rs index 268b284e63..3f2c926d0b 100644 --- 
a/src/domain/core/src/services/mod.rs +++ b/src/domain/core/src/services/mod.rs @@ -10,6 +10,8 @@ // Re-exports pub use container_runtime::{NullPullImageListener, PullImageListener}; +pub mod dependency_graph_repository; +pub mod dependency_graph_service; pub mod engine_provisioner; pub mod ingest; pub mod provenance_service; @@ -26,6 +28,8 @@ pub mod sync_service; pub mod transform_service; pub mod verification_service; +pub use dependency_graph_repository::*; +pub use dependency_graph_service::*; pub use engine_provisioner::*; pub use ingest::*; pub use provenance_service::*; diff --git a/src/domain/core/src/utils/time_source.rs b/src/domain/core/src/utils/time_source.rs index 4bb1e37659..c0459b1bae 100644 --- a/src/domain/core/src/utils/time_source.rs +++ b/src/domain/core/src/utils/time_source.rs @@ -21,6 +21,7 @@ pub trait SystemTimeSource: Send + Sync { ///////////////////////////////////////////////////////////////////////////////////////// #[dill::component] +#[dill::interface(dyn SystemTimeSource)] pub struct SystemTimeSourceDefault; impl SystemTimeSource for SystemTimeSourceDefault { diff --git a/src/domain/flow-system/Cargo.toml b/src/domain/flow-system/Cargo.toml new file mode 100644 index 0000000000..7cfbd6a83c --- /dev/null +++ b/src/domain/flow-system/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "kamu-flow-system" +description = "Domain model of the flows management for scheduled dataset and system activities" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +readme = { workspace = true } +license-file = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +publish = { workspace = true } + + +[lib] +doctest = false + + +[dependencies] +enum-variants = { workspace = true } +event-sourcing = { workspace = true } +internal-error = { workspace = true } +opendatafabric = { workspace = true } +kamu-core = { 
workspace = true } +kamu-task-system = { workspace = true } + +async-trait = { version = "0.1", default-features = false } +chrono = { version = "0.4", default-features = false } +thiserror = { version = "1", default-features = false } +tokio-stream = { version = "0.1", default-features = false } +tracing = { version = "0.1", default-features = false } +url = { version = "2", default-features = false } + +# TODO: Make serde optional +serde = { version = "1", default-features = false, features = ["derive"] } +serde_with = { version = "3", default-features = false } diff --git a/src/domain/flow-system/src/aggregates/flow/flow.rs b/src/domain/flow-system/src/aggregates/flow/flow.rs new file mode 100644 index 0000000000..10ead14a8a --- /dev/null +++ b/src/domain/flow-system/src/aggregates/flow/flow.rs @@ -0,0 +1,147 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; +use event_sourcing::*; +use kamu_task_system::{TaskID, TaskOutcome}; +use opendatafabric::{AccountID, AccountName}; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Aggregate, Debug)] +pub struct Flow(Aggregate); + +impl Flow { + /// Creates a flow + pub fn new( + now: DateTime, + flow_id: FlowID, + flow_key: FlowKey, + trigger: FlowTrigger, + ) -> Self { + Self( + Aggregate::new( + flow_id, + FlowEventInitiated { + event_time: now, + flow_id, + flow_key, + trigger, + }, + ) + .unwrap(), + ) + } + + /// Define start condition for the history + pub fn define_start_condition( + &mut self, + now: DateTime, + start_condition: FlowStartCondition, + ) -> Result<(), ProjectionError> { + let event = FlowEventStartConditionDefined { + event_time: now, + flow_id: self.flow_id, + start_condition, + }; + self.apply(event) + } + + /// Activate at time + pub fn activate_at_time( + &mut self, + now: DateTime, + activate_at: DateTime, + ) -> Result<(), ProjectionError> { + let event = FlowEventQueued { + event_time: now, + flow_id: self.flow_id, + activate_at, + }; + self.apply(event) + } + + /// Extra trigger + pub fn add_trigger( + &mut self, + now: DateTime, + trigger: FlowTrigger, + ) -> Result<(), ProjectionError> { + let event = FlowEventTriggerAdded { + event_time: now, + flow_id: self.flow_id, + trigger, + }; + self.apply(event) + } + + /// Attaches a scheduled task + pub fn on_task_scheduled( + &mut self, + now: DateTime, + task_id: TaskID, + ) -> Result<(), ProjectionError> { + let event = FlowEventTaskScheduled { + event_time: now, + flow_id: self.flow_id, + task_id, + }; + self.apply(event) + } + + /// Task finished + pub fn on_task_finished( + &mut self, + now: DateTime, + task_id: TaskID, + task_outcome: TaskOutcome, + ) -> Result<(), ProjectionError> { + let event = FlowEventTaskFinished { + event_time: now, + flow_id: self.flow_id, + task_id, + task_outcome, + 
}; + self.apply(event) + } + + /// Checks if flow may be cancelled + pub fn can_cancel(&self) -> bool { + self.0.as_state().can_cancel() + } + + /// Cancel flow before task started + pub fn cancel( + &mut self, + now: DateTime, + by_account_id: AccountID, + by_account_name: AccountName, + ) -> Result<(), ProjectionError> { + let event = FlowEventCancelled { + event_time: now, + flow_id: self.flow_id, + by_account_id, + by_account_name, + }; + self.apply(event) + } + + /// Abort flow + pub fn abort(&mut self, now: DateTime) -> Result<(), ProjectionError> { + let event = FlowEventAborted { + event_time: now, + flow_id: self.flow_id, + }; + self.apply(event) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/aggregates/flow/mod.rs b/src/domain/flow-system/src/aggregates/flow/mod.rs new file mode 100644 index 0000000000..fe498c14b5 --- /dev/null +++ b/src/domain/flow-system/src/aggregates/flow/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; + +pub use flow::*; diff --git a/src/domain/flow-system/src/aggregates/flow_configuration/flow_configuration.rs b/src/domain/flow-system/src/aggregates/flow_configuration/flow_configuration.rs new file mode 100644 index 0000000000..e5222244ae --- /dev/null +++ b/src/domain/flow-system/src/aggregates/flow_configuration/flow_configuration.rs @@ -0,0 +1,73 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::{DateTime, Utc}; +use event_sourcing::*; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Aggregate, Debug)] +pub struct FlowConfiguration( + Aggregate, +); + +impl FlowConfiguration { + /// Creates a flow configuration + pub fn new( + now: DateTime, + flow_key: FlowKey, + paused: bool, + rule: FlowConfigurationRule, + ) -> Self { + Self( + Aggregate::new( + flow_key.clone(), + FlowConfigurationEventCreated { + event_time: now, + flow_key, + paused, + rule, + }, + ) + .unwrap(), + ) + } + + /// Modify configuration + pub fn modify_configuration( + &mut self, + now: DateTime, + paused: bool, + new_rule: FlowConfigurationRule, + ) -> Result<(), ProjectionError> { + let event = FlowConfigurationEventModified { + event_time: now, + flow_key: self.flow_key.clone(), + paused, + rule: new_rule, + }; + self.apply(event) + } + + /// Handle dataset removal + pub fn notify_dataset_removed( + &mut self, + now: DateTime, + ) -> Result<(), ProjectionError> { + let event = FlowConfigurationEventDatasetRemoved { + event_time: now, + flow_key: self.flow_key.clone(), + }; + self.apply(event) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/aggregates/flow_configuration/mod.rs b/src/domain/flow-system/src/aggregates/flow_configuration/mod.rs new file mode 100644 index 0000000000..7d2f744bf5 --- /dev/null +++ b/src/domain/flow-system/src/aggregates/flow_configuration/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration; + +pub use flow_configuration::*; diff --git a/src/domain/flow-system/src/aggregates/mod.rs b/src/domain/flow-system/src/aggregates/mod.rs new file mode 100644 index 0000000000..0ba9707f2c --- /dev/null +++ b/src/domain/flow-system/src/aggregates/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; +mod flow_configuration; + +pub use flow::*; +pub use flow_configuration::*; diff --git a/src/domain/flow-system/src/entities/flow/flow_event.rs b/src/domain/flow-system/src/entities/flow/flow_event.rs new file mode 100644 index 0000000000..995801279c --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_event.rs @@ -0,0 +1,156 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; +use enum_variants::*; +use kamu_task_system::{TaskID, TaskOutcome}; +use opendatafabric::{AccountID, AccountName}; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FlowEvent { + /// Flow initiated + Initiated(FlowEventInitiated), + /// Start condition defined + StartConditionDefined(FlowEventStartConditionDefined), + /// Queued for time + Queued(FlowEventQueued), + /// Secondary trigger added + TriggerAdded(FlowEventTriggerAdded), + /// Scheduled/Rescheduled a task + TaskScheduled(FlowEventTaskScheduled), + /// Finished task + TaskFinished(FlowEventTaskFinished), + /// Cancelled update (user or admin initiative) + Cancelled(FlowEventCancelled), + /// Aborted update (system factor, such as dataset delete) + Aborted(FlowEventAborted), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventInitiated { + pub event_time: DateTime, + pub flow_id: FlowID, + pub flow_key: FlowKey, + pub trigger: FlowTrigger, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventStartConditionDefined { + pub event_time: DateTime, + pub flow_id: FlowID, + pub start_condition: FlowStartCondition, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventQueued { + pub event_time: DateTime, + pub flow_id: FlowID, + pub activate_at: DateTime, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventTriggerAdded { + pub event_time: DateTime, + pub flow_id: FlowID, + pub trigger: FlowTrigger, +} +
+///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventTaskScheduled { + pub event_time: DateTime, + pub flow_id: FlowID, + pub task_id: TaskID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventTaskFinished { + pub event_time: DateTime, + pub flow_id: FlowID, + pub task_id: TaskID, + pub task_outcome: TaskOutcome, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventCancelled { + pub event_time: DateTime, + pub flow_id: FlowID, + pub by_account_id: AccountID, + pub by_account_name: AccountName, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowEventAborted { + pub event_time: DateTime, + pub flow_id: FlowID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl FlowEvent { + pub fn flow_id(&self) -> FlowID { + match self { + FlowEvent::Initiated(e) => e.flow_id, + FlowEvent::StartConditionDefined(e) => e.flow_id, + FlowEvent::Queued(e) => e.flow_id, + FlowEvent::TriggerAdded(e) => e.flow_id, + FlowEvent::TaskScheduled(e) => e.flow_id, + FlowEvent::TaskFinished(e) => e.flow_id, + FlowEvent::Cancelled(e) => e.flow_id, + FlowEvent::Aborted(e) => e.flow_id, + } + } + + pub fn event_time(&self) -> &DateTime { + match self { + FlowEvent::Initiated(e) => &e.event_time, + FlowEvent::StartConditionDefined(e) => &e.event_time, + FlowEvent::Queued(e) => &e.event_time, + FlowEvent::TriggerAdded(e) => &e.event_time, + FlowEvent::TaskScheduled(e) => &e.event_time, + FlowEvent::TaskFinished(e) => &e.event_time, + FlowEvent::Cancelled(e) => &e.event_time, + FlowEvent::Aborted(e) => &e.event_time, + } + } +} + 
+impl_enum_with_variants!(FlowEvent); + +impl_enum_variant!(FlowEvent::Initiated(FlowEventInitiated)); +impl_enum_variant!(FlowEvent::StartConditionDefined( + FlowEventStartConditionDefined +)); +impl_enum_variant!(FlowEvent::Queued(FlowEventQueued)); +impl_enum_variant!(FlowEvent::TriggerAdded(FlowEventTriggerAdded)); +impl_enum_variant!(FlowEvent::TaskScheduled(FlowEventTaskScheduled)); +impl_enum_variant!(FlowEvent::TaskFinished(FlowEventTaskFinished)); +impl_enum_variant!(FlowEvent::Cancelled(FlowEventCancelled)); +impl_enum_variant!(FlowEvent::Aborted(FlowEventAborted)); + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_id.rs b/src/domain/flow-system/src/entities/flow/flow_id.rs new file mode 100644 index 0000000000..d8a4900c1b --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_id.rs @@ -0,0 +1,43 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +///////////////////////////////////////////////////////////////////////////////////////// + +use internal_error::InternalError; + +/// Uniquely identifies a flow +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct FlowID(u64); + +impl FlowID { + pub fn new(id: u64) -> Self { + Self(id) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl std::fmt::Display for FlowID { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl Into for FlowID { + fn into(self) -> u64 { + self.0 + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub type FlowIDStream<'a> = + std::pin::Pin> + Send + 'a>>; + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_outcome.rs b/src/domain/flow-system/src/entities/flow/flow_outcome.rs new file mode 100644 index 0000000000..feaf737150 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_outcome.rs @@ -0,0 +1,24 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FlowOutcome { + /// Update succeeded + Success, + /// Update failed to complete, even after retry logic + Failed, + /// Update was cancelled by a user + Cancelled, + /// Update was aborted by system by force + Aborted, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_start_condition.rs b/src/domain/flow-system/src/entities/flow/flow_start_condition.rs new file mode 100644 index 0000000000..590df12b49 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_start_condition.rs @@ -0,0 +1,34 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::Duration; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FlowStartCondition { + Throttling(FlowStartConditionThrottling), + Batching(FlowStartConditionBatching), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FlowStartConditionThrottling { + pub interval: Duration, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FlowStartConditionBatching { + pub threshold_new_records: usize, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_state.rs b/src/domain/flow-system/src/entities/flow/flow_state.rs new file mode 100644 index 0000000000..7b4ecc6d38 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_state.rs @@ -0,0 +1,208 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; +use event_sourcing::*; +use kamu_task_system::{TaskID, TaskOutcome}; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowState { + /// Unique flow identifier + pub flow_id: FlowID, + /// Flow key + pub flow_key: FlowKey, + /// Activating at time + pub activate_at: Option>, + /// Associated task IDs + pub task_ids: Vec, + /// Flow outcome + pub outcome: Option, + /// Termination time (cancel or abort) + pub terminated_at: Option>, +} + +impl FlowState { + /// Checks if flow may be cancelled + pub fn can_cancel(&self) -> bool { + !self.outcome.is_some() && self.task_ids.is_empty() && self.terminated_at.is_none() + } + + /// Computes status + pub fn status(&self) -> FlowStatus { + if self.outcome.is_some() { + FlowStatus::Finished + } else if !self.task_ids.is_empty() { + FlowStatus::Scheduled + } else if self.activate_at.is_some() { + FlowStatus::Queued + } else { + FlowStatus::Draft + } + } +} + +impl Projection for FlowState { + type Query = FlowID; + type Event = FlowEvent; + + fn apply(state: Option, event: Self::Event) -> Result> { + use FlowEvent as E; + + match (state, event) { + (None, event) => match event { + E::Initiated(FlowEventInitiated { + event_time: _, + flow_id, + flow_key, + trigger: _, + }) => Ok(Self { + flow_id, + flow_key, + activate_at: None, + task_ids: vec![], + outcome: None, + terminated_at: None, + }), + _ => Err(ProjectionError::new(None, event)), + }, + (Some(s), event) => { + assert_eq!(s.flow_id, event.flow_id()); + + match &event { + E::Initiated(_) => Err(ProjectionError::new(Some(s), event)), + E::StartConditionDefined(FlowEventStartConditionDefined { + event_time: _, + flow_id: _, + start_condition: _, + }) => { + if s.outcome.is_some() || !s.task_ids.is_empty() { + Err(ProjectionError::new(Some(s), event)) + } else { + Ok(s) + } + } + E::Queued(FlowEventQueued { + event_time: _, +
flow_id: _, + activate_at, + }) => { + if s.outcome.is_some() || !s.task_ids.is_empty() { + Err(ProjectionError::new(Some(s), event)) + } else { + Ok(FlowState { + activate_at: Some(*activate_at), + ..s + }) + } + } + E::TriggerAdded(FlowEventTriggerAdded { + event_time: _, + flow_id: _, + trigger: _, + }) => { + if s.outcome.is_some() { + Err(ProjectionError::new(Some(s), event)) + } else { + Ok(s) + } + } + E::TaskScheduled(FlowEventTaskScheduled { + event_time: _, + flow_id: _, + task_id, + }) => { + if s.outcome.is_some() || s.activate_at.is_none() { + Err(ProjectionError::new(Some(s), event)) + } else { + let mut task_ids = s.task_ids.clone(); + task_ids.push(*task_id); + + Ok(FlowState { task_ids, ..s }) + } + } + E::TaskFinished(FlowEventTaskFinished { + event_time, + flow_id: _, + task_id, + task_outcome, + }) => { + if !s.task_ids.contains(task_id) { + Err(ProjectionError::new(Some(s), event)) + } else if s.outcome.is_some() { + // Ignore for idempotence motivation + Ok(s) + } else { + match task_outcome { + TaskOutcome::Success => Ok(FlowState { + outcome: Some(FlowOutcome::Success), + ..s + }), + TaskOutcome::Cancelled => Ok(FlowState { + outcome: Some(FlowOutcome::Cancelled), + terminated_at: Some(event_time.clone()), + ..s + }), + // TODO: support retries + TaskOutcome::Failed => Ok(FlowState { + outcome: Some(FlowOutcome::Failed), + ..s + }), + } + } + } + E::Cancelled(FlowEventCancelled { + event_time, + flow_id: _, + by_account_id: _, + by_account_name: _, + }) => { + if s.outcome.is_some() || !s.task_ids.is_empty() { + Err(ProjectionError::new(Some(s), event)) + } else { + Ok(FlowState { + outcome: Some(FlowOutcome::Cancelled), + terminated_at: Some(event_time.clone()), + ..s + }) + } + } + E::Aborted(FlowEventAborted { + event_time, + flow_id: _, + }) => { + if s.outcome.is_some() { + // Ignore for idempotence reasons + Ok(s) + } else { + Ok(FlowState { + outcome: Some(FlowOutcome::Aborted), + terminated_at: Some(event_time.clone()), + ..s + }) + 
} + } + } + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl ProjectionEvent for FlowEvent { + fn matches_query(&self, query: &FlowID) -> bool { + self.flow_id() == *query + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_status.rs b/src/domain/flow-system/src/entities/flow/flow_status.rs new file mode 100644 index 0000000000..b1a1fd55de --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_status.rs @@ -0,0 +1,20 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FlowStatus { + Draft, + Queued, + Scheduled, + Finished, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/flow_trigger.rs b/src/domain/flow-system/src/entities/flow/flow_trigger.rs new file mode 100644 index 0000000000..30f1c7ff5c --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/flow_trigger.rs @@ -0,0 +1,53 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use opendatafabric::{AccountID, AccountName, DatasetID}; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FlowTrigger { + Manual(FlowTriggerManual), + AutoPolling(FlowTriggerAutoPolling), + Push(FlowTriggerPush), + InputDatasetFlow(FlowTriggerInputDatasetFlow), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowTriggerManual { + pub initiator_account_id: AccountID, + pub initiator_account_name: AccountName, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowTriggerAutoPolling {} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowTriggerPush { + // TODO: source (HTTP, MQTT, CMD, ...) +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowTriggerInputDatasetFlow { + pub input_dataset_id: DatasetID, + pub input_flow_type: DatasetFlowType, + pub input_flow_id: FlowID, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow/mod.rs b/src/domain/flow-system/src/entities/flow/mod.rs new file mode 100644 index 0000000000..5b3b1d38d8 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow/mod.rs @@ -0,0 +1,24 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +mod flow_event; +mod flow_id; +mod flow_outcome; +mod flow_start_condition; +mod flow_state; +mod flow_status; +mod flow_trigger; + +pub use flow_event::*; +pub use flow_id::*; +pub use flow_outcome::*; +pub use flow_start_condition::*; +pub use flow_state::*; +pub use flow_status::*; +pub use flow_trigger::*; diff --git a/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_event.rs b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_event.rs new file mode 100644 index 0000000000..827684d68d --- /dev/null +++ b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_event.rs @@ -0,0 +1,85 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; +use enum_variants::*; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FlowConfigurationEvent { + Created(FlowConfigurationEventCreated), + Modified(FlowConfigurationEventModified), + DatasetRemoved(FlowConfigurationEventDatasetRemoved), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowConfigurationEventCreated { + pub event_time: DateTime, + pub flow_key: FlowKey, + pub paused: bool, + pub rule: FlowConfigurationRule, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowConfigurationEventModified { + pub event_time: DateTime, + pub flow_key: FlowKey, + pub paused: bool, + pub rule: FlowConfigurationRule, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowConfigurationEventDatasetRemoved { + pub event_time: DateTime, + pub flow_key: FlowKey, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl FlowConfigurationEvent { + pub fn flow_key(&self) -> &FlowKey { + match self { + FlowConfigurationEvent::Created(e) => &e.flow_key, + FlowConfigurationEvent::Modified(e) => &e.flow_key, + FlowConfigurationEvent::DatasetRemoved(e) => &e.flow_key, + } + } + + pub fn event_time(&self) -> &DateTime { + match self { + FlowConfigurationEvent::Created(e) => &e.event_time, + FlowConfigurationEvent::Modified(e) => &e.event_time, + FlowConfigurationEvent::DatasetRemoved(e) => &e.event_time, + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl_enum_with_variants!(FlowConfigurationEvent); 
+impl_enum_variant!(FlowConfigurationEvent::Created( + FlowConfigurationEventCreated +)); +impl_enum_variant!(FlowConfigurationEvent::Modified( + FlowConfigurationEventModified +)); +impl_enum_variant!(FlowConfigurationEvent::DatasetRemoved( + FlowConfigurationEventDatasetRemoved +)); + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_rule.rs b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_rule.rs new file mode 100644 index 0000000000..38c59811bb --- /dev/null +++ b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_rule.rs @@ -0,0 +1,30 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::Duration; + +use crate::Schedule; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FlowConfigurationRule { + Schedule(Schedule), + StartCondition(StartConditionConfiguration), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StartConditionConfiguration { + pub throttling_period: Option, + pub minimal_data_batch: Option, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_state.rs b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_state.rs new file mode 100644 index 0000000000..dd637022c3 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_state.rs @@ -0,0 +1,112 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use event_sourcing::*; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FlowConfigurationState { + /// Flow key + pub flow_key: FlowKey, + /// Flow configuration rule + pub rule: FlowConfigurationRule, + /// Configuration status + pub status: FlowConfigurationStatus, +} + +impl FlowConfigurationState { + pub fn is_active(&self) -> bool { + self.status.is_active() + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl Projection for FlowConfigurationState { + type Query = FlowKey; + type Event = FlowConfigurationEvent; + + fn apply(state: Option, event: Self::Event) -> Result> { + use FlowConfigurationEvent as E; + + match (state, event) { + (None, event) => match event { + E::Created(FlowConfigurationEventCreated { + event_time: _, + flow_key, + paused, + rule, + }) => Ok(Self { + flow_key, + status: if paused { + FlowConfigurationStatus::PausedTemporarily + } else { + FlowConfigurationStatus::Active + }, + rule, + }), + _ => Err(ProjectionError::new(None, event)), + }, + (Some(s), event) => { + assert_eq!(&s.flow_key, event.flow_key()); + + match &event { + E::Created(_) => Err(ProjectionError::new(Some(s), event)), + + E::Modified(FlowConfigurationEventModified { + event_time: _, + flow_key: _, + paused, + rule, + }) => { + // Note: when deleted dataset is re-added with the same id, we have to + // gracefully react on this, as if it wasn't a terminal state + Ok(FlowConfigurationState { + status: if *paused { + FlowConfigurationStatus::PausedTemporarily + } else { + FlowConfigurationStatus::Active + }, + rule: rule.clone(), + ..s + }) + } + + E::DatasetRemoved(_) => { + if let FlowKey::Dataset(_) = &s.flow_key { + if s.status == FlowConfigurationStatus::StoppedPermanently { + Ok(s) // idempotent DELETE + } else { + Ok(FlowConfigurationState { + status: FlowConfigurationStatus::StoppedPermanently, 
+ ..s + }) + } + } else { + Err(ProjectionError::new(Some(s), event)) + } + } + } + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl ProjectionEvent for FlowConfigurationEvent { + fn matches_query(&self, query: &FlowKey) -> bool { + self.flow_key() == query + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_status.rs b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_status.rs new file mode 100644 index 0000000000..32d6fff5a0 --- /dev/null +++ b/src/domain/flow-system/src/entities/flow_configuration/flow_configuration_status.rs @@ -0,0 +1,29 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FlowConfigurationStatus { + Active, + PausedTemporarily, + StoppedPermanently, +} + +impl FlowConfigurationStatus { + pub fn is_active(&self) -> bool { + match self { + FlowConfigurationStatus::Active => true, + FlowConfigurationStatus::PausedTemporarily => false, + FlowConfigurationStatus::StoppedPermanently => false, + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/flow_configuration/mod.rs b/src/domain/flow-system/src/entities/flow_configuration/mod.rs new file mode 100644 index 0000000000..abda1e0b8c --- /dev/null +++ b/src/domain/flow-system/src/entities/flow_configuration/mod.rs @@ -0,0 +1,18 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration_event; +mod flow_configuration_rule; +mod flow_configuration_state; +mod flow_configuration_status; + +pub use flow_configuration_event::*; +pub use flow_configuration_rule::*; +pub use flow_configuration_state::*; +pub use flow_configuration_status::*; diff --git a/src/domain/flow-system/src/entities/mod.rs b/src/domain/flow-system/src/entities/mod.rs new file mode 100644 index 0000000000..59363d2ae9 --- /dev/null +++ b/src/domain/flow-system/src/entities/mod.rs @@ -0,0 +1,16 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; +mod flow_configuration; +mod shared; + +pub use flow::*; +pub use flow_configuration::*; +pub use shared::*; diff --git a/src/domain/flow-system/src/entities/shared/dataset_flow_type.rs b/src/domain/flow-system/src/entities/shared/dataset_flow_type.rs new file mode 100644 index 0000000000..c2ea880ea2 --- /dev/null +++ b/src/domain/flow-system/src/entities/shared/dataset_flow_type.rs @@ -0,0 +1,29 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)] +pub enum DatasetFlowType { + Ingest, + ExecuteQuery, + Compaction, +} + +impl DatasetFlowType { + pub fn all() -> &'static [DatasetFlowType] { + &[Self::Ingest, Self::ExecuteQuery, Self::Compaction] + } + + pub fn is_dataset_update(&self) -> bool { + *self == DatasetFlowType::Ingest || *self == DatasetFlowType::ExecuteQuery + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/shared/flow_key.rs b/src/domain/flow-system/src/entities/shared/flow_key.rs new file mode 100644 index 0000000000..4c931fc2b4 --- /dev/null +++ b/src/domain/flow-system/src/entities/shared/flow_key.rs @@ -0,0 +1,68 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use opendatafabric::DatasetID; + +use crate::{DatasetFlowType, SystemFlowType}; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub enum FlowKey { + Dataset(FlowKeyDataset), + System(FlowKeySystem), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FlowKeyDataset { + pub dataset_id: DatasetID, + pub flow_type: DatasetFlowType, +} + +impl FlowKeyDataset { + pub fn new(dataset_id: DatasetID, flow_type: DatasetFlowType) -> Self { + Self { + dataset_id, + flow_type, + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FlowKeySystem { + pub flow_type: SystemFlowType, +} + +impl FlowKeySystem { + pub fn new(flow_type: SystemFlowType) -> Self { + Self { flow_type } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From for FlowKey { + fn from(value: FlowKeyDataset) -> Self { + Self::Dataset(value) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From for FlowKey { + fn from(value: SystemFlowType) -> Self { + Self::System(FlowKeySystem::new(value)) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/shared/mod.rs b/src/domain/flow-system/src/entities/shared/mod.rs new file mode 100644 index 0000000000..d2ae0b8b9d --- /dev/null +++ b/src/domain/flow-system/src/entities/shared/mod.rs @@ -0,0 +1,18 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod dataset_flow_type; +mod flow_key; +mod schedule; +mod system_flow_type; + +pub use dataset_flow_type::*; +pub use flow_key::*; +pub use schedule::*; +pub use system_flow_type::*; diff --git a/src/domain/flow-system/src/entities/shared/schedule.rs b/src/domain/flow-system/src/entities/shared/schedule.rs new file mode 100644 index 0000000000..7967490fe0 --- /dev/null +++ b/src/domain/flow-system/src/entities/shared/schedule.rs @@ -0,0 +1,58 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; + +///////////////////////////////////////////////////////////////////////////////////////// + +/// Represents dataset update settings +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Schedule { + /// Time-delta based schedule + TimeDelta(ScheduleTimeDelta), + /// Cron-based schedule + CronExpression(ScheduleCronExpression), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ScheduleTimeDelta { + pub every: chrono::Duration, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ScheduleCronExpression { + pub expression: String, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl Schedule { + pub fn next_activation_time(&self, now: DateTime) -> DateTime { + match self { + Schedule::TimeDelta(td) => now + td.every, + Schedule::CronExpression(_) => { + unimplemented!() + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From for Schedule { + fn from(value: chrono::Duration) -> Self { + Self::TimeDelta(ScheduleTimeDelta { every: value }) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/entities/shared/system_flow_type.rs b/src/domain/flow-system/src/entities/shared/system_flow_type.rs new file mode 100644 index 0000000000..82aa977d70 --- /dev/null +++ b/src/domain/flow-system/src/entities/shared/system_flow_type.rs @@ -0,0 +1,23 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum SystemFlowType { + GC, +} + +impl SystemFlowType { + pub fn all() -> &'static [SystemFlowType] { + &[Self::GC] + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/lib.rs b/src/domain/flow-system/src/lib.rs new file mode 100644 index 0000000000..ecdc3b583a --- /dev/null +++ b/src/domain/flow-system/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +// Re-exports +pub use event_sourcing::*; + +pub mod aggregates; +pub mod entities; +pub mod repos; +pub mod services; + +pub use aggregates::*; +pub use entities::*; +pub use repos::*; +pub use services::*; diff --git a/src/domain/flow-system/src/repos/flow/flow_event_store.rs b/src/domain/flow-system/src/repos/flow/flow_event_store.rs new file mode 100644 index 0000000000..799ad60c3b --- /dev/null +++ b/src/domain/flow-system/src/repos/flow/flow_event_store.rs @@ -0,0 +1,53 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use event_sourcing::EventStore; +use opendatafabric::DatasetID; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait FlowEventStore: EventStore { + /// Generates new unique flow identifier + fn new_flow_id(&self) -> FlowID; + + /// Returns the last dataset flow of certain type + fn get_last_dataset_flow_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Option; + + /// Returns the last system flow of certain type + fn get_last_system_flow_of_type(&self, flow_type: SystemFlowType) -> Option; + + /// Returns the flows of certain type associated with the specified dataset + /// in reverse chronological order based on creation time + fn get_flows_by_dataset_of_type<'a>( + &'a self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> FlowIDStream<'a>; + + /// Returns the flows of certain type in reverse chronological order based + /// on creation time + fn get_system_flows_of_type<'a>(&'a self, flow_type: SystemFlowType) -> FlowIDStream<'a>; + + /// Returns the flows of any type associated with the specified dataset + /// in reverse chronological order based on creation time + fn get_all_flows_by_dataset<'a>(&'a self, dataset_id: &DatasetID) -> FlowIDStream<'a>; + + /// Returns the flows of any type in reverse chronological order + /// based on creation time + fn get_all_flows<'a>(&'a self) -> FlowIDStream<'a>; +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/repos/flow/mod.rs b/src/domain/flow-system/src/repos/flow/mod.rs new file mode 100644 index 0000000000..a5f601dd99 --- /dev/null +++ b/src/domain/flow-system/src/repos/flow/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_event_store; + +pub use flow_event_store::*; diff --git a/src/domain/flow-system/src/repos/flow_configuration/flow_configuration_event_store.rs b/src/domain/flow-system/src/repos/flow_configuration/flow_configuration_event_store.rs new file mode 100644 index 0000000000..76345d0189 --- /dev/null +++ b/src/domain/flow-system/src/repos/flow_configuration/flow_configuration_event_store.rs @@ -0,0 +1,24 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use event_sourcing::EventStore; +use kamu_core::DatasetIDStream; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait FlowConfigurationEventStore: EventStore { + /// Returns all unique values of dataset IDs associated with update configs + // TODO: re-consider performance impact + fn list_all_dataset_ids<'a>(&'a self) -> DatasetIDStream<'a>; +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/repos/flow_configuration/mod.rs b/src/domain/flow-system/src/repos/flow_configuration/mod.rs new file mode 100644 index 0000000000..3f30a96ec1 --- /dev/null +++ b/src/domain/flow-system/src/repos/flow_configuration/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration_event_store; + +pub use flow_configuration_event_store::*; diff --git a/src/domain/flow-system/src/repos/mod.rs b/src/domain/flow-system/src/repos/mod.rs new file mode 100644 index 0000000000..0ba9707f2c --- /dev/null +++ b/src/domain/flow-system/src/repos/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; +mod flow_configuration; + +pub use flow::*; +pub use flow_configuration::*; diff --git a/src/domain/flow-system/src/services/flow/flow_service.rs b/src/domain/flow-system/src/services/flow/flow_service.rs new file mode 100644 index 0000000000..08eda99f64 --- /dev/null +++ b/src/domain/flow-system/src/services/flow/flow_service.rs @@ -0,0 +1,189 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; +use event_sourcing::LoadError; +use internal_error::{ErrorIntoInternal, InternalError}; +use kamu_core::DatasetNotFoundError; +use opendatafabric::{AccountID, AccountName, DatasetID}; +use tokio_stream::Stream; + +use crate::{DatasetFlowType, FlowID, FlowKey, FlowState, SystemFlowType}; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait FlowService: Sync + Send { + /// Runs the update main loop + async fn run(&self, planned_start_time: DateTime) -> Result<(), InternalError>; + + /// Triggers the specified flow manually, unless it's already waiting + async fn trigger_manual_flow( + &self, + trigger_time: DateTime, + flow_key: FlowKey, + initiator_account_id: AccountID, + initiator_account_name: AccountName, + ) -> Result; + + /// Returns states of flows of certain type associated with a given dataset + /// ordered by creation time from newest to oldest + fn list_flows_by_dataset_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Result; + + /// Returns states of system flows of certain type + /// ordered by creation time from newest to oldest + fn list_system_flows_of_type( + &self, + flow_type: SystemFlowType, + ) -> Result; + + /// Returns states of flows of any type associated with a given dataset + /// ordered by creation time from newest to oldest + fn list_all_flows_by_dataset( + &self, + dataset_id: &DatasetID, + ) -> Result; + + /// Returns state of all flows, whether they are system-level or + /// dataset-bound, ordered by creation time from newest to oldest + fn list_all_flows(&self) -> Result; + + /// Returns state of the latest flow of certain type created for the given + /// dataset + async fn get_last_flow_by_dataset_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Result, GetLastDatasetFlowError>; + + /// Returns state of the latest system flow of certain type + async fn
get_last_system_flow_of_type( + &self, + flow_type: SystemFlowType, + ) -> Result, GetLastSystemtFlowError>; + + /// Returns current state of a given flow + async fn get_flow(&self, flow_id: FlowID) -> Result; + + /// Attempts to cancel the given flow + async fn cancel_flow( + &self, + flow_id: FlowID, + by_account_id: AccountID, + by_account_name: AccountName, + ) -> Result; +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub type FlowStateStream<'a> = + std::pin::Pin> + Send + 'a>>; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(thiserror::Error, Debug)] +pub enum RequestFlowError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum ListFlowsByDatasetError { + #[error(transparent)] + DatasetNotFound(#[from] DatasetNotFoundError), + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum ListSystemFlowsError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum GetLastDatasetFlowError { + #[error(transparent)] + DatasetNotFound(#[from] DatasetNotFoundError), + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum GetLastSystemtFlowError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum GetFlowError { + #[error(transparent)] + NotFound(#[from] FlowNotFoundError), + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum CancelFlowError { + #[error(transparent)] + NotFound(#[from] FlowNotFoundError), + #[error(transparent)] + Internal(#[from] InternalError), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(thiserror::Error, Debug)] +#[error("Flow {flow_id} not found")] 
+pub struct FlowNotFoundError { + pub flow_id: FlowID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From> for GetFlowError { + fn from(value: LoadError) -> Self { + match value { + LoadError::NotFound(err) => Self::NotFound(FlowNotFoundError { flow_id: err.query }), + LoadError::ProjectionError(err) => Self::Internal(err.int_err()), + LoadError::Internal(err) => Self::Internal(err), + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From> for CancelFlowError { + fn from(value: LoadError) -> Self { + match value { + LoadError::NotFound(err) => Self::NotFound(FlowNotFoundError { flow_id: err.query }), + LoadError::ProjectionError(err) => Self::Internal(err.int_err()), + LoadError::Internal(err) => Self::Internal(err), + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug)] +pub struct FlowServiceRunConfig { + pub awaiting_step: chrono::Duration, +} + +impl FlowServiceRunConfig { + pub fn new(awaiting_step: chrono::Duration) -> Self { + Self { awaiting_step } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/services/flow/flow_service_event.rs b/src/domain/flow-system/src/services/flow/flow_service_event.rs new file mode 100644 index 0000000000..93c5cd7141 --- /dev/null +++ b/src/domain/flow-system/src/services/flow/flow_service_event.rs @@ -0,0 +1,52 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use chrono::{DateTime, Utc}; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub enum FlowServiceEvent { + ConfigurationLoaded(FlowServiceEventConfigurationLoaded), + ExecutedTimeSlot(FlowServiceEventExecutedTimeSlot), + FlowFinished(FlowServiceEventFlowFinished), +} + +impl FlowServiceEvent { + pub fn event_time(&self) -> DateTime { + match self { + FlowServiceEvent::ConfigurationLoaded(e) => e.event_time, + FlowServiceEvent::ExecutedTimeSlot(e) => e.event_time, + FlowServiceEvent::FlowFinished(e) => e.event_time, + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct FlowServiceEventConfigurationLoaded { + pub event_time: DateTime, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct FlowServiceEventExecutedTimeSlot { + pub event_time: DateTime, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Debug, Clone)] +pub struct FlowServiceEventFlowFinished { + pub event_time: DateTime, +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/services/flow/mod.rs b/src/domain/flow-system/src/services/flow/mod.rs new file mode 100644 index 0000000000..c88f895cc6 --- /dev/null +++ b/src/domain/flow-system/src/services/flow/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +mod flow_service; +mod flow_service_event; + +pub use flow_service::*; +pub use flow_service_event::*; diff --git a/src/domain/flow-system/src/services/flow_configuration/flow_configuration_service.rs b/src/domain/flow-system/src/services/flow_configuration/flow_configuration_service.rs new file mode 100644 index 0000000000..51042282a4 --- /dev/null +++ b/src/domain/flow-system/src/services/flow_configuration/flow_configuration_service.rs @@ -0,0 +1,80 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use chrono::{DateTime, Utc}; +use event_sourcing::TryLoadError; +use internal_error::{ErrorIntoInternal, InternalError}; +use tokio_stream::Stream; + +use crate::{FlowConfigurationRule, FlowConfigurationState, FlowKey}; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait FlowConfigurationService: Sync + Send { + /// Find current configuration of a certain type + async fn find_configuration( + &self, + flow_key: FlowKey, + ) -> Result, FindFlowConfigurationError>; + + /// Set or modify flow configuration + async fn set_configuration( + &self, + request_time: DateTime, + flow_key: FlowKey, + paused: bool, + rule: FlowConfigurationRule, + ) -> Result; + + /// Lists all flow configurations, which are currently enabled + fn list_enabled_configurations(&self) -> FlowConfigurationStateStream; +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(thiserror::Error, Debug)] +pub enum SetFlowConfigurationError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +#[derive(thiserror::Error, Debug)] +pub enum
FindFlowConfigurationError { + #[error(transparent)] + Internal(#[from] InternalError), +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub type FlowConfigurationStateStream<'a> = std::pin::Pin< + Box> + Send + 'a>, +>; + +///////////////////////////////////////////////////////////////////////////////////////// + +impl From> for FindFlowConfigurationError { + fn from(value: TryLoadError) -> Self { + match value { + TryLoadError::ProjectionError(err) => Self::Internal(err.int_err()), + TryLoadError::Internal(err) => Self::Internal(err), + } + } +} + +impl From> for SetFlowConfigurationError { + fn from(value: TryLoadError) -> Self { + match value { + TryLoadError::ProjectionError(err) => Self::Internal(err.int_err()), + TryLoadError::Internal(err) => Self::Internal(err), + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/flow-system/src/services/flow_configuration/mod.rs b/src/domain/flow-system/src/services/flow_configuration/mod.rs new file mode 100644 index 0000000000..0d2d68df9a --- /dev/null +++ b/src/domain/flow-system/src/services/flow_configuration/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration_service; + +pub use flow_configuration_service::*; diff --git a/src/domain/flow-system/src/services/mod.rs b/src/domain/flow-system/src/services/mod.rs new file mode 100644 index 0000000000..0ba9707f2c --- /dev/null +++ b/src/domain/flow-system/src/services/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; +mod flow_configuration; + +pub use flow::*; +pub use flow_configuration::*; diff --git a/src/domain/task-system/src/aggregates/task.rs b/src/domain/task-system/src/aggregates/task.rs index a149ccc531..724265620c 100644 --- a/src/domain/task-system/src/aggregates/task.rs +++ b/src/domain/task-system/src/aggregates/task.rs @@ -7,7 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use chrono::Utc; +use chrono::{DateTime, Utc}; use event_sourcing::*; use crate::*; @@ -19,12 +19,12 @@ pub struct Task(Aggregate); impl Task { /// Creates a task with a pending TaskCreated event - pub fn new(task_id: TaskID, logical_plan: LogicalPlan) -> Self { + pub fn new(now: DateTime, task_id: TaskID, logical_plan: LogicalPlan) -> Self { Self( Aggregate::new( task_id, - TaskCreated { - event_time: Utc::now(), + TaskEventCreated { + event_time: now, task_id, logical_plan, }, @@ -34,9 +34,9 @@ impl Task { } /// Transition task to a `Running` state - pub fn run(&mut self) -> Result<(), ProjectionError> { - let event = TaskRunning { - event_time: Utc::now(), + pub fn run(&mut self, now: DateTime) -> Result<(), ProjectionError> { + let event = TaskEventRunning { + event_time: now, task_id: self.task_id, }; self.apply(event) @@ -52,22 +52,26 @@ impl Task { } /// Set cancellation flag (if not already set) - pub fn cancel(&mut self) -> Result<(), ProjectionError> { + pub fn cancel(&mut self, now: DateTime) -> Result<(), ProjectionError> { if self.cancellation_requested { return Ok(()); } - let event = TaskCancelled { - event_time: Utc::now(), + let event = TaskEventCancelled { + event_time: now, task_id: self.task_id, }; 
self.apply(event) } /// Transition task to a `Finished` state with the specified outcome - pub fn finish(&mut self, outcome: TaskOutcome) -> Result<(), ProjectionError> { - let event = TaskFinished { - event_time: Utc::now(), + pub fn finish( + &mut self, + now: DateTime, + outcome: TaskOutcome, + ) -> Result<(), ProjectionError> { + let event = TaskEventFinished { + event_time: now, task_id: self.task_id, outcome, }; diff --git a/src/domain/task-system/src/entities/mod.rs b/src/domain/task-system/src/entities/mod.rs index 40556bb1f7..b19e129f26 100644 --- a/src/domain/task-system/src/entities/mod.rs +++ b/src/domain/task-system/src/entities/mod.rs @@ -8,13 +8,13 @@ // by the Apache License, Version 2.0. mod logical_plan; +mod task_event; mod task_id; mod task_state; mod task_status; -mod task_system_event; pub use logical_plan::*; +pub use task_event::*; pub use task_id::*; pub use task_state::*; pub use task_status::*; -pub use task_system_event::*; diff --git a/src/domain/task-system/src/entities/task_system_event.rs b/src/domain/task-system/src/entities/task_event.rs similarity index 63% rename from src/domain/task-system/src/entities/task_system_event.rs rename to src/domain/task-system/src/entities/task_event.rs index 4111aa8853..efd4a6b727 100644 --- a/src/domain/task-system/src/entities/task_system_event.rs +++ b/src/domain/task-system/src/entities/task_event.rs @@ -16,22 +16,22 @@ use super::*; /// All events that model life-cycle of a task #[derive(Debug, Clone, PartialEq, Eq)] -pub enum TaskSystemEvent { +pub enum TaskEvent { /// New task entered the queue - TaskCreated(TaskCreated), + TaskCreated(TaskEventCreated), /// Task execution had started - TaskRunning(TaskRunning), + TaskRunning(TaskEventRunning), /// Cancellation of task was requested (this is not immediate and task may /// still finish with a different outcome than cancelled) - TaskCancelled(TaskCancelled), + TaskCancelled(TaskEventCancelled), /// Task has reached a final outcome - 
TaskFinished(TaskFinished), + TaskFinished(TaskEventFinished), } ///////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TaskCreated { +pub struct TaskEventCreated { pub event_time: DateTime, pub task_id: TaskID, pub logical_plan: LogicalPlan, @@ -40,7 +40,7 @@ pub struct TaskCreated { ///////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TaskRunning { +pub struct TaskEventRunning { pub event_time: DateTime, pub task_id: TaskID, } @@ -48,7 +48,7 @@ pub struct TaskRunning { ///////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TaskCancelled { +pub struct TaskEventCancelled { pub event_time: DateTime, pub task_id: TaskID, } @@ -56,7 +56,7 @@ pub struct TaskCancelled { ///////////////////////////////////////////////////////////////////////////////////////// #[derive(Debug, Clone, PartialEq, Eq)] -pub struct TaskFinished { +pub struct TaskEventFinished { pub event_time: DateTime, pub task_id: TaskID, pub outcome: TaskOutcome, @@ -64,29 +64,31 @@ pub struct TaskFinished { ///////////////////////////////////////////////////////////////////////////////////////// -impl TaskSystemEvent { +impl TaskEvent { pub fn task_id(&self) -> TaskID { match self { - TaskSystemEvent::TaskCreated(e) => e.task_id, - TaskSystemEvent::TaskRunning(e) => e.task_id, - TaskSystemEvent::TaskCancelled(e) => e.task_id, - TaskSystemEvent::TaskFinished(e) => e.task_id, + TaskEvent::TaskCreated(e) => e.task_id, + TaskEvent::TaskRunning(e) => e.task_id, + TaskEvent::TaskCancelled(e) => e.task_id, + TaskEvent::TaskFinished(e) => e.task_id, } } pub fn event_time(&self) -> &DateTime { match self { - TaskSystemEvent::TaskCreated(e) => &e.event_time, - TaskSystemEvent::TaskRunning(e) => &e.event_time, - TaskSystemEvent::TaskCancelled(e) => 
&e.event_time, - TaskSystemEvent::TaskFinished(e) => &e.event_time, + TaskEvent::TaskCreated(e) => &e.event_time, + TaskEvent::TaskRunning(e) => &e.event_time, + TaskEvent::TaskCancelled(e) => &e.event_time, + TaskEvent::TaskFinished(e) => &e.event_time, } } } // TODO: Replace with derive macro -impl_enum_with_variants!(TaskSystemEvent); -impl_enum_variant!(TaskSystemEvent::TaskCreated(TaskCreated)); -impl_enum_variant!(TaskSystemEvent::TaskRunning(TaskRunning)); -impl_enum_variant!(TaskSystemEvent::TaskCancelled(TaskCancelled)); -impl_enum_variant!(TaskSystemEvent::TaskFinished(TaskFinished)); +impl_enum_with_variants!(TaskEvent); +impl_enum_variant!(TaskEvent::TaskCreated(TaskEventCreated)); +impl_enum_variant!(TaskEvent::TaskRunning(TaskEventRunning)); +impl_enum_variant!(TaskEvent::TaskCancelled(TaskEventCancelled)); +impl_enum_variant!(TaskEvent::TaskFinished(TaskEventFinished)); + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/src/entities/task_state.rs b/src/domain/task-system/src/entities/task_state.rs index ee3c58d8b2..819d39d28f 100644 --- a/src/domain/task-system/src/entities/task_state.rs +++ b/src/domain/task-system/src/entities/task_state.rs @@ -40,14 +40,14 @@ pub struct TaskState { impl Projection for TaskState { type Query = TaskID; - type Event = TaskSystemEvent; + type Event = TaskEvent; fn apply(state: Option, event: Self::Event) -> Result> { - use TaskSystemEvent as E; + use TaskEvent as E; match (state, event) { (None, event) => match event { - E::TaskCreated(TaskCreated { + E::TaskCreated(TaskEventCreated { event_time, task_id, logical_plan, @@ -68,7 +68,7 @@ impl Projection for TaskState { match event { E::TaskCreated(_) => Err(ProjectionError::new(Some(s), event)), - E::TaskRunning(TaskRunning { + E::TaskRunning(TaskEventRunning { event_time, task_id: _, }) if s.status == TaskStatus::Queued => Ok(Self { @@ -76,7 +76,7 @@ impl Projection for TaskState { ran_at: 
Some(event_time), ..s }), - E::TaskCancelled(TaskCancelled { + E::TaskCancelled(TaskEventCancelled { event_time, task_id: _, }) if s.status == TaskStatus::Queued @@ -88,7 +88,7 @@ impl Projection for TaskState { ..s }) } - E::TaskFinished(TaskFinished { + E::TaskFinished(TaskEventFinished { event_time, task_id: _, outcome, @@ -107,3 +107,13 @@ impl Projection for TaskState { } } } + +///////////////////////////////////////////////////////////////////////////////////////// + +impl ProjectionEvent for TaskEvent { + fn matches_query(&self, query: &TaskID) -> bool { + self.task_id() == *query + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/domain/task-system/src/services/task_scheduler.rs b/src/domain/task-system/src/services/task_scheduler.rs index b89141b665..95932bb9ba 100644 --- a/src/domain/task-system/src/services/task_scheduler.rs +++ b/src/domain/task-system/src/services/task_scheduler.rs @@ -23,7 +23,11 @@ pub trait TaskScheduler: Sync + Send { /// Returns states of tasks associated with a given dataset ordered by /// creation time from newest to oldest - fn list_tasks_by_dataset(&self, dataset_id: &DatasetID) -> TaskStateStream; + // TODO: reconsider performance impact + fn list_tasks_by_dataset( + &self, + dataset_id: &DatasetID, + ) -> Result; /// Returns current state of a given task async fn get_task(&self, task_id: TaskID) -> Result; diff --git a/src/infra/core/Cargo.toml b/src/infra/core/Cargo.toml index 6a5abd0797..474776cb21 100644 --- a/src/infra/core/Cargo.toml +++ b/src/infra/core/Cargo.toml @@ -29,6 +29,7 @@ internal-error = { workspace = true } container-runtime = { workspace = true } kamu-data-utils = { workspace = true } opendatafabric = { workspace = true } +event-bus = { workspace = true } kamu-core = { workspace = true } kamu-ingest-datafusion = { workspace = true } @@ -74,7 +75,7 @@ bytes = "1" cfg-if = "1" # Conditional compilation chrono = { version = "0.4", features = 
["serde"] } dashmap = "5" -dill = "0.7" +dill = "0.8" futures = "0.3" glob = "0.3" # Used for glob fetch hyper = "0.14" @@ -84,6 +85,7 @@ jsonwebtoken = "9" libc = "0.2" # Signal names mockall = "0.11" pin-project = "1" +petgraph = { version = "0.6.4", default-features = false } rand = "0.8" regex = "1" strum = "0.25" # Enum from string diff --git a/src/infra/core/src/auth/authentication_service_impl.rs b/src/infra/core/src/auth/authentication_service_impl.rs index 5cb19815d2..5a449d57e0 100644 --- a/src/infra/core/src/auth/authentication_service_impl.rs +++ b/src/infra/core/src/auth/authentication_service_impl.rs @@ -12,7 +12,7 @@ use std::collections::HashMap; use std::sync::Arc; use chrono::Duration; -use dill::component; +use dill::*; use internal_error::{ErrorIntoInternal, InternalError}; use jsonwebtoken::errors::ErrorKind; use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation}; @@ -53,6 +53,7 @@ pub struct AuthenticationServiceImpl { } #[component(pub)] +#[interface(dyn AuthenticationService)] impl AuthenticationServiceImpl { pub fn new( authentication_providers: Vec>, diff --git a/src/infra/core/src/dependency_graph_repository_inmem.rs b/src/infra/core/src/dependency_graph_repository_inmem.rs new file mode 100644 index 0000000000..63ade20ff6 --- /dev/null +++ b/src/infra/core/src/dependency_graph_repository_inmem.rs @@ -0,0 +1,59 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use kamu_core::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct DependencyGraphRepositoryInMemory { + dataset_repo: Arc, +} + +#[dill::component(pub)] +#[dill::interface(dyn DependencyGraphRepository)] +impl DependencyGraphRepositoryInMemory { + pub fn new(dataset_repo: Arc) -> Self { + Self { dataset_repo } + } +} + +impl DependencyGraphRepository for DependencyGraphRepositoryInMemory { + #[tracing::instrument(level = "debug", skip_all)] + fn list_dependencies_of_all_datasets(&self) -> DatasetDependenciesIDStream { + use tokio_stream::StreamExt; + + Box::pin(async_stream::try_stream! { + let mut datasets_stream = self.dataset_repo.get_all_datasets(); + while let Some(dataset_handle) = datasets_stream.try_next().await? { + let dataset_span = tracing::debug_span!("Scanning dataset dependencies", dataset=%dataset_handle); + let _ = dataset_span.enter(); + + let summary = self + .dataset_repo + .get_dataset(&dataset_handle.as_local_ref()) + .await + .int_err()? + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + for transform_input in summary.dependencies { + if let Some(input_id) = transform_input.id { + yield (dataset_handle.id.clone(), input_id); + } + } + + } + }) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/dependency_graph_service_inmem.rs b/src/infra/core/src/dependency_graph_service_inmem.rs new file mode 100644 index 0000000000..52d59c674e --- /dev/null +++ b/src/infra/core/src/dependency_graph_service_inmem.rs @@ -0,0 +1,323 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file.
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use dill::*; +use event_bus::AsyncEventHandler; +use internal_error::InternalError; +use kamu_core::events::{ + DatasetEventCreated, + DatasetEventDeleted, + DatasetEventDependenciesUpdated, +}; +use kamu_core::*; +use opendatafabric::DatasetID; +use petgraph::stable_graph::{NodeIndex, StableDiGraph}; +use petgraph::Direction; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct DependencyGraphServiceInMemory { + repository: Option>, + state: Arc>, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +struct State { + datasets_graph: StableDiGraph, + dataset_node_indices: HashMap, + initially_scanned: bool, +} + +impl State { + fn get_dataset_node( + &self, + dataset_id: &DatasetID, + ) -> Result { + match self.dataset_node_indices.get(dataset_id) { + Some(index) => Ok(index.to_owned()), + None => Err(DatasetNodeNotFoundError { + dataset_id: dataset_id.clone(), + }), + } + } + + fn get_or_create_dataset_node(&mut self, dataset_id: &DatasetID) -> NodeIndex { + match self.dataset_node_indices.get(dataset_id) { + Some(index) => index.to_owned(), + None => { + let node_index = self.datasets_graph.add_node(dataset_id.clone()); + self.dataset_node_indices + .insert(dataset_id.clone(), node_index); + node_index + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn DependencyGraphService)] +#[interface(dyn AsyncEventHandler)] +#[interface(dyn AsyncEventHandler)] +#[interface(dyn AsyncEventHandler)] +#[scope(Singleton)] +impl DependencyGraphServiceInMemory { + pub fn new(repository: Option>) -> Self { + Self { + 
+ repository, + state: Arc::new(tokio::sync::Mutex::new(State::default())), + } + } + + async fn ensure_datasets_initially_scanned(&self) -> Result<(), InternalError> { + let mut state = self.state.lock().await; + if state.initially_scanned { + return Ok(()); + } + + self.ensure_datasets_initially_scanned_with( + &mut state, + self.repository + .as_ref() + .expect("Dependencies graph repository not present") + .as_ref(), + ) + .await + } + + #[tracing::instrument(level = "debug", skip_all)] + async fn ensure_datasets_initially_scanned_with( + &self, + state: &mut State, + repository: &dyn DependencyGraphRepository, + ) -> Result<(), InternalError> { + use tokio_stream::StreamExt; + + let mut dependencies_stream = repository.list_dependencies_of_all_datasets(); + while let Some((dataset_id, upstream_dataset_id)) = dependencies_stream.try_next().await? { + self.add_dependency(state, &upstream_dataset_id, &dataset_id); + } + + state.initially_scanned = true; + + tracing::debug!( + num_nodes = % state.datasets_graph.node_count(), + num_edges = % state.datasets_graph.edge_count(), + "Dependencies graph initialization stats", + ); + + Ok(()) + } + + /// Tracks a dependency between upstream and downstream dataset + #[tracing::instrument(level = "trace", skip_all, fields(%dataset_upstream_id, %dataset_downstream_id))] + fn add_dependency( + &self, + state: &mut State, + dataset_upstream_id: &DatasetID, + dataset_downstream_id: &DatasetID, + ) { + tracing::debug!(downstream=%dataset_downstream_id, upstream=%dataset_upstream_id, "Adding dataset dependency"); + + let upstream_node_index = state.get_or_create_dataset_node(dataset_upstream_id); + let downstream_node_index = state.get_or_create_dataset_node(dataset_downstream_id); + state + .datasets_graph + .update_edge(upstream_node_index, downstream_node_index, ()); + } + + /// Removes tracked dependency between upstream and downstream dataset + #[tracing::instrument(level = "trace", skip_all, fields(%dataset_upstream_id,
%dataset_downstream_id))] + fn remove_dependency( + &self, + state: &mut State, + dataset_upstream_id: &DatasetID, + dataset_downstream_id: &DatasetID, + ) { + tracing::debug!(downstream=%dataset_downstream_id, upstream=%dataset_upstream_id, "Removing dataset dependency"); + + let upstream_node_index = state.get_or_create_dataset_node(dataset_upstream_id); + let downstream_node_index = state.get_or_create_dataset_node(dataset_downstream_id); + + // Idempotent DELETE - ignore, if not found + if let Some(edge_index) = state + .datasets_graph + .find_edge(upstream_node_index, downstream_node_index) + { + state.datasets_graph.remove_edge(edge_index); + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl DependencyGraphService for DependencyGraphServiceInMemory { + /// Forces initialization of graph data, if it wasn't initialized already. + /// Ignored if called multiple times + #[tracing::instrument(level = "debug", skip_all)] + async fn eager_initialization( + &self, + repository: &dyn DependencyGraphRepository, + ) -> Result<(), InternalError> { + let mut state = self.state.lock().await; + if state.initially_scanned { + return Ok(()); + } + + self.ensure_datasets_initially_scanned_with(&mut state, repository) + .await + } + + /// Iterates over 1st level of dataset's downstream dependencies + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id))] + async fn get_downstream_dependencies( + &self, + dataset_id: &DatasetID, + ) -> Result { + self.ensure_datasets_initially_scanned() + .await + .int_err() + .map_err(|e| GetDownstreamDependenciesError::Internal(e))?; + + let downstream_node_datasets: Vec<_> = { + let state = self.state.lock().await; + + let node_index = state + .get_dataset_node(dataset_id) + .map_err(|e| GetDownstreamDependenciesError::DatasetNotFound(e))?; + + state + .datasets_graph + .neighbors_directed(node_index, Direction::Outgoing) + 
.map(|node_index| { + state + .datasets_graph + .node_weight(node_index) + .unwrap() + .clone() + }) + .collect() + }; + + Ok(Box::pin(tokio_stream::iter(downstream_node_datasets))) + } + + /// Iterates over 1st level of dataset's upstream dependencies + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id))] + async fn get_upstream_dependencies( + &self, + dataset_id: &DatasetID, + ) -> Result { + self.ensure_datasets_initially_scanned() + .await + .int_err() + .map_err(|e| GetUpstreamDependenciesError::Internal(e))?; + + let upstream_node_datasets: Vec<_> = { + let state = self.state.lock().await; + + let node_index = state + .get_dataset_node(dataset_id) + .map_err(|e| GetUpstreamDependenciesError::DatasetNotFound(e))?; + + state + .datasets_graph + .neighbors_directed(node_index, Direction::Incoming) + .map(|node_index| { + state + .datasets_graph + .node_weight(node_index) + .unwrap() + .clone() + }) + .collect() + }; + + Ok(Box::pin(tokio_stream::iter(upstream_node_datasets))) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for DependencyGraphServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &DatasetEventCreated) -> Result<(), InternalError> { + let mut state = self.state.lock().await; + state.get_or_create_dataset_node(&event.dataset_id); + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for DependencyGraphServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &DatasetEventDeleted) -> Result<(), InternalError> { + let mut state = self.state.lock().await; + + let node_index = state + .get_dataset_node(&event.dataset_id) + .map_err(|e| e.int_err())?; + + state.datasets_graph.remove_node(node_index); 
+ state.dataset_node_indices.remove(&event.dataset_id); + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for DependencyGraphServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &DatasetEventDependenciesUpdated) -> Result<(), InternalError> { + let mut state = self.state.lock().await; + + let node_index = state + .get_dataset_node(&event.dataset_id) + .map_err(|e| e.int_err())?; + + let existing_upstream_ids: HashSet<_> = state + .datasets_graph + .neighbors_directed(node_index, Direction::Incoming) + .map(|node_index| { + state + .datasets_graph + .node_weight(node_index) + .unwrap() + .clone() + }) + .collect(); + + let new_upstream_ids: HashSet<_> = + HashSet::from_iter(event.new_upstream_ids.iter().cloned()); + + for obsolete_upstream_id in existing_upstream_ids.difference(&new_upstream_ids) { + self.remove_dependency(&mut state, obsolete_upstream_id, &event.dataset_id) + } + + for added_id in new_upstream_ids.difference(&existing_upstream_ids) { + self.add_dependency(&mut state, added_id, &event.dataset_id); + } + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/engine/engine_provisioner_local.rs b/src/infra/core/src/engine/engine_provisioner_local.rs index 6ef71b343b..c71e1f52a6 100644 --- a/src/infra/core/src/engine/engine_provisioner_local.rs +++ b/src/infra/core/src/engine/engine_provisioner_local.rs @@ -286,6 +286,8 @@ impl Default for EngineProvisionerLocalConfig { // Null Object ///////////////////////////////////////////////////////////////////////////////////////// +#[component(pub)] +#[interface(dyn EngineProvisioner)] pub struct EngineProvisionerNull; #[async_trait::async_trait] diff --git a/src/infra/core/src/ingest/data_format_registry_impl.rs 
b/src/infra/core/src/ingest/data_format_registry_impl.rs index fbd0b5439e..2a3eaed716 100644 --- a/src/infra/core/src/ingest/data_format_registry_impl.rs +++ b/src/infra/core/src/ingest/data_format_registry_impl.rs @@ -20,6 +20,7 @@ use opendatafabric::*; pub struct DataFormatRegistryImpl {} #[dill::component(pub)] +#[dill::interface(dyn DataFormatRegistry)] impl DataFormatRegistryImpl { pub const FMT_CSV: DataFormatDesc = DataFormatDesc { short_name: "CSV", diff --git a/src/infra/core/src/lib.rs b/src/infra/core/src/lib.rs index 64014fb017..286e263cd3 100644 --- a/src/infra/core/src/lib.rs +++ b/src/infra/core/src/lib.rs @@ -25,6 +25,8 @@ pub mod utils; mod dataset_config; mod dataset_layout; +mod dependency_graph_repository_inmem; +mod dependency_graph_service_inmem; mod provenance_service_impl; mod pull_service_impl; mod push_service_impl; @@ -41,6 +43,8 @@ mod verification_service_impl; pub use auth::*; pub use dataset_config::*; pub use dataset_layout::*; +pub use dependency_graph_repository_inmem::*; +pub use dependency_graph_service_inmem::*; pub use engine::*; pub use ingest::*; pub use provenance_service_impl::*; diff --git a/src/infra/core/src/provenance_service_impl.rs b/src/infra/core/src/provenance_service_impl.rs index d804ccb973..45ad9d3ccb 100644 --- a/src/infra/core/src/provenance_service_impl.rs +++ b/src/infra/core/src/provenance_service_impl.rs @@ -24,6 +24,7 @@ pub struct ProvenanceServiceImpl { } #[component(pub)] +#[interface(dyn ProvenanceService)] impl ProvenanceServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/pull_service_impl.rs b/src/infra/core/src/pull_service_impl.rs index 919bb3e2a0..94d4edd5d9 100644 --- a/src/infra/core/src/pull_service_impl.rs +++ b/src/infra/core/src/pull_service_impl.rs @@ -28,6 +28,7 @@ pub struct PullServiceImpl { } #[component(pub)] +#[interface(dyn PullService)] impl PullServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/push_service_impl.rs 
b/src/infra/core/src/push_service_impl.rs index 2212165244..9e0e6d2ad3 100644 --- a/src/infra/core/src/push_service_impl.rs +++ b/src/infra/core/src/push_service_impl.rs @@ -20,6 +20,7 @@ pub struct PushServiceImpl { } #[component(pub)] +#[interface(dyn PushService)] impl PushServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/query_service_impl.rs b/src/infra/core/src/query_service_impl.rs index eebca10c69..1c93ca8a75 100644 --- a/src/infra/core/src/query_service_impl.rs +++ b/src/infra/core/src/query_service_impl.rs @@ -35,6 +35,7 @@ pub struct QueryServiceImpl { } #[component(pub)] +#[interface(dyn QueryService)] impl QueryServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/remote_aliases_registry_impl.rs b/src/infra/core/src/remote_aliases_registry_impl.rs index 9e59238141..54a9424bdd 100644 --- a/src/infra/core/src/remote_aliases_registry_impl.rs +++ b/src/infra/core/src/remote_aliases_registry_impl.rs @@ -26,6 +26,7 @@ pub struct RemoteAliasesRegistryImpl { //////////////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn RemoteAliasesRegistry)] impl RemoteAliasesRegistryImpl { pub fn new(dataset_repo: Arc) -> Self { Self { dataset_repo } diff --git a/src/infra/core/src/repos/dataset_factory_impl.rs b/src/infra/core/src/repos/dataset_factory_impl.rs index 1e57eb9884..8cb47ce0c2 100644 --- a/src/infra/core/src/repos/dataset_factory_impl.rs +++ b/src/infra/core/src/repos/dataset_factory_impl.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use dill::*; +use event_bus::EventBus; use kamu_core::*; use url::Url; @@ -21,6 +22,7 @@ use crate::*; pub struct DatasetFactoryImpl { ipfs_gateway: IpfsGateway, access_token_resolver: Arc, + event_bus: Arc, } ///////////////////////////////////////////////////////////////////////////////////////// @@ -39,19 +41,23 @@ type DatasetImplLocalFS = DatasetImpl< 
///////////////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn DatasetFactory)] impl DatasetFactoryImpl { pub fn new( ipfs_gateway: IpfsGateway, access_token_resolver: Arc, + event_bus: Arc, ) -> Self { Self { ipfs_gateway, access_token_resolver, + event_bus, } } - pub fn get_local_fs(layout: DatasetLayout) -> DatasetImplLocalFS { + pub fn get_local_fs(layout: DatasetLayout, event_bus: Arc) -> DatasetImplLocalFS { DatasetImpl::new( + event_bus, MetadataChainImpl::new( ObjectRepositoryLocalFS::new(layout.blocks_dir), ReferenceRepositoryImpl::new(NamedObjectRepositoryLocalFS::new(layout.refs_dir)), @@ -62,9 +68,14 @@ impl DatasetFactoryImpl { ) } - fn get_http(base_url: Url, header_map: http::HeaderMap) -> Result { + fn get_http( + base_url: Url, + header_map: http::HeaderMap, + event_bus: Arc, + ) -> Result { let client = reqwest::Client::new(); Ok(DatasetImpl::new( + event_bus, MetadataChainImpl::new( ObjectRepositoryHttp::new( client.clone(), @@ -101,20 +112,27 @@ impl DatasetFactoryImpl { /// credental resolution from scratch which can be very expensive. If you /// already have an established [S3Context] use [get_s3_with_context] /// function instead. - pub async fn get_s3_from_url(base_url: Url) -> Result { + pub async fn get_s3_from_url( + base_url: Url, + event_bus: Arc, + ) -> Result { // TODO: We should ensure optimal credential reuse. Perhaps in future we should // create a cache of S3Contexts keyed by an endpoint. 
let s3_context = S3Context::from_url(&base_url).await; - Self::get_s3_from_context(s3_context).await + Self::get_s3_from_context(s3_context, event_bus).await } - pub async fn get_s3_from_context(s3_context: S3Context) -> Result { + pub async fn get_s3_from_context( + s3_context: S3Context, + event_bus: Arc, + ) -> Result { let client = s3_context.client; let endpoint = s3_context.endpoint; let bucket = s3_context.bucket; let key_prefix = s3_context.key_prefix; Ok(DatasetImpl::new( + event_bus, MetadataChainImpl::new( ObjectRepositoryS3::::new(S3Context::new( client.clone(), @@ -150,7 +168,11 @@ impl DatasetFactoryImpl { )) } - async fn get_ipfs_http(&self, base_url: Url) -> Result { + async fn get_ipfs_http( + &self, + base_url: Url, + event_bus: Arc, + ) -> Result { // Resolve IPNS DNSLink names if configured let dataset_url = match base_url.scheme() { "ipns" if self.ipfs_gateway.pre_resolve_dnslink => { @@ -204,6 +226,7 @@ impl DatasetFactoryImpl { let client = reqwest::Client::new(); Ok(DatasetImpl::new( + event_bus, MetadataChainImpl::new( ObjectRepositoryHttp::new( client.clone(), @@ -291,19 +314,25 @@ impl DatasetFactory for DatasetFactoryImpl { } else { DatasetLayout::new(path) }; - let ds = Self::get_local_fs(layout); + let ds = Self::get_local_fs(layout, self.event_bus.clone()); Ok(Arc::new(ds) as Arc) } "http" | "https" | "odf+http" | "odf+https" => { - let ds = Self::get_http(url.clone(), self.build_header_map(&url))?; + let ds = Self::get_http( + url.clone(), + self.build_header_map(&url), + self.event_bus.clone(), + )?; Ok(Arc::new(ds)) } "ipfs" | "ipns" | "ipfs+http" | "ipfs+https" | "ipns+http" | "ipns+https" => { - let ds = self.get_ipfs_http(url.clone()).await?; + let ds = self + .get_ipfs_http(url.clone(), self.event_bus.clone()) + .await?; Ok(Arc::new(ds)) } "s3" | "s3+http" | "s3+https" => { - let ds = Self::get_s3_from_url(url.clone()).await?; + let ds = Self::get_s3_from_url(url.clone(), self.event_bus.clone()).await?; Ok(Arc::new(ds)) } _ => 
Err(UnsupportedProtocolError { diff --git a/src/infra/core/src/repos/dataset_impl.rs b/src/infra/core/src/repos/dataset_impl.rs index c5f92c71a3..24a189e224 100644 --- a/src/infra/core/src/repos/dataset_impl.rs +++ b/src/infra/core/src/repos/dataset_impl.rs @@ -7,8 +7,12 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +use std::sync::Arc; + use async_trait::async_trait; use chrono::{DateTime, Utc}; +use event_bus::EventBus; +use kamu_core::events::DatasetEventDependenciesUpdated; use kamu_core::*; use opendatafabric::serde::yaml::Manifest; use opendatafabric::*; @@ -16,6 +20,7 @@ use opendatafabric::*; ///////////////////////////////////////////////////////////////////////////////////////// pub struct DatasetImpl { + event_bus: Arc, metadata_chain: MetaChain, data_repo: DataRepo, checkpoint_repo: CheckpointRepo, @@ -33,12 +38,14 @@ where InfoRepo: NamedObjectRepository + Sync + Send, { pub fn new( + event_bus: Arc, metadata_chain: MetaChain, data_repo: DataRepo, checkpoint_repo: CheckpointRepo, info_repo: InfoRepo, ) -> Self { Self { + event_bus, metadata_chain, data_repo, checkpoint_repo, @@ -405,6 +412,22 @@ where 0 }; + let mut new_upstream_ids: Vec = vec![]; + if let opendatafabric::MetadataEvent::SetTransform(transform) = &event { + for new_input in transform.inputs.iter() { + if let Some(id) = &new_input.id { + new_upstream_ids.push(id.clone()); + } else { + return Err(CommitError::MetadataAppendError(AppendError::InvalidBlock( + AppendValidationError::InvalidEvent(InvalidEventError::new( + event, + "Transform input with unresolved ID", + )), + ))); + } + } + } + let block = MetadataBlock { prev_block_hash: prev_block_hash.clone(), sequence_number, @@ -427,6 +450,21 @@ where tracing::info!(%new_head, "Committed new block"); + if !new_upstream_ids.is_empty() { + let summary = self + .get_summary(GetSummaryOpts::default()) + .await + .int_err()?; + + self.event_bus + 
.dispatch_event(DatasetEventDependenciesUpdated { + dataset_id: summary.id.clone(), + new_upstream_ids, + }) + .await + .int_err()?; + } + Ok(CommitResult { old_head: prev_block_hash, new_head, diff --git a/src/infra/core/src/repos/dataset_repository_helpers.rs b/src/infra/core/src/repos/dataset_repository_helpers.rs index 6e3619dbc4..c431e10a41 100644 --- a/src/infra/core/src/repos/dataset_repository_helpers.rs +++ b/src/infra/core/src/repos/dataset_repository_helpers.rs @@ -7,13 +7,26 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use futures::Stream; -use kamu_core::{DatasetRepository, GetSummaryOpts, InternalError, ResultIntoInternal}; -use opendatafabric::{DatasetHandle, DatasetRef}; -use tokio_stream::StreamExt; - ///////////////////////////////////////////////////////////////////////////////////////// +use chrono::Utc; +use event_bus::EventBus; +use internal_error::ResultIntoInternal; +use kamu_core::events::DatasetEventDependenciesUpdated; +use kamu_core::{ + AppendOpts, + BlockRef, + CreateDatasetFromSnapshotError, + CreateDatasetResult, + DatasetRepository, + DatasetRepositoryExt, + GetDatasetError, + InvalidSnapshotError, + MissingInputsError, + SetRefOpts, +}; +use opendatafabric::*; + pub fn get_staging_name() -> String { use rand::distributions::Alphanumeric; use rand::Rng; @@ -32,36 +45,202 @@ pub fn get_staging_name() -> String { ///////////////////////////////////////////////////////////////////////////////////////// -pub fn get_downstream_dependencies_impl<'s>( - repo: &'s dyn DatasetRepository, - dataset_ref: &'s DatasetRef, -) -> impl Stream> + 's { - async_stream::try_stream! { - let dataset_handle = repo.resolve_dataset_ref(dataset_ref).await.int_err()?; - - let mut dataset_handles = repo.get_all_datasets(); - while let Some(hdl) = dataset_handles.try_next().await? 
{ - if hdl.id == dataset_handle.id { - continue; +pub async fn create_dataset_from_snapshot_impl( + dataset_repo: &dyn DatasetRepositoryExt, + event_bus: &EventBus, + account_name: Option, + mut snapshot: DatasetSnapshot, +) -> Result { + // Validate / resolve events + for event in snapshot.metadata.iter_mut() { + match event { + MetadataEvent::Seed(_) => Err(InvalidSnapshotError::new( + "Seed event is generated and cannot be specified explicitly", + ) + .into()), + MetadataEvent::SetPollingSource(_) | MetadataEvent::AddPushSource(_) => { + if snapshot.kind != DatasetKind::Root { + Err(InvalidSnapshotError { + reason: format!("Event is only allowed on root datasets: {:?}", event), + } + .into()) + } else { + Ok(()) + } + } + MetadataEvent::SetTransform(e) => { + if snapshot.kind != DatasetKind::Derivative { + Err(InvalidSnapshotError::new( + "SetTransform is only allowed on derivative datasets", + ) + .into()) + } else { + resolve_transform_inputs(dataset_repo, &snapshot.name, &mut e.inputs).await + } } + MetadataEvent::SetDataSchema(_) => { + // It shouldn't be common to provide schema as part of the snapshot. In most + // cases it will be inferred upon first ingest/transform. But no reason not to + // allow it. + Ok(()) + } + MetadataEvent::SetAttachments(_) + | MetadataEvent::SetInfo(_) + | MetadataEvent::SetLicense(_) + | MetadataEvent::SetVocab(_) => Ok(()), + MetadataEvent::AddData(_) + | MetadataEvent::ExecuteQuery(_) + | MetadataEvent::SetWatermark(_) + | MetadataEvent::DisablePollingSource(_) + | MetadataEvent::DisablePushSource(_) => Err(InvalidSnapshotError::new(format!( + "Event is not allowed to appear in a DatasetSnapshot: {:?}", + event + )) + .into()), + }?; + } + + // We are generating a key pair and deriving a dataset ID from it. + // The key pair is discarded for now, but in future can be used for + // proof of control over dataset and metadata signing.
+ let (_keypair, dataset_id) = DatasetID::from_new_keypair_ed25519(); - let summary = repo - .get_dataset(&hdl.as_local_ref()) - .await - .int_err()? - .get_summary(GetSummaryOpts::default()) - .await - .int_err()?; - - if summary - .dependencies - .iter() - .any(|d| d.id.as_ref() == Some(&dataset_handle.id)) - { - yield hdl; + let system_time = Utc::now(); + + let create_result = dataset_repo + .create_dataset( + &DatasetAlias::new(account_name, snapshot.name), + MetadataBlockTyped { + system_time, + prev_block_hash: None, + event: Seed { + dataset_id, + dataset_kind: snapshot.kind, + }, + sequence_number: 0, + }, + ) + .await?; + + let chain = create_result.dataset.as_metadata_chain(); + let mut head = create_result.head.clone(); + let mut sequence_number = 1; + let mut new_upstream_ids: Vec = vec![]; + + for event in snapshot.metadata { + if let MetadataEvent::SetTransform(transform) = &event { + // Collect only the latest upstream dataset IDs + new_upstream_ids.clear(); + for new_input in transform.inputs.iter() { + // Note: the IDs have been checked in `resolve_transform_inputs` + new_upstream_ids.push(new_input.id.clone().unwrap()); } } + + head = chain + .append( + MetadataBlock { + system_time, + prev_block_hash: Some(head), + event, + sequence_number, + }, + AppendOpts { + update_ref: None, + ..AppendOpts::default() + }, + ) + .await + .int_err()?; + + sequence_number += 1; + } + + chain + .set_ref( + &BlockRef::Head, + &head, + SetRefOpts { + validate_block_present: false, + check_ref_is: Some(Some(&create_result.head)), + }, + ) + .await + .int_err()?; + + // TODO: encapsulate this inside dataset/chain + if !new_upstream_ids.is_empty() { + event_bus + .dispatch_event(DatasetEventDependenciesUpdated { + dataset_id: create_result.dataset_handle.id.clone(), + new_upstream_ids, + }) + .await + .int_err()?; + } + + Ok(CreateDatasetResult { + head, + ..create_result + }) +} + 
+///////////////////////////////////////////////////////////////////////////////////////// + +async fn resolve_transform_inputs( + repo: &T, + dataset_name: &DatasetName, + inputs: &mut Vec, +) -> Result<(), CreateDatasetFromSnapshotError> +where + T: DatasetRepository, + T: ?Sized, +{ + for input in inputs.iter_mut() { + if let Some(input_id) = &input.id { + // Input is referenced by ID - in this case we allow any name + match repo.resolve_dataset_ref(&input_id.as_local_ref()).await { + Ok(_) => Ok(()), + Err(GetDatasetError::NotFound(_)) => Err( + CreateDatasetFromSnapshotError::MissingInputs(MissingInputsError { + dataset_ref: dataset_name.into(), + missing_inputs: vec![input_id.as_local_ref()], + }), + ), + Err(GetDatasetError::Internal(e)) => Err(e.into()), + }?; + } else { + // When ID is not specified we try resolving it by name or a reference + + // When reference is available, it dominates + let input_local_ref = if let Some(dataset_ref) = &input.dataset_ref { + match dataset_ref.as_local_ref(|_| !repo.is_multi_tenant()) { + Ok(local_ref) => local_ref, + Err(_) => { + unimplemented!("Deriving from remote dataset is not supported yet"); + } + } + } else { + // Derive reference purely from a name assuming a default account + let input_alias = DatasetAlias::new(None, input.name.clone()); + input_alias.as_local_ref() + }; + + let hdl = match repo.resolve_dataset_ref(&input_local_ref).await { + Ok(hdl) => Ok(hdl), + Err(GetDatasetError::NotFound(_)) => Err( + CreateDatasetFromSnapshotError::MissingInputs(MissingInputsError { + dataset_ref: dataset_name.into(), + missing_inputs: vec![input_local_ref], + }), + ), + Err(GetDatasetError::Internal(e)) => Err(e.into()), + }?; + + input.id = Some(hdl.id); + } } + Ok(()) } ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/src/repos/dataset_repository_local_fs.rs b/src/infra/core/src/repos/dataset_repository_local_fs.rs index 9594e834a2..b16e44eead 
100644 --- a/src/infra/core/src/repos/dataset_repository_local_fs.rs +++ b/src/infra/core/src/repos/dataset_repository_local_fs.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use async_trait::async_trait; use dill::*; use domain::auth::{DatasetAction, DatasetActionAuthorizer, DEFAULT_ACCOUNT_NAME}; -use futures::TryStreamExt; +use event_bus::EventBus; use kamu_core::*; use opendatafabric::*; use url::Url; @@ -25,6 +25,8 @@ use crate::*; pub struct DatasetRepositoryLocalFs { storage_strategy: Box, dataset_action_authorizer: Arc, + dependency_graph_service: Arc, + event_bus: Arc, thrash_lock: tokio::sync::Mutex<()>, } @@ -36,6 +38,8 @@ impl DatasetRepositoryLocalFs { root: PathBuf, current_account_subject: Arc, dataset_action_authorizer: Arc, + dependency_graph_service: Arc, + event_bus: Arc, multi_tenant: bool, ) -> Self { Self { @@ -43,11 +47,17 @@ impl DatasetRepositoryLocalFs { Box::new(DatasetMultiTenantStorageStrategy::new( root, current_account_subject, + event_bus.clone(), )) } else { - Box::new(DatasetSingleTenantStorageStrategy::new(root)) + Box::new(DatasetSingleTenantStorageStrategy::new( + root, + event_bus.clone(), + )) }, dataset_action_authorizer, + dependency_graph_service, + event_bus, thrash_lock: tokio::sync::Mutex::new(()), } } @@ -56,6 +66,8 @@ impl DatasetRepositoryLocalFs { root: impl Into, current_account_subject: Arc, dataset_action_authorizer: Arc, + dependency_graph_service: Arc, + event_bus: Arc, multi_tenant: bool, ) -> Result { let root = root.into(); @@ -64,6 +76,8 @@ impl DatasetRepositoryLocalFs { root, current_account_subject, dataset_action_authorizer, + dependency_graph_service, + event_bus, multi_tenant, )) } @@ -74,7 +88,10 @@ impl DatasetRepositoryLocalFs { dataset_handle: &DatasetHandle, ) -> Result { let layout = DatasetLayout::new(self.storage_strategy.get_dataset_path(&dataset_handle)); - Ok(DatasetFactoryImpl::get_local_fs(layout)) + Ok(DatasetFactoryImpl::get_local_fs( + layout, + self.event_bus.clone(), + )) } // TODO: Used 
only for testing, but should be removed it in future to discourage @@ -221,7 +238,7 @@ impl DatasetRepository for DatasetRepositoryLocalFs { let layout = DatasetLayout::create(&dataset_path).int_err()?; - let dataset = DatasetFactoryImpl::get_local_fs(layout); + let dataset = DatasetFactoryImpl::get_local_fs(layout, self.event_bus.clone()); // There are three possiblities at this point: // - Dataset did not exist before - continue normally @@ -257,6 +274,12 @@ impl DatasetRepository for DatasetRepositoryLocalFs { "Created new dataset", ); + self.event_bus + .dispatch_event(events::DatasetEventCreated { + dataset_id: dataset_handle.id.clone(), + }) + .await?; + Ok(CreateDatasetResult { dataset_handle, dataset: Arc::new(dataset), @@ -264,6 +287,15 @@ impl DatasetRepository for DatasetRepositoryLocalFs { }) } + async fn create_dataset_from_snapshot( + &self, + account_name: Option, + snapshot: DatasetSnapshot, + ) -> Result { + create_dataset_from_snapshot_impl(self, self.event_bus.as_ref(), account_name, snapshot) + .await + } + async fn rename_dataset( &self, dataset_ref: &DatasetRef, @@ -307,11 +339,25 @@ impl DatasetRepository for DatasetRepositoryLocalFs { Err(GetDatasetError::Internal(e)) => Err(DeleteDatasetError::Internal(e)), }?; - let children: Vec<_> = get_downstream_dependencies_impl(self, dataset_ref) - .try_collect() - .await?; + use tokio_stream::StreamExt; + let downstream_dataset_ids: Vec<_> = self + .dependency_graph_service + .get_downstream_dependencies(&dataset_handle.id) + .await + .int_err()? 
+ .collect() + .await; + + if !downstream_dataset_ids.is_empty() { + let mut children = Vec::with_capacity(downstream_dataset_ids.len()); + for downstream_dataset_id in downstream_dataset_ids { + let hdl = self + .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + .await + .int_err()?; + children.push(hdl); + } - if !children.is_empty() { return Err(DanglingReferenceError { dataset_handle, children, @@ -336,14 +382,14 @@ impl DatasetRepository for DatasetRepositoryLocalFs { let dataset_dir = self.storage_strategy.get_dataset_path(&dataset_handle); tokio::fs::remove_dir_all(dataset_dir).await.int_err()?; - Ok(()) - } - fn get_downstream_dependencies<'s>( - &'s self, - dataset_ref: &'s DatasetRef, - ) -> DatasetHandleStream<'s> { - Box::pin(get_downstream_dependencies_impl(self, dataset_ref)) + self.event_bus + .dispatch_event(events::DatasetEventDeleted { + dataset_id: dataset_handle.id, + }) + .await?; + + Ok(()) } } @@ -402,11 +448,15 @@ enum ResolveDatasetError { struct DatasetSingleTenantStorageStrategy { root: PathBuf, + event_bus: Arc, } impl DatasetSingleTenantStorageStrategy { - pub fn new(root: impl Into) -> Self { - Self { root: root.into() } + pub fn new(root: impl Into, event_bus: Arc) -> Self { + Self { + root: root.into(), + event_bus, + } } fn dataset_name<'a>(&self, dataset_alias: &'a DatasetAlias) -> &'a DatasetName { @@ -424,7 +474,7 @@ impl DatasetSingleTenantStorageStrategy { dataset_alias: &DatasetAlias, ) -> Result { let layout = DatasetLayout::new(dataset_path); - let dataset = DatasetFactoryImpl::get_local_fs(layout); + let dataset = DatasetFactoryImpl::get_local_fs(layout, self.event_bus.clone()); dataset .get_summary(GetSummaryOpts::default()) .await @@ -564,16 +614,19 @@ impl DatasetStorageStrategy for DatasetSingleTenantStorageStrategy { struct DatasetMultiTenantStorageStrategy { root: PathBuf, current_account_subject: Arc, + event_bus: Arc, } impl DatasetMultiTenantStorageStrategy { pub fn new( root: impl Into, 
current_account_subject: Arc, + event_bus: Arc, ) -> Self { Self { root: root.into(), current_account_subject, + event_bus, } } @@ -597,7 +650,7 @@ impl DatasetMultiTenantStorageStrategy { dataset_id: &DatasetID, ) -> Result { let layout = DatasetLayout::new(dataset_path); - let dataset = DatasetFactoryImpl::get_local_fs(layout); + let dataset = DatasetFactoryImpl::get_local_fs(layout, self.event_bus.clone()); match dataset.as_info_repo().get("alias").await { Ok(bytes) => { let dataset_alias_str = std::str::from_utf8(&bytes[..]).int_err()?.trim(); @@ -824,7 +877,7 @@ impl DatasetStorageStrategy for DatasetMultiTenantStorageStrategy { ) -> Result<(), InternalError> { let dataset_path = self.get_dataset_path(dataset_handle); let layout = DatasetLayout::new(dataset_path); - let dataset = DatasetFactoryImpl::get_local_fs(layout); + let dataset = DatasetFactoryImpl::get_local_fs(layout, self.event_bus.clone()); let new_alias = DatasetAlias::new(dataset_handle.alias.account_name.clone(), new_name.clone()); diff --git a/src/infra/core/src/repos/dataset_repository_s3.rs b/src/infra/core/src/repos/dataset_repository_s3.rs index c6b55e9ec0..2093f9c67f 100644 --- a/src/infra/core/src/repos/dataset_repository_s3.rs +++ b/src/infra/core/src/repos/dataset_repository_s3.rs @@ -11,13 +11,14 @@ use std::sync::Arc; use async_trait::async_trait; use dill::*; -use futures::TryStreamExt; +use event_bus::EventBus; use kamu_core::auth::{DatasetAction, DatasetActionAuthorizer, DEFAULT_ACCOUNT_NAME}; use kamu_core::*; use opendatafabric::*; use url::Url; -use super::{get_downstream_dependencies_impl, DatasetFactoryImpl}; +use super::DatasetFactoryImpl; +use crate::create_dataset_from_snapshot_impl; use crate::utils::s3_context::S3Context; ///////////////////////////////////////////////////////////////////////////////////////// @@ -27,6 +28,8 @@ pub struct DatasetRepositoryS3 { s3_context: S3Context, current_account_subject: Arc, dataset_action_authorizer: Arc, + dependency_graph_service: 
Arc, + event_bus: Arc, multi_tenant: bool, } @@ -37,12 +40,16 @@ impl DatasetRepositoryS3 { s3_context: S3Context, current_account_subject: Arc, dataset_action_authorizer: Arc, + dependency_graph_service: Arc, + event_bus: Arc, multi_tenant: bool, ) -> Self { Self { s3_context, current_account_subject, dataset_action_authorizer, + dependency_graph_service, + event_bus, multi_tenant, } } @@ -55,7 +62,7 @@ impl DatasetRepositoryS3 { .s3_context .sub_context(&format!("{}/", &dataset_id.cid.to_string())); - DatasetFactoryImpl::get_s3_from_context(s3_context).await + DatasetFactoryImpl::get_s3_from_context(s3_context, self.event_bus.clone()).await } async fn delete_dataset_s3_objects(&self, dataset_id: &DatasetID) -> Result<(), InternalError> { @@ -295,6 +302,12 @@ impl DatasetRepository for DatasetRepositoryS3 { let dataset_handle = DatasetHandle::new(dataset_id, dataset_alias.clone()); + self.event_bus + .dispatch_event(events::DatasetEventCreated { + dataset_id: dataset_handle.id.clone(), + }) + .await?; + tracing::info!( id = %dataset_handle.id, alias = %dataset_handle.alias, @@ -309,6 +322,15 @@ impl DatasetRepository for DatasetRepositoryS3 { }) } + async fn create_dataset_from_snapshot( + &self, + account_name: Option, + snapshot: DatasetSnapshot, + ) -> Result { + create_dataset_from_snapshot_impl(self, self.event_bus.as_ref(), account_name, snapshot) + .await + } + async fn rename_dataset( &self, dataset_ref: &DatasetRef, @@ -350,11 +372,25 @@ impl DatasetRepository for DatasetRepositoryS3 { Err(GetDatasetError::Internal(e)) => return Err(DeleteDatasetError::Internal(e)), }; - let children: Vec<_> = get_downstream_dependencies_impl(self, dataset_ref) - .try_collect() - .await?; + use tokio_stream::StreamExt; + let downstream_dataset_ids: Vec<_> = self + .dependency_graph_service + .get_downstream_dependencies(&dataset_handle.id) + .await + .int_err()? 
+ .collect() + .await; + + if !downstream_dataset_ids.is_empty() { + let mut children = Vec::with_capacity(downstream_dataset_ids.len()); + for downstream_dataset_id in downstream_dataset_ids { + let hdl = self + .resolve_dataset_ref(&downstream_dataset_id.as_local_ref()) + .await + .int_err()?; + children.push(hdl); + } - if !children.is_empty() { return Err(DanglingReferenceError { dataset_handle, children, @@ -366,17 +402,17 @@ impl DatasetRepository for DatasetRepositoryS3 { .check_action_allowed(&dataset_handle, DatasetAction::Write) .await?; - match self.delete_dataset_s3_objects(&dataset_handle.id).await { - Ok(_) => Ok(()), - Err(e) => Err(DeleteDatasetError::Internal(e)), - } - } + self.delete_dataset_s3_objects(&dataset_handle.id) + .await + .map_err(|e| DeleteDatasetError::Internal(e))?; - fn get_downstream_dependencies<'s>( - &'s self, - dataset_ref: &'s DatasetRef, - ) -> DatasetHandleStream<'s> { - Box::pin(get_downstream_dependencies_impl(self, dataset_ref)) + self.event_bus + .dispatch_event(events::DatasetEventDeleted { + dataset_id: dataset_handle.id, + }) + .await?; + + Ok(()) } } diff --git a/src/infra/core/src/repos/object_store_builder_local_fs.rs b/src/infra/core/src/repos/object_store_builder_local_fs.rs index 15fbefde4f..4c18c90584 100644 --- a/src/infra/core/src/repos/object_store_builder_local_fs.rs +++ b/src/infra/core/src/repos/object_store_builder_local_fs.rs @@ -16,6 +16,7 @@ use url::Url; ///////////////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn ObjectStoreBuilder)] pub struct ObjectStoreBuilderLocalFs {} impl ObjectStoreBuilderLocalFs { diff --git a/src/infra/core/src/repos/object_store_registy_impl.rs b/src/infra/core/src/repos/object_store_registy_impl.rs index 6e78e8d9f0..aebd06b0f6 100644 --- a/src/infra/core/src/repos/object_store_registy_impl.rs +++ b/src/infra/core/src/repos/object_store_registy_impl.rs @@ -27,6 +27,7 @@ pub struct ObjectStoreRegistryImpl { } 
#[dill::component(pub)] +#[dill::interface(dyn ObjectStoreRegistry)] #[dill::scope(dill::Singleton)] impl ObjectStoreRegistryImpl { pub fn new(builders: Vec>) -> Self { diff --git a/src/infra/core/src/reset_service_impl.rs b/src/infra/core/src/reset_service_impl.rs index a2d1f713c3..de3abe60ff 100644 --- a/src/infra/core/src/reset_service_impl.rs +++ b/src/infra/core/src/reset_service_impl.rs @@ -19,6 +19,7 @@ pub struct ResetServiceImpl { } #[component(pub)] +#[interface(dyn ResetService)] impl ResetServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/resource_loader_impl.rs b/src/infra/core/src/resource_loader_impl.rs index 93c372f21c..d2a0bf59fa 100644 --- a/src/infra/core/src/resource_loader_impl.rs +++ b/src/infra/core/src/resource_loader_impl.rs @@ -9,13 +9,14 @@ use std::path::Path; -use dill::component; +use dill::{component, interface}; use kamu_core::*; use opendatafabric::serde::yaml::*; use opendatafabric::*; use url::Url; #[component] +#[interface(dyn ResourceLoader)] pub struct ResourceLoaderImpl {} impl ResourceLoaderImpl { diff --git a/src/infra/core/src/search_service_impl.rs b/src/infra/core/src/search_service_impl.rs index 187c120aa7..5621e65799 100644 --- a/src/infra/core/src/search_service_impl.rs +++ b/src/infra/core/src/search_service_impl.rs @@ -21,6 +21,7 @@ pub struct SearchServiceImpl { } #[component(pub)] +#[interface(dyn SearchService)] impl SearchServiceImpl { pub fn new(remote_repo_reg: Arc) -> Self { Self { remote_repo_reg } diff --git a/src/infra/core/src/sync_service_impl.rs b/src/infra/core/src/sync_service_impl.rs index 49a712dec7..2839d4a819 100644 --- a/src/infra/core/src/sync_service_impl.rs +++ b/src/infra/core/src/sync_service_impl.rs @@ -35,6 +35,7 @@ pub struct SyncServiceImpl { ///////////////////////////////////////////////////////////////////////////////////////// #[component(pub)] +#[interface(dyn SyncService)] impl SyncServiceImpl { pub fn new( remote_repo_reg: Arc, diff --git 
a/src/infra/core/src/testing/metadata_factory.rs b/src/infra/core/src/testing/metadata_factory.rs index 7f09d44446..e1d62fc755 100644 --- a/src/infra/core/src/testing/metadata_factory.rs +++ b/src/infra/core/src/testing/metadata_factory.rs @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. +use std::collections::HashMap; use std::path::Path; use chrono::{DateTime, Utc}; @@ -421,6 +422,15 @@ impl SetTransformBuilder { self } + pub fn set_dataset_ids(mut self, mut ids: HashMap) -> Self { + for input in self.v.inputs.iter_mut() { + if let Some(input_id) = ids.remove(&input.name) { + input.id = Some(input_id) + } + } + self + } + pub fn transform(mut self, transform: Transform) -> Self { self.v.transform = transform; self diff --git a/src/infra/core/src/transform_service_impl.rs b/src/infra/core/src/transform_service_impl.rs index 2c03003ae5..5b5336484e 100644 --- a/src/infra/core/src/transform_service_impl.rs +++ b/src/infra/core/src/transform_service_impl.rs @@ -25,6 +25,7 @@ pub struct TransformServiceImpl { } #[component(pub)] +#[interface(dyn TransformService)] impl TransformServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/src/verification_service_impl.rs b/src/infra/core/src/verification_service_impl.rs index fddd129d22..301bea94d0 100644 --- a/src/infra/core/src/verification_service_impl.rs +++ b/src/infra/core/src/verification_service_impl.rs @@ -24,6 +24,7 @@ pub struct VerificationServiceImpl { } #[component(pub)] +#[interface(dyn VerificationService)] impl VerificationServiceImpl { pub fn new( dataset_repo: Arc, diff --git a/src/infra/core/tests/tests/auth/test_authentication_service.rs b/src/infra/core/tests/tests/auth/test_authentication_service.rs index 321aa6194a..c1776c2263 100644 --- a/src/infra/core/tests/tests/auth/test_authentication_service.rs +++ b/src/infra/core/tests/tests/auth/test_authentication_service.rs @@ -131,11 +131,8 @@ fn make_catalog() -> 
dill::Catalog { dill::CatalogBuilder::new() .add::() - .bind::() .add::() - .bind::() .add::() - .bind::() .add_value(SystemTimeSourceStub::new_set(Utc::now())) .bind::() .build() @@ -147,6 +144,7 @@ struct DummyAuthenticationProviderA {} struct DummyAuthenticationProviderB {} #[dill::component(pub)] +#[dill::interface(dyn AuthenticationProvider)] impl DummyAuthenticationProviderA { fn new() -> Self { Self {} @@ -154,6 +152,7 @@ impl DummyAuthenticationProviderA { } #[dill::component(pub)] +#[dill::interface(dyn AuthenticationProvider)] impl DummyAuthenticationProviderB { fn new() -> Self { Self {} diff --git a/src/infra/core/tests/tests/engine/test_engine_io.rs b/src/infra/core/tests/tests/engine/test_engine_io.rs index 353c4fcc6c..4d9e3ba94f 100644 --- a/src/infra/core/tests/tests/engine/test_engine_io.rs +++ b/src/infra/core/tests/tests/engine/test_engine_io.rs @@ -12,6 +12,8 @@ use std::path::Path; use std::sync::Arc; use container_runtime::ContainerRuntime; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use kamu::domain::*; use kamu::testing::*; @@ -235,15 +237,20 @@ async fn test_engine_io_local_file_mount() { std::fs::create_dir(&run_info_dir).unwrap(); std::fs::create_dir(&cache_dir).unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); test_engine_io_common( vec![Arc::new(ObjectStoreBuilderLocalFs::new())], @@ -272,12 +279,20 @@ async fn test_engine_io_s3_to_local_file_mount_proxy() { let s3_context = 
kamu::utils::s3_context::S3Context::from_url(&s3.url).await; - let dataset_repo = Arc::new(DatasetRepositoryS3::new( - s3_context.clone(), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - )); + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_builder( + DatasetRepositoryS3::builder() + .with_s3_context(s3_context.clone()) + .with_multi_tenant(false), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); test_engine_io_common( vec![ diff --git a/src/infra/core/tests/tests/engine/test_engine_transform.rs b/src/infra/core/tests/tests/engine/test_engine_transform.rs index 91179fb67d..49ba94394a 100644 --- a/src/infra/core/tests/tests/engine/test_engine_transform.rs +++ b/src/infra/core/tests/tests/engine/test_engine_transform.rs @@ -14,6 +14,8 @@ use std::sync::Arc; use chrono::{TimeZone, Utc}; use container_runtime::ContainerRuntime; use datafusion::arrow::record_batch::RecordBatch; +use dill::Component; +use event_bus::EventBus; use futures::StreamExt; use indoc::indoc; use kamu::domain::*; @@ -211,48 +213,46 @@ async fn test_transform_common(transform: Transform) { std::fs::create_dir(&run_info_dir).unwrap(); std::fs::create_dir(&cache_dir).unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); - let engine_provisioner = Arc::new(EngineProvisionerLocal::new( - EngineProvisionerLocalConfig::default(), - ContainerRuntime::default(), - 
dataset_repo.clone(), - run_info_dir.clone(), - )); - - let dataset_action_authorizer = Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()); - - let time_source = Arc::new(SystemTimeSourceStub::new_set( - Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), - )); - - let ingest_svc = PollingIngestServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer.clone(), - engine_provisioner.clone(), - Arc::new(ObjectStoreRegistryImpl::new(vec![Arc::new( + .bind::() + .add_builder( + EngineProvisionerLocal::builder() + .with_config(EngineProvisionerLocalConfig::default()) + .with_container_runtime(ContainerRuntime::default()) + .with_run_info_dir(run_info_dir.clone()), + ) + .bind::() + .add_value(ObjectStoreRegistryImpl::new(vec![Arc::new( ObjectStoreBuilderLocalFs::new(), - )])), - Arc::new(DataFormatRegistryImpl::new()), - Arc::new(ContainerRuntime::default()), - run_info_dir, - cache_dir, - time_source.clone(), - ); + )])) + .bind::() + .add_builder( + PollingIngestServiceImpl::builder() + .with_cache_dir(cache_dir) + .with_run_info_dir(run_info_dir) + .with_container_runtime(Arc::new(ContainerRuntime::default())) + .with_data_format_registry(Arc::new(DataFormatRegistryImpl::new())), + ) + .bind::() + .add::() + .add_value(SystemTimeSourceStub::new_set( + Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), + )) + .bind::() + .build(); - let transform_svc = TransformServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer.clone(), - engine_provisioner.clone(), - time_source.clone(), - ); + let dataset_repo = catalog.get_one::().unwrap(); + let ingest_svc = catalog.get_one::().unwrap(); + let transform_svc = catalog.get_one::().unwrap(); /////////////////////////////////////////////////////////////////////////// // Root setup @@ -340,6 +340,7 @@ async fn test_transform_common(transform: Transform) { let deriv_helper = DatasetHelper::new(dataset.clone(), tempdir.path()); let deriv_data_helper = DatasetDataHelper::new(dataset); + let 
time_source = catalog.get_one::().unwrap(); time_source.set(Utc.with_ymd_and_hms(2050, 1, 2, 12, 0, 0).unwrap()); let res = transform_svc diff --git a/src/infra/core/tests/tests/ingest/test_fetch.rs b/src/infra/core/tests/tests/ingest/test_fetch.rs index 6507020acc..4211a2881d 100644 --- a/src/infra/core/tests/tests/ingest/test_fetch.rs +++ b/src/infra/core/tests/tests/ingest/test_fetch.rs @@ -632,6 +632,7 @@ async fn test_fetch_files_glob() { /////////////////////////////////////////////////////////////////////////////// #[test_group::group(containerized)] +#[ignore] #[test_log::test(tokio::test)] async fn test_fetch_container_ok() { let tempdir = tempfile::tempdir().unwrap(); diff --git a/src/infra/core/tests/tests/ingest/test_polling_ingest.rs b/src/infra/core/tests/tests/ingest/test_polling_ingest.rs index a7644c212c..f8184643a9 100644 --- a/src/infra/core/tests/tests/ingest/test_polling_ingest.rs +++ b/src/infra/core/tests/tests/ingest/test_polling_ingest.rs @@ -14,9 +14,10 @@ use chrono::{TimeZone, Utc}; use container_runtime::ContainerRuntime; use datafusion::parquet::record::RowAccessor; use datafusion::prelude::*; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use itertools::Itertools; -use kamu::domain::auth::DatasetActionAuthorizer; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -955,13 +956,13 @@ async fn test_ingest_polling_datafusion_bad_column_names_rename() { #[test_group::group(engine, ingest, datafusion)] #[test_log::test(tokio::test)] -async fn test_ingest_polling_checks_auth() { - let harness = IngestTestHarness::new_with_authorizer(Arc::new( +async fn test_ingest_checks_auth() { + let harness = IngestTestHarness::new_with_authorizer( MockDatasetActionAuthorizer::new().expect_check_write_dataset( DatasetAlias::new(None, DatasetName::new_unchecked("foo.bar")), 1, ), - )); + ); let src_path = harness.temp_dir.path().join("data.json"); let dataset_snapshot = MetadataFactory::dataset_snapshot() @@ -1006,60 +1007,65 @@ 
async fn test_ingest_polling_checks_auth() { struct IngestTestHarness { temp_dir: TempDir, - dataset_repo: Arc, - ingest_svc: Arc, + dataset_repo: Arc, + ingest_svc: Arc, time_source: Arc, ctx: SessionContext, } impl IngestTestHarness { fn new() -> Self { - Self::new_with_authorizer(Arc::new( - kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new(), - )) + Self::new_with_authorizer(kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new()) } - fn new_with_authorizer(dataset_action_authorizer: Arc) -> Self { + fn new_with_authorizer( + dataset_action_authorizer: TDatasetAuthorizer, + ) -> Self { let temp_dir = tempfile::tempdir().unwrap(); let run_info_dir = temp_dir.path().join("run"); let cache_dir = temp_dir.path().join("cache"); std::fs::create_dir(&run_info_dir).unwrap(); std::fs::create_dir(&cache_dir).unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - temp_dir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer.clone(), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(temp_dir.path().join("datasets")) + .with_multi_tenant(false), + ) + .bind::() + .add_builder( + EngineProvisionerLocal::builder() + .with_config(EngineProvisionerLocalConfig::default()) + .with_container_runtime(ContainerRuntime::default()) + .with_run_info_dir(run_info_dir.clone()), ) - .unwrap(), - ); - - let engine_provisioner = Arc::new(EngineProvisionerLocal::new( - EngineProvisionerLocalConfig::default(), - ContainerRuntime::default(), - dataset_repo.clone(), - run_info_dir.clone(), - )); - - let time_source = Arc::new(SystemTimeSourceStub::new_set( - Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), - )); - - let ingest_svc = Arc::new(PollingIngestServiceImpl::new( - dataset_repo.clone(), - 
dataset_action_authorizer, - engine_provisioner, - Arc::new(ObjectStoreRegistryImpl::new(vec![Arc::new( - ObjectStoreBuilderLocalFs::new(), - )])), - Arc::new(DataFormatRegistryImpl::new()), - Arc::new(ContainerRuntime::default()), - run_info_dir, - cache_dir, - time_source.clone(), - )); + .bind::() + .add_value(SystemTimeSourceStub::new_set( + Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), + )) + .bind::() + .add_builder( + PollingIngestServiceImpl::builder() + .with_cache_dir(cache_dir) + .with_container_runtime(Arc::new(ContainerRuntime::default())) + .with_object_store_registry(Arc::new(ObjectStoreRegistryImpl::new(vec![ + Arc::new(ObjectStoreBuilderLocalFs::new()), + ]))) + .with_data_format_registry(Arc::new(DataFormatRegistryImpl::new())) + .with_run_info_dir(run_info_dir), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); + let ingest_svc = catalog.get_one::().unwrap(); + let time_source = catalog.get_one::().unwrap(); Self { temp_dir, diff --git a/src/infra/core/tests/tests/ingest/test_push_ingest.rs b/src/infra/core/tests/tests/ingest/test_push_ingest.rs index 1cc18e6a66..920bddb2a9 100644 --- a/src/infra/core/tests/tests/ingest/test_push_ingest.rs +++ b/src/infra/core/tests/tests/ingest/test_push_ingest.rs @@ -11,8 +11,9 @@ use std::sync::Arc; use chrono::{TimeZone, Utc}; use datafusion::prelude::*; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; -use kamu::domain::auth::DatasetActionAuthorizer; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -355,49 +356,54 @@ async fn test_ingest_push_media_type_override() { struct IngestTestHarness { temp_dir: TempDir, - dataset_repo: Arc, - push_ingest_svc: Arc, + dataset_repo: Arc, + push_ingest_svc: Arc, ctx: SessionContext, } impl IngestTestHarness { fn new() -> Self { - Self::new_with_authorizer(Arc::new( - kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new(), - )) + 
Self::new_with_authorizer(kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new()) } - fn new_with_authorizer(dataset_action_authorizer: Arc) -> Self { + fn new_with_authorizer( + dataset_action_authorizer: TDatasetAuthorizer, + ) -> Self { let temp_dir = tempfile::tempdir().unwrap(); let run_info_dir = temp_dir.path().join("run"); let cache_dir = temp_dir.path().join("cache"); std::fs::create_dir(&run_info_dir).unwrap(); std::fs::create_dir(&cache_dir).unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - temp_dir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer.clone(), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(temp_dir.path().join("datasets")) + .with_multi_tenant(false), + ) + .bind::() + .add_value(SystemTimeSourceStub::new_set( + Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), + )) + .bind::() + .add_builder( + PushIngestServiceImpl::builder() + .with_object_store_registry(Arc::new(ObjectStoreRegistryImpl::new(vec![ + Arc::new(ObjectStoreBuilderLocalFs::new()), + ]))) + .with_data_format_registry(Arc::new(DataFormatRegistryImpl::new())) + .with_run_info_dir(run_info_dir), ) - .unwrap(), - ); - - let time_source = Arc::new(SystemTimeSourceStub::new_set( - Utc.with_ymd_and_hms(2050, 1, 1, 12, 0, 0).unwrap(), - )); - - let push_ingest_svc = Arc::new(PushIngestServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer, - Arc::new(ObjectStoreRegistryImpl::new(vec![Arc::new( - ObjectStoreBuilderLocalFs::new(), - )])), - Arc::new(DataFormatRegistryImpl::new()), - run_info_dir, - time_source, - )); + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); + let push_ingest_svc = catalog.get_one::().unwrap(); Self { temp_dir, diff --git 
a/src/infra/core/tests/tests/ingest/test_writer.rs b/src/infra/core/tests/tests/ingest/test_writer.rs index af5363efba..e98b7a7a2d 100644 --- a/src/infra/core/tests/tests/ingest/test_writer.rs +++ b/src/infra/core/tests/tests/ingest/test_writer.rs @@ -14,9 +14,11 @@ use std::sync::Arc; use chrono::{DateTime, TimeZone, Utc}; use datafusion::arrow::datatypes::Schema; use datafusion::prelude::*; +use dill::Component; +use event_bus::EventBus; use indoc::indoc; use kamu::testing::MetadataFactory; -use kamu::DatasetRepositoryLocalFs; +use kamu::{DatasetRepositoryLocalFs, DependencyGraphServiceInMemory}; use kamu_core::*; use kamu_data_utils::testing::{assert_data_eq, assert_schema_eq}; use kamu_ingest_datafusion::*; @@ -747,15 +749,20 @@ impl Harness { let temp_dir = tempfile::tempdir().unwrap(); let system_time = Utc.with_ymd_and_hms(2010, 1, 1, 12, 0, 0).unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - temp_dir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - Arc::new(kamu_core::auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(temp_dir.path().join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); + .bind::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); let dataset = dataset_repo .create_dataset( diff --git a/src/infra/core/tests/tests/mod.rs b/src/infra/core/tests/tests/mod.rs index de9f249cb6..5468710857 100644 --- a/src/infra/core/tests/tests/mod.rs +++ b/src/infra/core/tests/tests/mod.rs @@ -11,6 +11,7 @@ mod auth; mod engine; mod ingest; mod repos; +mod test_dependency_graph_inmem; mod test_metadata_chain_comparator; mod test_pull_service_impl; mod test_query_service_impl; diff --git a/src/infra/core/tests/tests/repos/test_dataset_impl.rs 
b/src/infra/core/tests/tests/repos/test_dataset_impl.rs index cfd9e284b3..32ee4cbad9 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_impl.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_impl.rs @@ -9,6 +9,7 @@ use std::assert_matches::assert_matches; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -18,7 +19,9 @@ use opendatafabric::*; async fn test_summary_updates() { let tmp_dir = tempfile::tempdir().unwrap(); let layout = DatasetLayout::create(tmp_dir.path()).unwrap(); - let ds = DatasetFactoryImpl::get_local_fs(layout); + + let catalog = dill::CatalogBuilder::new().add::().build(); + let ds = DatasetFactoryImpl::get_local_fs(layout, catalog.get_one().unwrap()); assert_matches!( ds.get_summary(GetSummaryOpts::default()).await, diff --git a/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs b/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs index c522a03825..1a62169e92 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_repository_local_fs.rs @@ -9,7 +9,9 @@ use std::sync::Arc; -use domain::{auth, CurrentAccountSubject}; +use dill::Component; +use domain::{auth, CurrentAccountSubject, DatasetRepository, DependencyGraphService}; +use event_bus::EventBus; use kamu::testing::MockDatasetActionAuthorizer; use kamu::*; use opendatafabric::AccountName; @@ -19,18 +21,51 @@ use super::test_dataset_repository_shared; ///////////////////////////////////////////////////////////////////////////////////////// -fn local_fs_repo( - tempdir: &TempDir, - dataset_action_authorizer: Arc, - multi_tenant: bool, -) -> DatasetRepositoryLocalFs { - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer, - multi_tenant, - ) - .unwrap() +struct LocalFsRepoHarness { + catalog: dill::Catalog, + dataset_repo: Arc, +} + +impl 
LocalFsRepoHarness { + pub async fn create( + tempdir: &TempDir, + dataset_action_authorizer: TDatasetActionAuthorizer, + multi_tenant: bool, + ) -> Self { + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(multi_tenant), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one().unwrap(); + + Self { + catalog, + dataset_repo, + } + } + + pub async fn dependencies_eager_initialization(&self) { + let dependency_graph_service = self + .catalog + .get_one::() + .unwrap(); + dependency_graph_service + .eager_initialization(&DependencyGraphRepositoryInMemory::new( + self.dataset_repo.clone(), + )) + .await + .unwrap(); + } } ///////////////////////////////////////////////////////////////////////////////////////// @@ -38,13 +73,14 @@ fn local_fs_repo( #[tokio::test] async fn test_create_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), false, - ); + ) + .await; - test_dataset_repository_shared::test_create_dataset(&repo, None).await; + test_dataset_repository_shared::test_create_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -52,14 +88,15 @@ async fn test_create_dataset() { #[tokio::test] async fn test_create_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), true, - ); + ) + .await; 
test_dataset_repository_shared::test_create_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -70,13 +107,17 @@ async fn test_create_dataset_multi_tenant() { #[tokio::test] async fn test_create_dataset_same_name_multiple_tenants() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), true, - ); + ) + .await; - test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants(&repo).await; + test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants( + harness.dataset_repo.as_ref(), + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -84,13 +125,18 @@ async fn test_create_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_create_dataset_from_snapshot() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), false, - ); + ) + .await; - test_dataset_repository_shared::test_create_dataset_from_snapshot(&repo, None).await; + test_dataset_repository_shared::test_create_dataset_from_snapshot( + harness.dataset_repo.as_ref(), + None, + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -98,14 +144,15 @@ async fn test_create_dataset_from_snapshot() { #[tokio::test] async fn test_create_dataset_from_snapshot_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), true, - ); + ) + 
.await; test_dataset_repository_shared::test_create_dataset_from_snapshot( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -116,13 +163,14 @@ async fn test_create_dataset_from_snapshot_multi_tenant() { #[tokio::test] async fn test_rename_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), false, - ); + ) + .await; - test_dataset_repository_shared::test_rename_dataset(&repo, None).await; + test_dataset_repository_shared::test_rename_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -130,14 +178,15 @@ async fn test_rename_dataset() { #[tokio::test] async fn test_rename_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), true, - ); + ) + .await; test_dataset_repository_shared::test_rename_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -148,13 +197,17 @@ async fn test_rename_dataset_multi_tenant() { #[tokio::test] async fn test_rename_dataset_same_name_multiple_tenants() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), true, - ); + ) + .await; - 
test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants(&repo).await; + test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants( + harness.dataset_repo.as_ref(), + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -162,13 +215,14 @@ async fn test_rename_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_rename_unauthorized() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( - &tempdir, - Arc::new(MockDatasetActionAuthorizer::denying()), - true, - ); + let harness = + LocalFsRepoHarness::create(&tempdir, MockDatasetActionAuthorizer::denying(), true).await; - test_dataset_repository_shared::test_rename_dataset_unauthroized(&repo, None).await; + test_dataset_repository_shared::test_rename_dataset_unauthroized( + harness.dataset_repo.as_ref(), + None, + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -176,13 +230,15 @@ async fn test_rename_unauthorized() { #[tokio::test] async fn test_delete_dataset() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), false, - ); + ) + .await; + harness.dependencies_eager_initialization().await; - test_dataset_repository_shared::test_delete_dataset(&repo, None).await; + test_dataset_repository_shared::test_delete_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -190,14 +246,16 @@ async fn test_delete_dataset() { #[tokio::test] async fn test_delete_dataset_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), 
+ auth::AlwaysHappyDatasetActionAuthorizer::new(), true, - ); + ) + .await; + harness.dependencies_eager_initialization().await; test_dataset_repository_shared::test_delete_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -208,13 +266,15 @@ async fn test_delete_dataset_multi_tenant() { #[tokio::test] async fn test_delete_unauthorized() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( - &tempdir, - Arc::new(MockDatasetActionAuthorizer::denying()), - true, - ); + let harness = + LocalFsRepoHarness::create(&tempdir, MockDatasetActionAuthorizer::denying(), true).await; + harness.dependencies_eager_initialization().await; - test_dataset_repository_shared::test_delete_dataset_unauthroized(&repo, None).await; + test_dataset_repository_shared::test_delete_dataset_unauthroized( + harness.dataset_repo.as_ref(), + None, + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -222,13 +282,14 @@ async fn test_delete_unauthorized() { #[tokio::test] async fn test_iterate_datasets() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), false, - ); + ) + .await; - test_dataset_repository_shared::test_iterate_datasets(&repo).await; + test_dataset_repository_shared::test_iterate_datasets(harness.dataset_repo.as_ref()).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -236,13 +297,17 @@ async fn test_iterate_datasets() { #[tokio::test] async fn test_iterate_datasets_multi_tenant() { let tempdir = tempfile::tempdir().unwrap(); - let repo = local_fs_repo( + let harness = LocalFsRepoHarness::create( &tempdir, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + 
auth::AlwaysHappyDatasetActionAuthorizer::new(), true, - ); + ) + .await; - test_dataset_repository_shared::test_iterate_datasets_multi_tenant(&repo).await; + test_dataset_repository_shared::test_iterate_datasets_multi_tenant( + harness.dataset_repo.as_ref(), + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs b/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs index 62f43f1a1e..872b044065 100644 --- a/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs +++ b/src/infra/core/tests/tests/repos/test_dataset_repository_s3.rs @@ -9,28 +9,70 @@ use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::{auth, CurrentAccountSubject}; use kamu::testing::{LocalS3Server, MockDatasetActionAuthorizer}; use kamu::utils::s3_context::S3Context; -use kamu::DatasetRepositoryS3; +use kamu::{ + DatasetRepositoryS3, + DependencyGraphRepositoryInMemory, + DependencyGraphServiceInMemory, +}; +use kamu_core::{DatasetRepository, DependencyGraphService}; use opendatafabric::AccountName; use super::test_dataset_repository_shared; ///////////////////////////////////////////////////////////////////////////////////////// -async fn s3_repo( - s3: &LocalS3Server, - dataset_action_authorizer: Arc, - multi_tenant: bool, -) -> DatasetRepositoryS3 { - let s3_context = S3Context::from_url(&s3.url).await; - DatasetRepositoryS3::new( - s3_context, - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer, - multi_tenant, - ) +struct S3RepoHarness { + catalog: dill::Catalog, + dataset_repo: Arc, +} + +impl S3RepoHarness { + pub async fn create( + s3: &LocalS3Server, + dataset_action_authorizer: TDatasetActionAuthorizer, + multi_tenant: bool, + ) -> Self { + let s3_context = S3Context::from_url(&s3.url).await; + + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + 
.add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_builder( + DatasetRepositoryS3::builder() + .with_s3_context(s3_context) + .with_multi_tenant(multi_tenant), + ) + .bind::() + .build(); + + let dataset_repo = catalog.get_one().unwrap(); + + Self { + catalog, + dataset_repo, + } + } + + pub async fn dependencies_eager_initialization(&self) { + let dependency_graph_service = self + .catalog + .get_one::() + .unwrap(); + dependency_graph_service + .eager_initialization(&DependencyGraphRepositoryInMemory::new( + self.dataset_repo.clone(), + )) + .await + .unwrap(); + } } ///////////////////////////////////////////////////////////////////////////////////////// @@ -39,14 +81,10 @@ async fn s3_repo( #[tokio::test] async fn test_create_dataset() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), false).await; - test_dataset_repository_shared::test_create_dataset(&repo, None).await; + test_dataset_repository_shared::test_create_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -55,15 +93,11 @@ async fn test_create_dataset() { #[tokio::test] async fn test_create_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - true, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), true).await; test_dataset_repository_shared::test_create_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -75,14 +109,13 @@ async fn test_create_dataset_multi_tenant() { #[tokio::test] async fn 
test_create_dataset_same_name_multiple_tenants() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - true, + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), true).await; + + test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants( + harness.dataset_repo.as_ref(), ) .await; - - test_dataset_repository_shared::test_create_dataset_same_name_multiple_tenants(&repo).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -92,14 +125,14 @@ async fn test_create_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_create_dataset_from_snapshot() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), false).await; + + test_dataset_repository_shared::test_create_dataset_from_snapshot( + harness.dataset_repo.as_ref(), + None, ) .await; - - test_dataset_repository_shared::test_create_dataset_from_snapshot(&repo, None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -108,15 +141,11 @@ async fn test_create_dataset_from_snapshot() { #[tokio::test] async fn test_create_dataset_from_snapshot_multi_tenant() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - true, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), true).await; test_dataset_repository_shared::test_create_dataset_from_snapshot( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -128,14 +157,14 @@ async fn test_create_dataset_from_snapshot_multi_tenant() { #[tokio::test] async fn 
test_rename_dataset() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( + let harness = S3RepoHarness::create( &s3, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), false, ) .await; - test_dataset_repository_shared::test_rename_dataset(&repo, None).await; + test_dataset_repository_shared::test_rename_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -144,15 +173,15 @@ async fn test_rename_dataset() { #[tokio::test] async fn test_rename_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( + let harness = S3RepoHarness::create( &s3, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), true, ) .await; test_dataset_repository_shared::test_rename_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -164,14 +193,17 @@ async fn test_rename_dataset_multi_tenant() { #[tokio::test] async fn test_rename_dataset_same_name_multiple_tenants() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( + let harness = S3RepoHarness::create( &s3, - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)), + MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1), true, ) .await; - test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants(&repo).await; + test_dataset_repository_shared::test_rename_dataset_same_name_multiple_tenants( + harness.dataset_repo.as_ref(), + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -180,9 +212,13 @@ async fn test_rename_dataset_same_name_multiple_tenants() { #[tokio::test] async fn test_rename_unauthorized() { let s3 = 
LocalS3Server::new().await; - let repo = s3_repo(&s3, Arc::new(MockDatasetActionAuthorizer::denying()), true).await; + let harness = S3RepoHarness::create(&s3, MockDatasetActionAuthorizer::denying(), true).await; - test_dataset_repository_shared::test_rename_dataset_unauthroized(&repo, None).await; + test_dataset_repository_shared::test_rename_dataset_unauthroized( + harness.dataset_repo.as_ref(), + None, + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -191,14 +227,11 @@ async fn test_rename_unauthorized() { #[tokio::test] async fn test_delete_dataset() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), false).await; + harness.dependencies_eager_initialization().await; - test_dataset_repository_shared::test_delete_dataset(&repo, None).await; + test_dataset_repository_shared::test_delete_dataset(harness.dataset_repo.as_ref(), None).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -207,15 +240,12 @@ async fn test_delete_dataset() { #[tokio::test] async fn test_delete_dataset_multi_tenant() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - true, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), true).await; + harness.dependencies_eager_initialization().await; test_dataset_repository_shared::test_delete_dataset( - &repo, + harness.dataset_repo.as_ref(), Some(AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME)), ) .await; @@ -226,10 +256,15 @@ async fn test_delete_dataset_multi_tenant() { #[test_group::group(containerized)] #[tokio::test] async fn test_delete_unauthorized() { - let s3 = LocalS3Server::new().await; - let repo = 
s3_repo(&s3, Arc::new(MockDatasetActionAuthorizer::denying()), true).await; + let s3: LocalS3Server = LocalS3Server::new().await; + let harness = S3RepoHarness::create(&s3, MockDatasetActionAuthorizer::denying(), true).await; + harness.dependencies_eager_initialization().await; - test_dataset_repository_shared::test_delete_dataset_unauthroized(&repo, None).await; + test_dataset_repository_shared::test_delete_dataset_unauthroized( + harness.dataset_repo.as_ref(), + None, + ) + .await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -238,14 +273,10 @@ async fn test_delete_unauthorized() { #[tokio::test] async fn test_iterate_datasets() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - false, - ) - .await; + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), false).await; - test_dataset_repository_shared::test_iterate_datasets(&repo).await; + test_dataset_repository_shared::test_iterate_datasets(harness.dataset_repo.as_ref()).await; } ///////////////////////////////////////////////////////////////////////////////////////// @@ -254,14 +285,13 @@ async fn test_iterate_datasets() { #[tokio::test] async fn test_iterate_datasets_multi_tenant() { let s3 = LocalS3Server::new().await; - let repo = s3_repo( - &s3, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - true, + let harness = + S3RepoHarness::create(&s3, auth::AlwaysHappyDatasetActionAuthorizer::new(), true).await; + + test_dataset_repository_shared::test_iterate_datasets_multi_tenant( + harness.dataset_repo.as_ref(), ) .await; - - test_dataset_repository_shared::test_iterate_datasets_multi_tenant(&repo).await; } ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_dependency_graph_inmem.rs b/src/infra/core/tests/tests/test_dependency_graph_inmem.rs new file 
mode 100644 index 0000000000..59a965ded2 --- /dev/null +++ b/src/infra/core/tests/tests/test_dependency_graph_inmem.rs @@ -0,0 +1,614 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::collections::HashMap; +use std::sync::Arc; + +use dill::Component; +use event_bus::EventBus; +use futures::{StreamExt, TryStreamExt}; +use internal_error::ResultIntoInternal; +use kamu::testing::MetadataFactory; +use kamu::{ + DatasetRepositoryLocalFs, + DependencyGraphRepositoryInMemory, + DependencyGraphServiceInMemory, +}; +use kamu_core::{ + auth, + CurrentAccountSubject, + DatasetRepository, + DependencyGraphRepository, + DependencyGraphService, +}; +use opendatafabric::{ + AccountName, + DatasetAlias, + DatasetID, + DatasetKind, + DatasetName, + MetadataEvent, +}; +use tempfile::TempDir; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_single_tenant_repository() { + let harness = DependencyGraphHarness::new(false); + + let all_dependencies: Vec<_> = harness.list_all_dependencies().await; + assert_eq!( + DependencyGraphHarness::all_dependencies_report(all_dependencies), + "" + ); + + harness.create_single_tenant_graph().await; + + let all_dependencies: Vec<_> = harness.list_all_dependencies().await; + assert_eq!( + DependencyGraphHarness::all_dependencies_report(all_dependencies), + indoc::indoc!( + r#" + bar -> foo-bar + baz -> foo-baz + foo -> foo-bar + foo -> foo-baz + foo-bar -> foo-bar-foo-baz + foo-baz -> foo-bar-foo-baz"# + ), + ) +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] 
+async fn test_multi_tenant_repository() { + let harness = DependencyGraphHarness::new(true); + + let all_dependencies: Vec<_> = harness.list_all_dependencies().await; + assert_eq!( + DependencyGraphHarness::all_dependencies_report(all_dependencies), + "" + ); + + harness.create_multi_tenant_graph().await; + + let all_dependencies: Vec<_> = harness.list_all_dependencies().await; + assert_eq!( + DependencyGraphHarness::all_dependencies_report(all_dependencies), + indoc::indoc!( + r#" + alice/bar -> alice/foo-bar + alice/foo -> alice/foo-bar + alice/foo -> bob/foo-baz + alice/foo-bar -> eve/foo-bar-foo-baz + bob/baz -> bob/foo-baz + bob/foo-baz -> eve/foo-bar-foo-baz"# + ), + ) +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_service_queries() { + let harness = DependencyGraphHarness::new(false); + harness.create_single_tenant_graph().await; + harness.eager_initialization().await; + + assert_eq!( + harness.dataset_dependencies_report("foo").await, + "[] -> foo -> [foo-bar, foo-baz]" + ); + + assert_eq!( + harness.dataset_dependencies_report("bar").await, + "[] -> bar -> [foo-bar]" + ); + + assert_eq!( + harness.dataset_dependencies_report("baz").await, + "[] -> baz -> [foo-baz]" + ); + + assert_eq!( + harness.dataset_dependencies_report("foo-bar").await, + "[bar, foo] -> foo-bar -> [foo-bar-foo-baz]" + ); + + assert_eq!( + harness.dataset_dependencies_report("foo-baz").await, + "[baz, foo] -> foo-baz -> [foo-bar-foo-baz]" + ); + + assert_eq!( + harness.dataset_dependencies_report("foo-bar-foo-baz").await, + "[foo-bar, foo-baz] -> foo-bar-foo-baz -> []" + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_service_new_datasets() { + let harness = DependencyGraphHarness::new(false); + harness.create_single_tenant_graph().await; + harness.eager_initialization().await; + + 
harness.create_root_dataset(None, "test-root").await; + + assert_eq!( + harness.dataset_dependencies_report("test-root").await, + "[] -> test-root -> []" + ); + assert_eq!( + harness.dataset_dependencies_report("foo").await, + "[] -> foo -> [foo-bar, foo-baz]" + ); + + harness + .create_derived_dataset( + None, + "test-deriv", + vec![ + DatasetAlias::new(None, DatasetName::new_unchecked("foo")), + DatasetAlias::new(None, DatasetName::new_unchecked("test-root")), + ], + ) + .await; + + assert_eq!( + harness.dataset_dependencies_report("test-root").await, + "[] -> test-root -> [test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("test-deriv").await, + "[foo, test-root] -> test-deriv -> []" + ); + assert_eq!( + harness.dataset_dependencies_report("foo").await, + "[] -> foo -> [foo-bar, foo-baz, test-deriv]" + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_service_derived_dataset_modifies_links() { + let harness = DependencyGraphHarness::new(false); + harness.create_single_tenant_graph().await; + harness.eager_initialization().await; + + assert_eq!( + harness.dataset_dependencies_report("bar").await, + "[] -> bar -> [foo-bar]" + ); + assert_eq!( + harness.dataset_dependencies_report("baz").await, + "[] -> baz -> [foo-baz]" + ); + + // Initially "test-deriv" will have 2 upstream dependencies: "bar" and "baz" + harness + .create_derived_dataset( + None, + "test-deriv", + vec![ + DatasetAlias::new(None, DatasetName::new_unchecked("bar")), + DatasetAlias::new(None, DatasetName::new_unchecked("baz")), + ], + ) + .await; + + assert_eq!( + harness.dataset_dependencies_report("bar").await, + "[] -> bar -> [foo-bar, test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("baz").await, + "[] -> baz -> [foo-baz, test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("test-deriv").await, + "[bar, baz] -> test-deriv -> []" + ); + + // 
Drop "baz" dependency + harness + .modify_derived_dataset( + None, + "test-deriv", + vec![DatasetAlias::new(None, DatasetName::new_unchecked("bar"))], + ) + .await; + + // Confirm we only have "bar" left + assert_eq!( + harness.dataset_dependencies_report("bar").await, + "[] -> bar -> [foo-bar, test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("baz").await, + "[] -> baz -> [foo-baz]" + ); + assert_eq!( + harness.dataset_dependencies_report("test-deriv").await, + "[bar] -> test-deriv -> []" + ); + + // Add "baz" dependency back + harness + .modify_derived_dataset( + None, + "test-deriv", + vec![ + DatasetAlias::new(None, DatasetName::new_unchecked("bar")), + DatasetAlias::new(None, DatasetName::new_unchecked("baz")), + ], + ) + .await; + + // Confirm we have both "bar" and "baz" now + assert_eq!( + harness.dataset_dependencies_report("bar").await, + "[] -> bar -> [foo-bar, test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("baz").await, + "[] -> baz -> [foo-baz, test-deriv]" + ); + assert_eq!( + harness.dataset_dependencies_report("test-deriv").await, + "[bar, baz] -> test-deriv -> []" + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_service_dataset_deleted() { + let harness = DependencyGraphHarness::new(false); + harness.create_single_tenant_graph().await; + harness.eager_initialization().await; + + assert_eq!( + harness.dataset_dependencies_report("foo-bar").await, + "[bar, foo] -> foo-bar -> [foo-bar-foo-baz]" + ); + assert_eq!( + harness.dataset_dependencies_report("foo-baz").await, + "[baz, foo] -> foo-baz -> [foo-bar-foo-baz]" + ); + assert_eq!( + harness.dataset_dependencies_report("foo-bar-foo-baz").await, + "[foo-bar, foo-baz] -> foo-bar-foo-baz -> []" + ); + + harness + .dataset_repo + .delete_dataset( + &DatasetAlias::new(None, DatasetName::new_unchecked("foo-bar-foo-baz")).as_local_ref(), + ) + .await + .unwrap(); + +
assert_eq!( + harness.dataset_dependencies_report("foo-bar").await, + "[bar, foo] -> foo-bar -> []" + ); + assert_eq!( + harness.dataset_dependencies_report("foo-baz").await, + "[baz, foo] -> foo-baz -> []" + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +struct DependencyGraphHarness { + _workdir: TempDir, + _catalog: dill::Catalog, + dataset_repo: Arc, + dependency_graph_service: Arc, + dependency_graph_repository: Arc, +} + +impl DependencyGraphHarness { + fn new(multi_tenant: bool) -> Self { + let workdir = tempfile::tempdir().unwrap(); + let datasets_dir = workdir.path().join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(datasets_dir) + .with_multi_tenant(multi_tenant), + ) + .bind::() + .add_value(CurrentAccountSubject::new_test()) + .add::() + .add::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); + + let dependency_graph_service = catalog.get_one::().unwrap(); + + // Note: don't place into catalog, avoid cyclic dependency + let dependency_graph_repository = + Arc::new(DependencyGraphRepositoryInMemory::new(dataset_repo.clone())); + + Self { + _workdir: workdir, + _catalog: catalog, + dataset_repo, + dependency_graph_service, + dependency_graph_repository, + } + } + + async fn list_all_dependencies(&self) -> Vec<(String, String)> { + let dependencies: Vec<_> = self + .dependency_graph_repository + .list_dependencies_of_all_datasets() + .try_collect() + .await + .unwrap(); + + let mut res = Vec::new(); + for (downstream_id, upstream_id) in dependencies { + let downstream_hdl = self + .dataset_repo + .resolve_dataset_ref(&downstream_id.as_local_ref()) + .await + .unwrap(); + let upstream_hdl = self + .dataset_repo + .resolve_dataset_ref(&upstream_id.as_local_ref()) + .await + .unwrap(); + + res.push(( + format!("{}", upstream_hdl.alias), + 
format!("{}", downstream_hdl.alias), + )); + } + + res.sort(); + res + } + + fn all_dependencies_report(dependencies: Vec<(String, String)>) -> String { + dependencies + .iter() + .map(|(name1, name2)| format!("{name1} -> {name2}")) + .collect::>() + .join("\n") + } + + async fn eager_initialization(&self) { + self.dependency_graph_service + .eager_initialization(self.dependency_graph_repository.as_ref()) + .await + .unwrap(); + } + + async fn dataset_dependencies_report(&self, dataset_name: &str) -> String { + let downstream = self.get_downstream_dependencies(dataset_name).await; + let upstream = self.get_upstream_dependencies(dataset_name).await; + + format!( + "[{}] -> {} -> [{}]", + upstream.join(", "), + dataset_name, + downstream.join(", "), + ) + } + + async fn get_downstream_dependencies(&self, dataset_name: &str) -> Vec { + let dataset_id = self.dataset_id_by_name(dataset_name).await; + + let downstream_dataset_ids: Vec<_> = self + .dependency_graph_service + .get_downstream_dependencies(&dataset_id) + .await + .int_err() + .unwrap() + .collect() + .await; + + let mut res = Vec::new(); + for downstream_dataset_id in downstream_dataset_ids { + let dataset_alias = self.dataset_alias_by_id(&downstream_dataset_id).await; + res.push(format!("{dataset_alias}")); + } + + res.sort(); + res + } + + async fn get_upstream_dependencies(&self, dataset_name: &str) -> Vec { + let dataset_id = self.dataset_id_by_name(dataset_name).await; + + let upstream_dataset_ids: Vec<_> = self + .dependency_graph_service + .get_upstream_dependencies(&dataset_id) + .await + .int_err() + .unwrap() + .collect() + .await; + + let mut res = Vec::new(); + for upstream_dataset_id in upstream_dataset_ids { + let dataset_alias = self.dataset_alias_by_id(&upstream_dataset_id).await; + res.push(format!("{dataset_alias}")); + } + + res.sort(); + res + } + + async fn dataset_id_by_name(&self, dataset_name: &str) -> DatasetID { + let dataset_alias = DatasetAlias::try_from(dataset_name).unwrap(); + 
let dataset_hdl = self + .dataset_repo + .resolve_dataset_ref(&dataset_alias.as_local_ref()) + .await + .unwrap(); + dataset_hdl.id + } + + async fn dataset_alias_by_id(&self, dataset_id: &DatasetID) -> DatasetAlias { + let dataset_ref = dataset_id.as_local_ref(); + let dataset_hdl = self + .dataset_repo + .resolve_dataset_ref(&dataset_ref) + .await + .unwrap(); + dataset_hdl.alias + } + + async fn create_single_tenant_graph(&self) { + self.create_graph(|_| None).await; + } + + async fn create_multi_tenant_graph(&self) { + let alice = AccountName::new_unchecked("alice"); + let bob = AccountName::new_unchecked("bob"); + let eve: AccountName = AccountName::new_unchecked("eve"); + + let mut dataset_accounts: HashMap<&'static str, AccountName> = HashMap::new(); + dataset_accounts.insert("foo", alice.clone()); + dataset_accounts.insert("bar", alice.clone()); + dataset_accounts.insert("baz", bob.clone()); + dataset_accounts.insert("foo-bar", alice); + dataset_accounts.insert("foo-baz", bob); + dataset_accounts.insert("foo-bar-foo-baz", eve); + + self.create_graph(|dataset_name| dataset_accounts.get(dataset_name).cloned()) + .await; + } + + async fn create_graph(&self, account_getter: impl Fn(&str) -> Option) { + self.create_root_dataset(account_getter("foo"), "foo").await; + self.create_root_dataset(account_getter("bar"), "bar").await; + self.create_root_dataset(account_getter("baz"), "baz").await; + + self.create_derived_dataset( + account_getter("foo-bar"), + "foo-bar", + vec![ + DatasetAlias::new(account_getter("foo"), DatasetName::new_unchecked("foo")), + DatasetAlias::new(account_getter("bar"), DatasetName::new_unchecked("bar")), + ], + ) + .await; + + self.create_derived_dataset( + account_getter("foo-baz"), + "foo-baz", + vec![ + DatasetAlias::new(account_getter("foo"), DatasetName::new_unchecked("foo")), + DatasetAlias::new(account_getter("baz"), DatasetName::new_unchecked("baz")), + ], + ) + .await; + + self.create_derived_dataset( + 
account_getter("foo-bar-foo-baz"), + "foo-bar-foo-baz", + vec![ + DatasetAlias::new( + account_getter("foo-bar"), + DatasetName::new_unchecked("foo-bar"), + ), + DatasetAlias::new( + account_getter("foo-baz"), + DatasetName::new_unchecked("foo-baz"), + ), + ], + ) + .await; + } + + async fn create_root_dataset(&self, account_name: Option, dataset_name: &str) { + self.dataset_repo + .create_dataset_from_snapshot( + account_name, + MetadataFactory::dataset_snapshot() + .name(DatasetName::new_unchecked(dataset_name)) + .kind(DatasetKind::Root) + .push_event(MetadataFactory::set_polling_source().build()) + .build(), + ) + .await + .unwrap(); + } + + async fn create_derived_dataset( + &self, + account_name: Option, + dataset_name: &str, + input_aliases: Vec, + ) { + self.dataset_repo + .create_dataset_from_snapshot( + account_name, + MetadataFactory::dataset_snapshot() + .name(DatasetName::new_unchecked(dataset_name)) + .kind(DatasetKind::Derivative) + .push_event(MetadataFactory::set_transform_aliases(input_aliases).build()) + .build(), + ) + .await + .unwrap(); + } + + async fn modify_derived_dataset( + &self, + account_name: Option, + dataset_name: &str, + input_aliases: Vec, + ) { + let dataset_alias = + DatasetAlias::new(account_name, DatasetName::new_unchecked(dataset_name)); + + let dataset = self + .dataset_repo + .get_dataset(&dataset_alias.as_local_ref()) + .await + .unwrap(); + + let mut id_map = HashMap::new(); + for input_alias in &input_aliases { + id_map.insert( + input_alias.dataset_name.clone(), + self.dataset_id_by_name(input_alias.dataset_name.as_str()) + .await, + ); + } + + dataset + .commit_event( + MetadataEvent::SetTransform( + MetadataFactory::set_transform_aliases(input_aliases) + .set_dataset_ids(id_map) + .build(), + ), + Default::default(), + ) + .await + .unwrap(); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/core/tests/tests/test_pull_service_impl.rs 
b/src/infra/core/tests/tests/test_pull_service_impl.rs index 25806695d0..81bd2c26cd 100644 --- a/src/infra/core/tests/tests/test_pull_service_impl.rs +++ b/src/infra/core/tests/tests/test_pull_service_impl.rs @@ -13,6 +13,8 @@ use std::path::Path; use std::sync::{Arc, Mutex}; use chrono::prelude::*; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -153,12 +155,13 @@ async fn create_graph( // TODO: Rewrite this abomination async fn create_graph_in_repository( + event_bus: Arc, repo_path: &Path, datasets: Vec<(DatasetAlias, Vec)>, ) { for (dataset_alias, deps) in datasets { let layout = DatasetLayout::create(repo_path.join(&dataset_alias.dataset_name)).unwrap(); - let ds = DatasetFactoryImpl::get_local_fs(layout); + let ds = DatasetFactoryImpl::get_local_fs(layout, event_bus.clone()); let chain = ds.as_metadata_chain(); if deps.is_empty() { @@ -220,13 +223,14 @@ async fn create_graph_in_repository( // dir and syncing it into the main workspace. 
TODO: Add simpler way to import // remote dataset async fn create_graph_remote( + event_bus: Arc, dataset_repo: Arc, reg: Arc, datasets: Vec<(DatasetAlias, Vec)>, to_import: Vec, ) { let tmp_repo_dir = tempfile::tempdir().unwrap(); - create_graph_in_repository(tmp_repo_dir.path(), datasets).await; + create_graph_in_repository(event_bus.clone(), tmp_repo_dir.path(), datasets).await; let tmp_repo_name = RepoName::new_unchecked("tmp"); @@ -243,6 +247,7 @@ async fn create_graph_remote( Arc::new(DatasetFactoryImpl::new( IpfsGateway::default(), Arc::new(auth::DummyOdfServerAccessTokenResolver::new()), + event_bus, )), Arc::new(DummySmartTransferProtocolClient::new()), Arc::new(kamu::utils::ipfs_wrapper::IpfsClient::default()), @@ -430,6 +435,7 @@ async fn test_pull_batching_complex_with_remote() { // C --------/ / // D -----------/ create_graph_remote( + harness.event_bus.clone(), harness.dataset_repo.clone(), harness.remote_repo_reg.clone(), vec![ @@ -804,11 +810,9 @@ async fn test_set_watermark() { let tmp_dir = tempfile::tempdir().unwrap(); let harness = PullTestHarness::new_with_authorizer( tmp_dir.path(), - Arc::new( - MockDatasetActionAuthorizer::new().expect_check_write_dataset( - DatasetAlias::new(None, DatasetName::new_unchecked("foo")), - 4, - ), + MockDatasetActionAuthorizer::new().expect_check_write_dataset( + DatasetAlias::new(None, DatasetName::new_unchecked("foo")), + 4, ), false, ); @@ -872,7 +876,7 @@ async fn test_set_watermark_unauthorized() { let tmp_dir = tempfile::tempdir().unwrap(); let harness = PullTestHarness::new_with_authorizer( tmp_dir.path(), - Arc::new(MockDatasetActionAuthorizer::denying()), + MockDatasetActionAuthorizer::denying(), true, ); @@ -899,51 +903,59 @@ struct PullTestHarness { calls: Arc>>, dataset_repo: Arc, remote_repo_reg: Arc, - remote_alias_reg: Arc, - pull_svc: PullServiceImpl, + remote_alias_reg: Arc, + pull_svc: Arc, + event_bus: Arc, } impl PullTestHarness { fn new(tmp_path: &Path, multi_tenant: bool) -> Self { 
Self::new_with_authorizer( tmp_path, - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), + auth::AlwaysHappyDatasetActionAuthorizer::new(), multi_tenant, ) } - fn new_with_authorizer( + fn new_with_authorizer( tmp_path: &Path, - dataset_action_authorizer: Arc, + dataset_action_authorizer: TDatasetAuthorizer, multi_tenant: bool, ) -> Self { let calls = Arc::new(Mutex::new(Vec::new())); - let current_account_config = Arc::new(CurrentAccountSubject::new_test()); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tmp_path.join("datasets"), - current_account_config.clone(), - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - multi_tenant, + + let datasets_dir_path = tmp_path.join("datasets"); + std::fs::create_dir(&datasets_dir_path).unwrap(); + + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(datasets_dir_path) + .with_multi_tenant(multi_tenant), ) - .unwrap(), - ); - let remote_repo_reg = - Arc::new(RemoteRepositoryRegistryImpl::create(tmp_path.join("repos")).unwrap()); - let remote_alias_reg = Arc::new(RemoteAliasesRegistryImpl::new(dataset_repo.clone())); - let ingest_svc = Arc::new(TestIngestService::new(calls.clone())); - let transform_svc = Arc::new(TestTransformService::new(calls.clone())); - let sync_svc = Arc::new(TestSyncService::new(calls.clone(), dataset_repo.clone())); - - let pull_svc = PullServiceImpl::new( - dataset_repo.clone(), - remote_alias_reg.clone(), - ingest_svc, - transform_svc, - sync_svc, - current_account_config, - dataset_action_authorizer, - ); + .bind::() + .add_value(RemoteRepositoryRegistryImpl::create(tmp_path.join("repos")).unwrap()) + .bind::() + .add::() + .add_value(TestIngestService::new(calls.clone())) + .bind::() + .add_value(TestTransformService::new(calls.clone())) + .bind::() + 
.add_builder(TestSyncService::builder().with_calls(calls.clone())) + .bind::() + .add::() + .build(); + + let dataset_repo = catalog.get_one::().unwrap(); + let remote_repo_reg = catalog.get_one::().unwrap(); + let remote_alias_reg = catalog.get_one::().unwrap(); + let pull_svc = catalog.get_one::().unwrap(); + let event_bus = catalog.get_one::().unwrap(); Self { calls, @@ -951,6 +963,7 @@ impl PullTestHarness { remote_repo_reg, remote_alias_reg, pull_svc, + event_bus, } } @@ -1202,6 +1215,7 @@ struct TestSyncService { dataset_repo: Arc, } +#[dill::component(pub)] impl TestSyncService { fn new(calls: Arc>>, dataset_repo: Arc) -> Self { Self { diff --git a/src/infra/core/tests/tests/test_query_service_impl.rs b/src/infra/core/tests/tests/test_query_service_impl.rs index 0b587f22f4..84b81950ec 100644 --- a/src/infra/core/tests/tests/test_query_service_impl.rs +++ b/src/infra/core/tests/tests/test_query_service_impl.rs @@ -14,6 +14,8 @@ use std::sync::Arc; use datafusion::arrow::array::*; use datafusion::arrow::datatypes::{DataType, Field, Int64Type, Schema}; use datafusion::arrow::record_batch::RecordBatch; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::{ LocalS3Server, @@ -107,18 +109,17 @@ async fn create_catalog_with_local_workspace( dataset_action_authorizer: MockDatasetActionAuthorizer, ) -> dill::Catalog { dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - dill::builder_for::() + DatasetRepositoryLocalFs::builder() .with_root(tempdir.join("datasets")) .with_multi_tenant(false), ) .bind::() .add::() - .bind::() .add::() - .bind::() - .add_value(ObjectStoreBuilderLocalFs::new()) - .bind::() + .add::() .add_value(CurrentAccountSubject::new_test()) .add_value(dataset_action_authorizer) .bind::() @@ -135,18 +136,17 @@ async fn create_catalog_with_s3_workspace( let s3_context = S3Context::from_items(endpoint.clone(), bucket, key_prefix).await; dill::CatalogBuilder::new() + .add::() + .add::() .add_builder( - 
dill::builder_for::() + DatasetRepositoryS3::builder() .with_s3_context(s3_context.clone()) .with_multi_tenant(false), ) .bind::() .add::() - .bind::() .add::() - .bind::() - .add_value(ObjectStoreBuilderLocalFs::new()) - .bind::() + .add::() .add_value(ObjectStoreBuilderS3::new(s3_context, true)) .bind::() .add_value(CurrentAccountSubject::new_test()) diff --git a/src/infra/core/tests/tests/test_reset_service_impl.rs b/src/infra/core/tests/tests/test_reset_service_impl.rs index 43a18d7097..fa4d53cd44 100644 --- a/src/infra/core/tests/tests/test_reset_service_impl.rs +++ b/src/infra/core/tests/tests/test_reset_service_impl.rs @@ -10,6 +10,8 @@ use std::assert_matches::assert_matches; use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -102,27 +104,31 @@ impl ChainWith2BlocksTestCase { struct ResetTestHarness { _temp_dir: TempDir, - dataset_repo: Arc, - reset_svc: ResetServiceImpl, + dataset_repo: Arc, + reset_svc: Arc, } impl ResetTestHarness { fn new() -> Self { let tempdir = tempfile::tempdir().unwrap(); - let dataset_authorizer = - Arc::new(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)); - - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_authorizer.clone(), - false, + + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(MockDatasetActionAuthorizer::new().expect_check_write_a_dataset(1)) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); + .bind::() + .add::() + .build(); - let reset_svc = ResetServiceImpl::new(dataset_repo.clone(), dataset_authorizer.clone()); + let dataset_repo = catalog.get_one::().unwrap(); + let reset_svc = catalog.get_one::().unwrap(); Self { _temp_dir: 
tempdir, diff --git a/src/infra/core/tests/tests/test_search_service_impl.rs b/src/infra/core/tests/tests/test_search_service_impl.rs index 3f6819e8c2..5855e85e97 100644 --- a/src/infra/core/tests/tests/test_search_service_impl.rs +++ b/src/infra/core/tests/tests/test_search_service_impl.rs @@ -9,8 +9,9 @@ use std::assert_matches::assert_matches; use std::path::Path; -use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::*; use kamu::*; @@ -25,32 +26,32 @@ async fn do_test_search(tmp_workspace_dir: &Path, repo_url: Url) { let repo_name = RepoName::new_unchecked("repo"); let dataset_remote_alias = DatasetAliasRemote::try_from("repo/bar").unwrap(); - let dataset_action_authorizer = Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()); - - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tmp_workspace_dir.join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer.clone(), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tmp_workspace_dir.join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); - let remote_repo_reg = - Arc::new(RemoteRepositoryRegistryImpl::create(tmp_workspace_dir.join("repos")).unwrap()); - let sync_svc = SyncServiceImpl::new( - remote_repo_reg.clone(), - dataset_repo.clone(), - dataset_action_authorizer.clone(), - Arc::new(DatasetFactoryImpl::new( - IpfsGateway::default(), - Arc::new(auth::DummyOdfServerAccessTokenResolver::new()), - )), - Arc::new(DummySmartTransferProtocolClient::new()), - Arc::new(kamu::utils::ipfs_wrapper::IpfsClient::default()), - ); + .bind::() + .add::() + .add_value(RemoteRepositoryRegistryImpl::create(tmp_workspace_dir.join("repos")).unwrap()) + .bind::() + .add_value(IpfsGateway::default()) + .add_value(kamu::utils::ipfs_wrapper::IpfsClient::default()) + .add::() + 
.add::() + .add::() + .add::() + .add::() + .build(); - let search_svc = SearchServiceImpl::new(remote_repo_reg.clone()); + let remote_repo_reg = catalog.get_one::().unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); + let sync_svc = catalog.get_one::().unwrap(); + let search_svc = catalog.get_one::().unwrap(); // Add repository remote_repo_reg diff --git a/src/infra/core/tests/tests/test_sync_service_impl.rs b/src/infra/core/tests/tests/test_sync_service_impl.rs index f05e23c80d..44b938fe61 100644 --- a/src/infra/core/tests/tests/test_sync_service_impl.rs +++ b/src/infra/core/tests/tests/test_sync_service_impl.rs @@ -10,8 +10,9 @@ use std::assert_matches::assert_matches; use std::path::Path; use std::str::FromStr; -use std::sync::Arc; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::*; use kamu::utils::ipfs_wrapper::IpfsClient; @@ -115,14 +116,12 @@ fn construct_authorizer( authorization_expectations: AuthorizationExpectations, d1_alias: &DatasetAlias, d2_alias: &DatasetAlias, -) -> Arc { - let authorizer = MockDatasetActionAuthorizer::new() +) -> impl auth::DatasetActionAuthorizer { + MockDatasetActionAuthorizer::new() .expect_check_read_dataset(d1_alias.clone(), authorization_expectations.d1_reads) .expect_check_read_dataset(d2_alias.clone(), authorization_expectations.d2_reads) .expect_check_write_dataset(d1_alias.clone(), authorization_expectations.d1_writes) - .expect_check_write_dataset(d2_alias.clone(), authorization_expectations.d2_writes); - - Arc::new(authorizer) + .expect_check_write_dataset(d2_alias.clone(), authorization_expectations.d2_writes) } ///////////////////////////////////////////////////////////////////////////////////////// @@ -139,7 +138,7 @@ async fn do_test_sync( let (ipfs_gateway, ipfs_client) = ipfs.unwrap_or_default(); - let dataset_action_authorizer = construct_authorizer( + let dataset_authorizer = construct_authorizer( AuthorizationExpectations { d1_reads: 7, d2_reads: 2, @@ -150,30 
+149,32 @@ async fn do_test_sync( &dataset_alias_2, ); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tmp_workspace_dir.join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer.clone(), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(ipfs_gateway) + .add_value(ipfs_client) + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_authorizer) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tmp_workspace_dir.join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); - let remote_repo_reg = - Arc::new(RemoteRepositoryRegistryImpl::create(tmp_workspace_dir.join("repos")).unwrap()); - let dataset_factory = Arc::new(DatasetFactoryImpl::new( - ipfs_gateway, - Arc::new(auth::DummyOdfServerAccessTokenResolver::new()), - )); - - let sync_svc = SyncServiceImpl::new( - remote_repo_reg.clone(), - dataset_repo.clone(), - dataset_action_authorizer.clone(), - dataset_factory, - Arc::new(DummySmartTransferProtocolClient::new()), - Arc::new(ipfs_client), - ); + .bind::() + .add_builder( + RemoteRepositoryRegistryImpl::builder().with_repos_dir(tmp_workspace_dir.join("repos")), + ) + .bind::() + .add::() + .add::() + .add::() + .add::() + .build(); + + let sync_svc = catalog.get_one::().unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); // Dataset does not exist locally / remotely ////////////////////////////// assert_matches!( diff --git a/src/infra/core/tests/tests/test_transform_service_impl.rs b/src/infra/core/tests/tests/test_transform_service_impl.rs index 2f97c74b5c..c73af59d26 100644 --- a/src/infra/core/tests/tests/test_transform_service_impl.rs +++ b/src/infra/core/tests/tests/test_transform_service_impl.rs @@ -11,6 +11,8 @@ use std::assert_matches::assert_matches; use std::sync::Arc; use chrono::{TimeZone, Utc}; +use dill::Component; +use event_bus::EventBus; use futures::TryStreamExt; use kamu::domain::engine::*; use 
kamu::domain::*; @@ -26,30 +28,39 @@ use crate::mock_engine_provisioner; struct TransformTestHarness { _tempdir: TempDir, dataset_repo: Arc, - transform_service: TransformServiceImpl, + transform_service: Arc, } impl TransformTestHarness { - pub fn new_custom( - dataset_action_authorizer: Arc, - engine_provisioner: Arc, + pub fn new_custom< + TAuthorizer: auth::DatasetActionAuthorizer + 'static, + TEngineProvisioner: EngineProvisioner + 'static, + >( + dataset_action_authorizer: TAuthorizer, + engine_provisioner: TEngineProvisioner, ) -> Self { let tempdir = tempfile::tempdir().unwrap(); - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_action_authorizer.clone(), - false, + + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value(dataset_action_authorizer) + .bind::() + .add_value(engine_provisioner) + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), ) - .unwrap(), - ); - let transform_service = TransformServiceImpl::new( - dataset_repo.clone(), - dataset_action_authorizer.clone(), - engine_provisioner, - Arc::new(SystemTimeSourceDefault), - ); + .add::() + .bind::() + .add::() + .build(); + + let transform_service = catalog.get_one::().unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); Self { _tempdir: tempdir, @@ -60,8 +71,8 @@ impl TransformTestHarness { pub fn new() -> Self { Self::new_custom( - Arc::new(auth::AlwaysHappyDatasetActionAuthorizer::new()), - Arc::new(EngineProvisionerNull), + auth::AlwaysHappyDatasetActionAuthorizer::new(), + EngineProvisionerNull, ) } @@ -230,8 +241,8 @@ async fn test_transform_enforces_authorization() { ); let harness = TransformTestHarness::new_custom( - Arc::new(mock_dataset_action_authorizer), - 
Arc::new(mock_engine_provisioner::MockEngineProvisioner::new().stub_provision_engine()), + mock_dataset_action_authorizer, + mock_engine_provisioner::MockEngineProvisioner::new().stub_provision_engine(), ); let foo = harness.new_root("foo").await; @@ -252,8 +263,8 @@ async fn test_transform_enforces_authorization() { #[test_log::test(tokio::test)] async fn test_transform_unauthorized() { let harness = TransformTestHarness::new_custom( - Arc::new(MockDatasetActionAuthorizer::denying()), - Arc::new(EngineProvisionerNull), + MockDatasetActionAuthorizer::denying(), + EngineProvisionerNull, ); let foo = harness.new_root("foo").await; diff --git a/src/infra/core/tests/tests/test_verification_service_impl.rs b/src/infra/core/tests/tests/test_verification_service_impl.rs index e3aebc76a1..2347fe25bf 100644 --- a/src/infra/core/tests/tests/test_verification_service_impl.rs +++ b/src/infra/core/tests/tests/test_verification_service_impl.rs @@ -13,6 +13,8 @@ use std::sync::{Arc, Mutex}; use datafusion::arrow::array::{Array, Int32Array, StringArray}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; +use dill::Component; +use event_bus::EventBus; use kamu::domain::*; use kamu::testing::{MetadataFactory, MockDatasetActionAuthorizer, ParquetWriterHelper}; use kamu::*; @@ -26,25 +28,27 @@ async fn test_verify_data_consistency() { let dataset_alias = DatasetAlias::new(None, DatasetName::new_unchecked("bar")); - let dataset_authorizer = Arc::new( - MockDatasetActionAuthorizer::new().expect_check_read_dataset(dataset_alias.clone(), 3), - ); - - let dataset_repo = Arc::new( - DatasetRepositoryLocalFs::create( - tempdir.path().join("datasets"), - Arc::new(CurrentAccountSubject::new_test()), - dataset_authorizer.clone(), - false, + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add_value(CurrentAccountSubject::new_test()) + .add_value( + 
MockDatasetActionAuthorizer::new().expect_check_read_dataset(dataset_alias.clone(), 3), ) - .unwrap(), - ); - - let verification_svc = Arc::new(VerificationServiceImpl::new( - dataset_repo.clone(), - dataset_authorizer.clone(), - Arc::new(TestTransformService::new(Arc::new(Mutex::new(Vec::new())))), - )); + .bind::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(tempdir.path().join("datasets")) + .with_multi_tenant(false), + ) + .bind::() + .add_value(TestTransformService::new(Arc::new(Mutex::new(Vec::new())))) + .bind::() + .add::() + .build(); + + let verification_svc = catalog.get_one::().unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); dataset_repo .create_dataset_from_snapshot( diff --git a/src/infra/core/tests/utils/dummy_smart_transfer_protocol_client.rs b/src/infra/core/tests/utils/dummy_smart_transfer_protocol_client.rs index 4f92415bd1..f9eec4082b 100644 --- a/src/infra/core/tests/utils/dummy_smart_transfer_protocol_client.rs +++ b/src/infra/core/tests/utils/dummy_smart_transfer_protocol_client.rs @@ -15,6 +15,8 @@ use kamu::utils::smart_transfer_protocol::{ObjectTransferOptions, SmartTransferP use opendatafabric::Multihash; use url::Url; +#[dill::component] +#[dill::interface(dyn SmartTransferProtocolClient)] pub struct DummySmartTransferProtocolClient {} impl DummySmartTransferProtocolClient { diff --git a/src/infra/flow-system-inmem/Cargo.toml b/src/infra/flow-system-inmem/Cargo.toml new file mode 100644 index 0000000000..fcbbe60448 --- /dev/null +++ b/src/infra/flow-system-inmem/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "kamu-flow-system-inmem" +description = "In-memory implementation of the flows management for scheduled dataset and system activities" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +readme = { workspace = true } +license-file = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } 
+edition = { workspace = true } +publish = { workspace = true } + + +[lib] +doctest = false + + +[dependencies] +opendatafabric = { workspace = true } +kamu = { workspace = true } +kamu-core = { workspace = true } +kamu-task-system = { workspace = true } +kamu-flow-system = { workspace = true } +event-bus = { workspace = true } + +async-stream = "0.3" +async-trait = { version = "0.1", default-features = false } +chrono = { version = "0.4", default-features = false } +dill = "0.8" +futures = "0.3" +thiserror = { version = "1", default-features = false } +tokio = { version = "1", default-features = false, features=[] } +tokio-stream = { version = "0.1", default-features = false } +tracing = { version = "0.1", default-features = false } +url = { version = "2", default-features = false, features = ["serde"] } + +# TODO: Make serde optional +serde = { version = "1", default-features = false, features = ["derive"] } +serde_with = { version = "3", default-features = false } + + +[dev-dependencies] +kamu-task-system-inmem = { workspace = true } + +env_logger = "0.10" +mockall = "0.11" +tempfile = "3" +test-log = { version = "0.2", features = ["trace"] } +tokio = { version = "1", default-features = false, features=["rt", "macros"] } +tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/src/infra/flow-system-inmem/src/dataset_flow_key.rs b/src/infra/flow-system-inmem/src/dataset_flow_key.rs new file mode 100644 index 0000000000..66403f7e18 --- /dev/null +++ b/src/infra/flow-system-inmem/src/dataset_flow_key.rs @@ -0,0 +1,78 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::borrow::Borrow; +use std::hash::{Hash, Hasher}; + +use kamu_flow_system::{DatasetFlowType, FlowKeyDataset}; +use opendatafabric::DatasetID; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub(crate) struct BorrowedFlowKeyDataset<'a> { + dataset_id: &'a DatasetID, + flow_type: DatasetFlowType, +} + +impl<'a> BorrowedFlowKeyDataset<'a> { + pub fn new(dataset_id: &'a DatasetID, flow_type: DatasetFlowType) -> Self { + Self { + dataset_id, + flow_type, + } + } + + pub fn as_trait(&self) -> &dyn BorrowedFlowKeyDatasetHelper { + self as &dyn BorrowedFlowKeyDatasetHelper + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub(crate) trait BorrowedFlowKeyDatasetHelper { + fn borrowed_key(&self) -> BorrowedFlowKeyDataset; +} + +impl BorrowedFlowKeyDatasetHelper for FlowKeyDataset { + fn borrowed_key(&self) -> BorrowedFlowKeyDataset { + BorrowedFlowKeyDataset { + dataset_id: &self.dataset_id, + flow_type: self.flow_type, + } + } +} + +impl BorrowedFlowKeyDatasetHelper for BorrowedFlowKeyDataset<'_> { + fn borrowed_key(&self) -> BorrowedFlowKeyDataset { + *self + } +} + +impl<'a> Borrow for FlowKeyDataset { + fn borrow(&self) -> &(dyn BorrowedFlowKeyDatasetHelper + 'a) { + self + } +} + +impl Eq for (dyn BorrowedFlowKeyDatasetHelper + '_) {} + +impl PartialEq for (dyn BorrowedFlowKeyDatasetHelper + '_) { + fn eq(&self, other: &dyn BorrowedFlowKeyDatasetHelper) -> bool { + self.borrowed_key().eq(&other.borrowed_key()) + } +} + +impl<'a> Hash for (dyn BorrowedFlowKeyDatasetHelper + 'a) { + fn hash(&self, state: &mut H) { + self.borrowed_key().hash(state) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/lib.rs b/src/infra/flow-system-inmem/src/lib.rs new file mode 100644 index 0000000000..eb6c14fa20 --- /dev/null +++ 
b/src/infra/flow-system-inmem/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +#![feature(let_chains)] +#![feature(async_closure)] + +// Re-exports +pub use kamu_flow_system as domain; + +mod dataset_flow_key; +mod repos; +mod services; + +pub use repos::*; +pub use services::*; diff --git a/src/infra/flow-system-inmem/src/repos/flow/flow_event_store_inmem.rs b/src/infra/flow-system-inmem/src/repos/flow/flow_event_store_inmem.rs new file mode 100644 index 0000000000..0a7dd362d0 --- /dev/null +++ b/src/infra/flow-system-inmem/src/repos/flow/flow_event_store_inmem.rs @@ -0,0 +1,325 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::hash_map::Entry; +use std::collections::HashMap; + +use dill::*; +use kamu_flow_system::*; +use opendatafabric::DatasetID; + +use crate::dataset_flow_key::BorrowedFlowKeyDataset; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct FlowEventStoreInMem { + inner: EventStoreInMemory, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +struct State { + events: Vec, + typed_flows_by_dataset: HashMap>, + all_flows_by_dataset: HashMap>, + system_flows_by_type: HashMap>, + all_flows: Vec, + last_flow_id: Option, +} + +impl State { + fn next_flow_id(&mut self) -> FlowID { + let next_flow_id = if let Some(last_flow_id) = self.last_flow_id { + let id: u64 = last_flow_id.into(); + FlowID::new(id + 1) + } else { + FlowID::new(0) + }; + self.last_flow_id = Some(next_flow_id); + next_flow_id + } +} + +impl EventStoreState for State { + fn events_count(&self) -> usize { + self.events.len() + } + + fn get_events(&self) -> &[FlowEvent] { + &self.events + } + + fn add_event(&mut self, event: FlowEvent) { + self.events.push(event); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn FlowEventStore)] +#[scope(Singleton)] +impl FlowEventStoreInMem { + pub fn new() -> Self { + Self { + inner: EventStoreInMemory::new(), + } + } + + fn update_index(state: &mut State, event: &FlowEvent) { + if let FlowEvent::Initiated(e) = &event { + match &e.flow_key { + FlowKey::Dataset(flow_key) => { + let typed_entries = match state.typed_flows_by_dataset.entry(flow_key.clone()) { + Entry::Occupied(v) => v.into_mut(), + Entry::Vacant(v) => v.insert(Vec::default()), + }; + typed_entries.push(event.flow_id()); + + let all_dataset_entries = match state + .all_flows_by_dataset + .entry(flow_key.dataset_id.clone()) + { + Entry::Occupied(v) => v.into_mut(), + Entry::Vacant(v) => 
v.insert(Vec::default()), + }; + all_dataset_entries.push(event.flow_id()); + } + + FlowKey::System(flow_key) => { + let entries = match state.system_flows_by_type.entry(flow_key.flow_type) { + Entry::Occupied(v) => v.into_mut(), + Entry::Vacant(v) => v.insert(Vec::default()), + }; + entries.push(event.flow_id()); + } + } + + state.all_flows.push(event.flow_id()); + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl EventStore for FlowEventStoreInMem { + #[tracing::instrument(level = "debug", skip_all)] + async fn len(&self) -> Result { + self.inner.len().await + } + + #[tracing::instrument(level = "debug", skip_all, fields(%query, ?opts))] + fn get_events<'a>(&'a self, query: &FlowID, opts: GetEventsOpts) -> EventStream<'a, FlowEvent> { + self.inner.get_events(query, opts) + } + + #[tracing::instrument(level = "debug", skip_all, fields(%query, num_events = events.len()))] + async fn save_events( + &self, + query: &FlowID, + events: Vec, + ) -> Result { + { + let state = self.inner.as_state(); + let mut g = state.lock().unwrap(); + for event in &events { + Self::update_index(&mut g, &event); + } + } + + self.inner.save_events(query, events).await + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl FlowEventStore for FlowEventStoreInMem { + #[tracing::instrument(level = "debug", skip_all)] + fn new_flow_id(&self) -> FlowID { + self.inner.as_state().lock().unwrap().next_flow_id() + } + + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id, ?flow_type))] + fn get_last_dataset_flow_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Option { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.typed_flows_by_dataset + .get(BorrowedFlowKeyDataset::new(&dataset_id, flow_type).as_trait()) + .and_then(|flows| flows.last().cloned()) + } + 
+ #[tracing::instrument(level = "debug", skip_all, fields(?flow_type))] + fn get_last_system_flow_of_type(&self, flow_type: SystemFlowType) -> Option { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.system_flows_by_type + .get(&flow_type) + .and_then(|flows| flows.last().cloned()) + } + + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id, ?flow_type))] + fn get_flows_by_dataset_of_type<'a>( + &'a self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> FlowIDStream<'a> { + let dataset_id = dataset_id.clone(); + + // TODO: This should be a buffered stream so we don't lock per record + Box::pin(async_stream::try_stream! { + let borrowed_key = BorrowedFlowKeyDataset::new(&dataset_id, flow_type); + + let mut pos = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.typed_flows_by_dataset.get(borrowed_key.as_trait()).map(|flows| flows.len()).unwrap_or(0) + }; + + loop { + if pos == 0 { + break; + } + + pos -= 1; + + let next = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.typed_flows_by_dataset + .get(borrowed_key.as_trait()) + .and_then(|flows| flows.get(pos).cloned()) + }; + + let flow_id = match next { + None => break, + Some(flow_id) => flow_id, + }; + + yield flow_id; + } + }) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?flow_type))] + fn get_system_flows_of_type<'a>(&'a self, flow_type: SystemFlowType) -> FlowIDStream<'a> { + let mut pos = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.system_flows_by_type + .get(&flow_type) + .map(|flows| flows.len()) + .unwrap_or(0) + }; + + // TODO: This should be a buffered stream so we don't lock per record + Box::pin(async_stream::try_stream! 
{ + loop { + if pos == 0 { + break; + } + + pos -= 1; + + let next = { + let state = self.inner.as_state(); + let g: std::sync::MutexGuard<'_, State> = state.lock().unwrap(); + g.system_flows_by_type + .get(&flow_type) + .and_then(|flows| flows.get(pos).cloned()) + }; + + let flow_id = match next { + None => break, + Some(flow_id) => flow_id, + }; + + yield flow_id; + } + }) + } + + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id))] + fn get_all_flows_by_dataset<'a>(&'a self, dataset_id: &DatasetID) -> FlowIDStream<'a> { + let dataset_id = dataset_id.clone(); + + // TODO: This should be a buffered stream so we don't lock per record + Box::pin(async_stream::try_stream! { + let mut pos = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.all_flows_by_dataset.get(&dataset_id).map(|flows| flows.len()).unwrap_or(0) + }; + + loop { + if pos == 0 { + break; + } + + pos -= 1; + + let next = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.all_flows_by_dataset + .get(&dataset_id) + .and_then(|flows| flows.get(pos).cloned()) + }; + + let flow_id = match next { + None => break, + Some(flow_id) => flow_id, + }; + + yield flow_id; + } + }) + } + + #[tracing::instrument(level = "debug", skip_all)] + fn get_all_flows<'a>(&'a self) -> FlowIDStream<'a> { + // TODO: This should be a buffered stream so we don't lock per record + Box::pin(async_stream::try_stream! 
{ + let mut pos = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.all_flows.len() + }; + + loop { + if pos == 0 { + break; + } + + pos -= 1; + + let next = { + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.all_flows.get(pos).cloned() + }; + + let flow_id = match next { + None => break, + Some(flow_id) => flow_id, + }; + + yield flow_id; + } + }) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/repos/flow/mod.rs b/src/infra/flow-system-inmem/src/repos/flow/mod.rs new file mode 100644 index 0000000000..370c975ab0 --- /dev/null +++ b/src/infra/flow-system-inmem/src/repos/flow/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_event_store_inmem; + +pub use flow_event_store_inmem::*; diff --git a/src/infra/flow-system-inmem/src/repos/flow_configuration/flow_configuration_event_store_inmem.rs b/src/infra/flow-system-inmem/src/repos/flow_configuration/flow_configuration_event_store_inmem.rs new file mode 100644 index 0000000000..a64c8db44c --- /dev/null +++ b/src/infra/flow-system-inmem/src/repos/flow_configuration/flow_configuration_event_store_inmem.rs @@ -0,0 +1,104 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::HashSet; + +use dill::*; +use kamu_core::DatasetIDStream; +use kamu_flow_system::*; +use opendatafabric::DatasetID; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct FlowConfigurationEventStoreInMem { + inner: EventStoreInMemory, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +struct State { + events: Vec, + dataset_ids: HashSet, +} + +impl EventStoreState for State { + fn events_count(&self) -> usize { + self.events.len() + } + + fn get_events(&self) -> &[FlowConfigurationEvent] { + &self.events + } + + fn add_event(&mut self, event: FlowConfigurationEvent) { + self.events.push(event); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn FlowConfigurationEventStore)] +#[scope(Singleton)] +impl FlowConfigurationEventStoreInMem { + pub fn new() -> Self { + Self { + inner: EventStoreInMemory::new(), + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl EventStore for FlowConfigurationEventStoreInMem { + #[tracing::instrument(level = "debug", skip_all)] + async fn len(&self) -> Result { + self.inner.len().await + } + + #[tracing::instrument(level = "debug", skip_all, fields(?query, ?opts))] + fn get_events<'a>( + &'a self, + query: &FlowKey, + opts: GetEventsOpts, + ) -> EventStream<'a, FlowConfigurationEvent> { + self.inner.get_events(query, opts) + } + + #[tracing::instrument(level = "debug", skip_all, fields(?query, num_events = events.len()))] + async fn save_events( + &self, + query: &FlowKey, + events: Vec, + ) -> Result { + if let FlowKey::Dataset(flow_key) = query { + let state = self.inner.as_state(); + let mut g = state.lock().unwrap(); + g.dataset_ids.insert(flow_key.dataset_id.clone()); + } + + self.inner.save_events(query, events).await 
+ } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +impl FlowConfigurationEventStore for FlowConfigurationEventStoreInMem { + #[tracing::instrument(level = "debug", skip_all)] + fn list_all_dataset_ids<'a>(&'a self) -> DatasetIDStream<'a> { + // TODO: re-consider performance impact + Box::pin(tokio_stream::iter( + self.inner.as_state().lock().unwrap().dataset_ids.clone(), + )) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/repos/flow_configuration/mod.rs b/src/infra/flow-system-inmem/src/repos/flow_configuration/mod.rs new file mode 100644 index 0000000000..a04b950048 --- /dev/null +++ b/src/infra/flow-system-inmem/src/repos/flow_configuration/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration_event_store_inmem; + +pub use flow_configuration_event_store_inmem::*; diff --git a/src/infra/flow-system-inmem/src/repos/mod.rs b/src/infra/flow-system-inmem/src/repos/mod.rs new file mode 100644 index 0000000000..0ba9707f2c --- /dev/null +++ b/src/infra/flow-system-inmem/src/repos/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +mod flow; +mod flow_configuration; + +pub use flow::*; +pub use flow_configuration::*; diff --git a/src/infra/flow-system-inmem/src/services/flow/active_configs_state.rs b/src/infra/flow-system-inmem/src/services/flow/active_configs_state.rs new file mode 100644 index 0000000000..8312c693c5 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow/active_configs_state.rs @@ -0,0 +1,93 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::collections::HashMap; + +use kamu_flow_system::*; +use opendatafabric::DatasetID; + +use crate::dataset_flow_key::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +pub(crate) struct ActiveConfigsState { + dataset_schedules: HashMap, + system_schedules: HashMap, + dataset_start_conditions: HashMap, +} + +impl ActiveConfigsState { + pub fn add_dataset_flow_config( + &mut self, + flow_key: &FlowKeyDataset, + rule: FlowConfigurationRule, + ) { + let key = flow_key.clone(); + match rule { + FlowConfigurationRule::Schedule(schedule) => { + self.dataset_schedules.insert(key, schedule); + } + FlowConfigurationRule::StartCondition(condition) => { + self.dataset_start_conditions.insert(key, condition); + } + } + } + + pub fn add_system_flow_config(&mut self, flow_type: SystemFlowType, schedule: Schedule) { + self.system_schedules.insert(flow_type, schedule); + } + + pub fn drop_dataset_configs(&mut self, dataset_id: &DatasetID) { + for flow_type in DatasetFlowType::all() { + self.drop_dataset_flow_config(BorrowedFlowKeyDataset::new(dataset_id, *flow_type)); + } + } + + pub fn drop_flow_config(&mut self, flow_key: &FlowKey) { + match flow_key { + 
FlowKey::Dataset(flow_key) => { + self.drop_dataset_flow_config(flow_key.borrowed_key()); + } + FlowKey::System(flow_key) => { + self.system_schedules.remove(&flow_key.flow_type); + } + } + } + + fn drop_dataset_flow_config(&mut self, flow_key: BorrowedFlowKeyDataset) { + self.dataset_schedules.remove(flow_key.as_trait()); + self.dataset_start_conditions.remove(flow_key.as_trait()); + } + + pub fn try_get_flow_schedule(&self, flow_key: &FlowKey) -> Option { + match flow_key { + FlowKey::Dataset(flow_key) => self + .dataset_schedules + .get( + BorrowedFlowKeyDataset::new(&flow_key.dataset_id, flow_key.flow_type) + .as_trait(), + ) + .cloned(), + FlowKey::System(flow_key) => self.system_schedules.get(&flow_key.flow_type).cloned(), + } + } + + pub fn try_get_dataset_start_condition( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Option { + self.dataset_start_conditions + .get(BorrowedFlowKeyDataset::new(&dataset_id, flow_type).as_trait()) + .cloned() + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/services/flow/flow_service_inmem.rs b/src/infra/flow-system-inmem/src/services/flow/flow_service_inmem.rs new file mode 100644 index 0000000000..5012bcf665 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow/flow_service_inmem.rs @@ -0,0 +1,868 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +///////////////////////////////////////////////////////////////////////////////////////// + +use std::sync::{Arc, Mutex}; + +use chrono::{DateTime, DurationRound, Utc}; +use dill::*; +use event_bus::{AsyncEventHandler, EventBus}; +use futures::TryStreamExt; +use kamu_core::events::DatasetEventDeleted; +use kamu_core::{DependencyGraphService, InternalError, SystemTimeSource}; +use kamu_flow_system::*; +use kamu_task_system::*; +use opendatafabric::{AccountID, AccountName, DatasetID}; +use tokio_stream::StreamExt; + +use super::active_configs_state::ActiveConfigsState; +use super::flow_time_wheel::FlowTimeWheel; +use super::pending_flows_state::PendingFlowsState; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct FlowServiceInMemory { + state: Arc>, + run_config: Arc, + event_bus: Arc, + flow_event_store: Arc, + time_source: Arc, + task_scheduler: Arc, + flow_configuration_service: Arc, + dependency_graph_service: Arc, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +struct State { + active_configs: ActiveConfigsState, + pending_flows: PendingFlowsState, + time_wheel: FlowTimeWheel, + running: bool, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn FlowService)] +#[interface(dyn AsyncEventHandler)] +#[interface(dyn AsyncEventHandler)] +#[interface(dyn AsyncEventHandler)] +#[scope(Singleton)] +impl FlowServiceInMemory { + pub fn new( + run_config: Arc, + event_bus: Arc, + flow_event_store: Arc, + time_source: Arc, + task_scheduler: Arc, + flow_configuration_service: Arc, + dependency_graph_service: Arc, + ) -> Self { + Self { + state: Arc::new(Mutex::new(State::default())), + run_config, + event_bus, + flow_event_store, + time_source, + task_scheduler, + flow_configuration_service, + dependency_graph_service, + } + } + + fn round_time(&self, time: DateTime) 
-> Result, InternalError> { + let rounded_time = time + .duration_round(self.run_config.awaiting_step) + .int_err()?; + Ok(rounded_time) + } + + #[tracing::instrument(level = "debug", skip_all)] + async fn run_current_timeslot(&self) { + let planned_flows: Vec<_> = { + let mut state = self.state.lock().unwrap(); + state.time_wheel.take_nearest_planned_flows() + }; + + let planned_task_futures: Vec<_> = planned_flows + .iter() + .map(async move |flow_id| { + let mut flow = Flow::load(*flow_id, self.flow_event_store.as_ref()) + .await + .int_err()?; + + self.schedule_flow_task(&mut flow).await?; + Ok(()) + }) + .collect(); + + let results = futures::future::join_all(planned_task_futures).await; + results + .into_iter() + .filter(|res| res.is_err()) + .map(|e| e.err().unwrap()) + .for_each(|e: InternalError| { + tracing::error!(error=?e, "Scheduling flow failed"); + }); + } + + #[tracing::instrument(level = "debug", skip_all)] + async fn initialize_auto_polling_flows_from_configurations( + &self, + start_time: DateTime, + ) -> Result<(), InternalError> { + let enabled_configurations: Vec<_> = self + .flow_configuration_service + .list_enabled_configurations() + .try_collect() + .await + .int_err()?; + + for enabled_config in enabled_configurations { + self.activate_flow_configuration( + start_time, + enabled_config.flow_key, + enabled_config.rule, + ) + .await?; + } + + Ok(()) + } + + #[tracing::instrument(level = "trace", skip_all, fields(?flow_key, ?rule))] + async fn activate_flow_configuration( + &self, + start_time: DateTime, + flow_key: FlowKey, + rule: FlowConfigurationRule, + ) -> Result<(), InternalError> { + match &flow_key { + FlowKey::Dataset(dataset_flow_key) => { + if let FlowConfigurationRule::Schedule(schedule) = &rule { + self.enqueue_auto_polling_flow(start_time, &flow_key, schedule) + .await?; + } + + let mut state = self.state.lock().unwrap(); + state + .active_configs + .add_dataset_flow_config(&dataset_flow_key, rule); + } + 
FlowKey::System(system_flow_key) => { + if let FlowConfigurationRule::Schedule(schedule) = &rule { + self.enqueue_auto_polling_flow(start_time, &flow_key, schedule) + .await?; + + let mut state = self.state.lock().unwrap(); + state + .active_configs + .add_system_flow_config(system_flow_key.flow_type, schedule.clone()); + } else { + unimplemented!("Doubt will ever need to schedule system flows via conditions") + } + } + } + + Ok(()) + } + + #[tracing::instrument(level = "trace", skip_all, fields(?flow_key))] + async fn try_enqueue_auto_polling_flow_if_enabled( + &self, + start_time: DateTime, + flow_key: &FlowKey, + ) -> Result<(), InternalError> { + let maybe_active_schedule = self + .state + .lock() + .unwrap() + .active_configs + .try_get_flow_schedule(flow_key); + + if let Some(active_schedule) = maybe_active_schedule { + self.enqueue_auto_polling_flow(start_time, flow_key, &active_schedule) + .await?; + } + + Ok(()) + } + + #[tracing::instrument(level = "trace", skip_all, fields(?flow_key, ?schedule))] + async fn enqueue_auto_polling_flow( + &self, + start_time: DateTime, + flow_key: &FlowKey, + schedule: &Schedule, + ) -> Result { + let trigger = FlowTrigger::AutoPolling(FlowTriggerAutoPolling {}); + + match self.find_pending_flow(flow_key) { + // If flow is already pending, simply merge triggers + Some(flow_id) => self.merge_secondary_flow_trigger(flow_id, trigger).await, + + // Otherwise, initiate a new flow, and enqueue it in the time wheel + None => { + let mut flow = self.make_new_flow(flow_key.clone(), trigger).await?; + + let next_activation_time = schedule.next_activation_time(start_time); + self.enqueue_flow(flow.flow_id, next_activation_time)?; + + flow.activate_at_time(self.time_source.now(), next_activation_time) + .int_err()?; + + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + + Ok(flow.into()) + } + } + } + + #[tracing::instrument(level = "trace", skip_all, fields(%dataset_id, ?flow_type, %flow_id))] + async fn 
enqueue_dependent_dataset_flows( + &self, + start_time: DateTime, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + flow_id: FlowID, + ) -> Result<(), InternalError> { + // Note: this is applicable to dataset updates only + assert!(flow_type.is_dataset_update()); + + // Extract list of downstream 1 level datasets + let dependent_dataset_ids: Vec<_> = self + .dependency_graph_service + .get_downstream_dependencies(dataset_id) + .await + .int_err()? + .collect() + .await; + + // For each, scan if flows configurations are on + for dependent_dataset_id in dependent_dataset_ids { + let maybe_dependent_start_condition = self + .state + .lock() + .unwrap() + .active_configs + .try_get_dataset_start_condition(&dependent_dataset_id, flow_type); + + if let Some(start_condition) = maybe_dependent_start_condition { + let trigger = FlowTrigger::InputDatasetFlow(FlowTriggerInputDatasetFlow { + input_dataset_id: dataset_id.clone(), + input_flow_type: flow_type, + input_flow_id: flow_id, + }); + + let flow_key = FlowKeyDataset::new(dependent_dataset_id.clone(), flow_type).into(); + match self.find_pending_flow(&flow_key) { + // If flow is already pending for this dataset, simply merge triggers + Some(dependent_flow_id) => { + self.merge_secondary_flow_trigger(dependent_flow_id, trigger) + .await?; + } + + // Otherwise, initiate a new update according to start condition rules + None => { + let mut dependent_dataset_flow = + self.make_new_flow(flow_key, trigger).await?; + + if start_condition.minimal_data_batch.is_some() { + unimplemented!("Data batching not supported yet in scheduler") + } + + if let Some(throttling_period) = start_condition.throttling_period { + // TODO: throttle not from NOW, + // but from last flow of the dependent dataset + let activation_time = start_time + throttling_period; + self.enqueue_flow(dependent_dataset_flow.flow_id, activation_time)?; + + dependent_dataset_flow + .activate_at_time(self.time_source.now(), activation_time) + .int_err()?; +
+ dependent_dataset_flow + .define_start_condition( + self.time_source.now(), + FlowStartCondition::Throttling(FlowStartConditionThrottling { + interval: throttling_period, + }), + ) + .int_err()?; + } else { + self.enqueue_flow(dependent_dataset_flow.flow_id, start_time)?; + + dependent_dataset_flow + .activate_at_time(self.time_source.now(), start_time) + .int_err()?; + } + + dependent_dataset_flow + .save(self.flow_event_store.as_ref()) + .await + .int_err()?; + } + } + } + } + + Ok(()) + } + + fn find_pending_flow(&self, flow_key: &FlowKey) -> Option { + let state = self.state.lock().unwrap(); + state.pending_flows.try_get_pending_flow(flow_key) + } + + #[tracing::instrument(level = "trace", skip_all, fields(?flow_key, ?trigger))] + async fn make_new_flow( + &self, + flow_key: FlowKey, + trigger: FlowTrigger, + ) -> Result { + let flow = Flow::new( + self.time_source.now(), + self.flow_event_store.new_flow_id(), + flow_key, + trigger, + ); + + let mut state = self.state.lock().unwrap(); + state + .pending_flows + .add_pending_flow(flow.flow_key.clone(), flow.flow_id); + + Ok(flow) + } + + #[tracing::instrument(level = "trace", skip_all, fields(%flow_id, ?trigger))] + async fn merge_secondary_flow_trigger( + &self, + flow_id: FlowID, + trigger: FlowTrigger, + ) -> Result { + let mut flow = Flow::load(flow_id, self.flow_event_store.as_ref()) + .await + .int_err()?; + flow.add_trigger(self.time_source.now(), trigger) + .int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + Ok(flow.into()) + } + + #[tracing::instrument(level = "trace", skip_all, fields(%flow_id, %activation_time))] + fn enqueue_flow( + &self, + flow_id: FlowID, + activation_time: DateTime, + ) -> Result<(), InternalError> { + self.state + .lock() + .unwrap() + .time_wheel + .activate_at(activation_time, flow_id)?; + Ok(()) + } + + #[tracing::instrument(level = "trace", skip_all, fields(flow_id = %flow.flow_id))] + async fn schedule_flow_task(&self, flow: &mut Flow) -> 
Result<(), InternalError> { + let logical_plan = match &flow.flow_key { + FlowKey::Dataset(flow_key) => match flow_key.flow_type { + DatasetFlowType::Ingest | DatasetFlowType::ExecuteQuery => { + LogicalPlan::UpdateDataset(UpdateDataset { + dataset_id: flow_key.dataset_id.clone(), + }) + } + DatasetFlowType::Compaction => unimplemented!(), + }, + FlowKey::System(flow_key) => { + match flow_key.flow_type { + // TODO: replace on correct logical plan + SystemFlowType::GC => LogicalPlan::Probe(Probe { + dataset_id: None, + busy_time: Some(std::time::Duration::from_secs(20)), + end_with_outcome: Some(TaskOutcome::Success), + }), + } + } + }; + + let task = self + .task_scheduler + .create_task(logical_plan) + .await + .int_err()?; + + flow.on_task_scheduled(self.time_source.now(), task.task_id) + .int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + + let mut state = self.state.lock().unwrap(); + state + .pending_flows + .track_flow_task(flow.flow_id, task.task_id); + + Ok(()) + } + + async fn abort_flow(&self, flow_id: FlowID) -> Result<(), InternalError> { + let mut flow = Flow::load(flow_id, self.flow_event_store.as_ref()) + .await + .int_err()?; + flow.abort(self.time_source.now()).int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl FlowService for FlowServiceInMemory { + /// Runs the update main loop + #[tracing::instrument(level = "info", skip_all)] + async fn run(&self, planned_start_time: DateTime) -> Result<(), InternalError> { + // Mark running started + self.state.lock().unwrap().running = true; + + // Initial scheduling + let start_time = self.round_time(planned_start_time)?; + self.initialize_auto_polling_flows_from_configurations(start_time) + .await?; + + // Publish progress event + self.event_bus + .dispatch_event(FlowServiceEvent::ConfigurationLoaded( + 
FlowServiceEventConfigurationLoaded { + event_time: start_time, + }, + )) + .await + .int_err()?; + + // Main scanning loop + let main_loop_span = tracing::debug_span!("FlowService main loop"); + let _ = main_loop_span.enter(); + let std_awaiting_step = self.run_config.awaiting_step.to_std().int_err()?; + + loop { + // Do we have a timeslot scheduled? + let maybe_nearest_activation_time = { + let state = self.state.lock().unwrap(); + state.time_wheel.nearest_activation_moment() + }; + + // Is it time to execute it yet? + let current_time = self.time_source.now(); + if let Some(nearest_activation_time) = maybe_nearest_activation_time + && nearest_activation_time <= current_time + { + // Run scheduling for current time slot. Should not throw any errors + self.run_current_timeslot().await; + + // Publish progress event + self.event_bus + .dispatch_event(FlowServiceEvent::ExecutedTimeSlot( + FlowServiceEventExecutedTimeSlot { + event_time: nearest_activation_time, + }, + )) + .await + .int_err()?; + } + + tokio::time::sleep(std_awaiting_step).await; + continue; + } + } + + /// Triggers the specified flow manually, unless it's already waiting + #[tracing::instrument( + level = "debug", + skip_all, + fields(?flow_key, %initiator_account_id, %initiator_account_name) + )] + async fn trigger_manual_flow( + &self, + trigger_time: DateTime, + flow_key: FlowKey, + initiator_account_id: AccountID, + initiator_account_name: AccountName, + ) -> Result { + let trigger = FlowTrigger::Manual(FlowTriggerManual { + initiator_account_id, + initiator_account_name, + }); + + match self.find_pending_flow(&flow_key) { + // If flow is already pending, simply merge triggers + Some(flow_id) => self + .merge_secondary_flow_trigger(flow_id, trigger) + .await + .map_err(|e| RequestFlowError::Internal(e)), + + // Otherwise, initiate a new flow and activate it at the nearest scheduler slot + None => { + let mut flow = self.make_new_flow(flow_key, trigger).await?; + let activation_time = 
self.round_time(trigger_time)?; + self.enqueue_flow(flow.flow_id, activation_time)?; + + flow.activate_at_time(self.time_source.now(), activation_time) + .int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + Ok(flow.into()) + } + } + } + + /// Returns states of flows of certian type associated with a given dataset + /// ordered by creation time from newest to oldest + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id, ?flow_type))] + fn list_flows_by_dataset_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Result { + let dataset_id = dataset_id.clone(); + + Ok(Box::pin(async_stream::try_stream! { + let relevant_flows: Vec<_> = self + .flow_event_store + .get_flows_by_dataset_of_type(&dataset_id, flow_type) + .try_collect() + .await?; + + for flow_id in relevant_flows.into_iter() { + let flow = Flow::load(flow_id, self.flow_event_store.as_ref()).await.int_err()?; + + yield flow.into(); + } + })) + } + + /// Returns states of system flows of certian type + /// ordered by creation time from newest to oldest + #[tracing::instrument(level = "debug", skip_all, fields(?flow_type))] + fn list_system_flows_of_type( + &self, + flow_type: SystemFlowType, + ) -> Result { + Ok(Box::pin(async_stream::try_stream! { + let relevant_flows: Vec<_> = self + .flow_event_store + .get_system_flows_of_type(flow_type) + .try_collect() + .await?; + + for flow_id in relevant_flows.into_iter() { + let flow = Flow::load(flow_id, self.flow_event_store.as_ref()).await.int_err()?; + + yield flow.into(); + } + })) + } + + /// Returns states of flows of any type associated with a given dataset + /// ordered by creation time from newest to oldest + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id))] + fn list_all_flows_by_dataset( + &self, + dataset_id: &DatasetID, + ) -> Result { + let dataset_id = dataset_id.clone(); + + Ok(Box::pin(async_stream::try_stream! 
{ + let relevant_flows: Vec<_> = self + .flow_event_store + .get_all_flows_by_dataset(&dataset_id) + .try_collect() + .await?; + + for flow_id in relevant_flows.into_iter() { + let flow = Flow::load(flow_id, self.flow_event_store.as_ref()).await.int_err()?; + + yield flow.into(); + } + })) + } + + /// Returns state of all flows, whether they are system-level or + /// dataset-bound, ordered by creation time from newest to oldest + #[tracing::instrument(level = "debug", skip_all)] + fn list_all_flows(&self) -> Result { + Ok(Box::pin(async_stream::try_stream! { + let all_flows: Vec<_> = self + .flow_event_store + .get_all_flows() + .try_collect() + .await?; + + for flow_id in all_flows.into_iter() { + let flow = Flow::load(flow_id, self.flow_event_store.as_ref()).await.int_err()?; + + yield flow.into(); + } + })) + } + + /// Returns state of the latest flow of certain type created for the given + /// dataset + #[tracing::instrument(level = "debug", skip_all, fields(%dataset_id, ?flow_type))] + async fn get_last_flow_by_dataset_of_type( + &self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Result, GetLastDatasetFlowError> { + let res = match self + .flow_event_store + .get_last_dataset_flow_of_type(dataset_id, flow_type) + { + Some(flow_id) => Some(self.get_flow(flow_id).await.int_err()?), + None => None, + }; + Ok(res) + } + + /// Returns state of the latest system flow of certain type + #[tracing::instrument(level = "debug", skip_all, fields(?flow_type))] + async fn get_last_system_flow_of_type( + &self, + flow_type: SystemFlowType, + ) -> Result, GetLastSystemtFlowError> { + let res = match self + .flow_event_store + .get_last_system_flow_of_type(flow_type) + { + Some(flow_id) => Some(self.get_flow(flow_id).await.int_err()?), + None => None, + }; + Ok(res) + } + + /// Returns current state of a given flow + #[tracing::instrument(level = "debug", skip_all, fields(%flow_id))] + async fn get_flow(&self, flow_id: FlowID) -> Result { + let flow = 
Flow::load(flow_id, self.flow_event_store.as_ref()).await?; + Ok(flow.into()) + } + + /// Attempts to cancel the given flow + #[tracing::instrument( + level = "debug", + skip_all, + fields(%flow_id, %by_account_id, %by_account_name) + )] + async fn cancel_flow( + &self, + flow_id: FlowID, + by_account_id: AccountID, + by_account_name: AccountName, + ) -> Result { + let mut flow = Flow::load(flow_id, self.flow_event_store.as_ref()).await?; + + if flow.can_cancel() { + flow.cancel(self.time_source.now(), by_account_id, by_account_name) + .int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + + let mut state = self.state.lock().unwrap(); + if state.time_wheel.is_flow_activation_planned(flow_id) { + state + .time_wheel + .cancel_flow_activation(flow_id) + .map_err(|e| CancelFlowError::Internal(e.int_err()))?; + } + + state.pending_flows.drop_pending_flow(&flow.flow_key); + } + + Ok(flow.into()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for FlowServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &TaskEventFinished) -> Result<(), InternalError> { + // Is this a task associated with flows? 
+ let maybe_flow_id = { + let state = self.state.lock().unwrap(); + if !state.running { + // Abort if running hasn't started yet + return Ok(()); + } + state.pending_flows.try_get_flow_id_by_task(event.task_id) + }; + + if let Some(flow_id) = maybe_flow_id { + let mut flow = Flow::load(flow_id, self.flow_event_store.as_ref()) + .await + .int_err()?; + flow.on_task_finished(self.time_source.now(), event.task_id, event.outcome) + .int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + + let finish_time = self.round_time(event.event_time)?; + + // In case of success: + // - enqueue updates of dependent datasets + if event.outcome == TaskOutcome::Success { + if let FlowKey::Dataset(flow_key) = &flow.flow_key + && flow_key.flow_type.is_dataset_update() + { + self.enqueue_dependent_dataset_flows( + finish_time, + &flow_key.dataset_id, + flow_key.flow_type, + flow.flow_id, + ) + .await?; + } + } + + { + let mut state = self.state.lock().unwrap(); + state.pending_flows.untrack_flow_by_task(event.task_id); + state.pending_flows.drop_pending_flow(&flow.flow_key); + } + + // In case of success: + // - enqueue next auto-polling flow cycle + if event.outcome == TaskOutcome::Success { + self.try_enqueue_auto_polling_flow_if_enabled(finish_time, &flow.flow_key) + .await?; + } + + // Publish progress event + self.event_bus + .dispatch_event(FlowServiceEvent::FlowFinished( + FlowServiceEventFlowFinished { + event_time: finish_time, + }, + )) + .await + .int_err()?; + + // TODO: retry logic in case of failed outcome + } + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for FlowServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &FlowConfigurationEventModified) -> Result<(), InternalError> { + if event.paused { + let maybe_pending_flow_id = { + let mut state = 
self.state.lock().unwrap(); + if !state.running { + // Abort if running hasn't started yet + return Ok(()); + }; + + state.active_configs.drop_flow_config(&event.flow_key); + let maybe_pending_flow_id = state.pending_flows.drop_pending_flow(&event.flow_key); + if let Some(flow_id) = &maybe_pending_flow_id { + state + .time_wheel + .cancel_flow_activation(*flow_id) + .int_err()?; + } + maybe_pending_flow_id + }; + + if let Some(flow_id) = maybe_pending_flow_id { + self.abort_flow(flow_id).await?; + } + + // TODO: should we abort scheduled tasks? + } else { + { + let state = self.state.lock().unwrap(); + if !state.running { + // Abort if running hasn't started yet + return Ok(()); + }; + } + + let activation_time = self.round_time(event.event_time)?; + self.activate_flow_configuration( + activation_time, + event.flow_key.clone(), + event.rule.clone(), + ) + .await? + } + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for FlowServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &DatasetEventDeleted) -> Result<(), InternalError> { + let flow_ids_2_abort = { + let mut state = self.state.lock().unwrap(); + if !state.running { + // Abort if running hasn't started yet + return Ok(()); + }; + + state.active_configs.drop_dataset_configs(&event.dataset_id); + + // For every possible dataset flow: + // - drop it from pending state + // - drop queued activations + // - collect ID of aborted flow + let mut flow_ids_2_abort: Vec<_> = Vec::new(); + for flow_type in DatasetFlowType::all() { + if let Some(flow_id) = state + .pending_flows + .drop_dataset_pending_flow(&event.dataset_id, *flow_type) + { + flow_ids_2_abort.push(flow_id); + state.time_wheel.cancel_flow_activation(flow_id).int_err()?; + } + } + flow_ids_2_abort + }; + + // Abort matched flows + for flow_id in flow_ids_2_abort { + let mut flow 
= Flow::load(flow_id, self.flow_event_store.as_ref()) + .await + .int_err()?; + flow.abort(self.time_source.now()).int_err()?; + flow.save(self.flow_event_store.as_ref()).await.int_err()?; + } + + // Not deleting task->update association, it should be safe. + // Most of the time the outcome of the task will be "Cancelled". + // Even if task squeezes to succeed in between cancellations, it's safe: + // - we will record a successful update, no consequence + // - no further updates will be attempted (schedule deactivated above) + // - no dependent tasks will be launched (dependency graph erases neighbors) + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/services/flow/flow_time_wheel.rs b/src/infra/flow-system-inmem/src/services/flow/flow_time_wheel.rs new file mode 100644 index 0000000000..efe9bc4274 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow/flow_time_wheel.rs @@ -0,0 +1,257 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::cmp::Reverse; +use std::collections::{BinaryHeap, HashMap}; + +use chrono::{DateTime, Utc}; +use kamu_core::InternalError; +use kamu_flow_system::FlowID; +use thiserror::Error; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +pub(crate) struct FlowTimeWheel { + flow_heap: BinaryHeap>, + flow_activation_times_by_id: HashMap>, +} + +// TODO: assign a score, and use it as an ordering criteria for the tasks within +// the same activation time +#[derive(PartialEq, Eq, PartialOrd, Ord)] +struct FlowRecord { + pub activation_time: DateTime, + pub flow_id: FlowID, +} + +impl FlowRecord { + fn new(activation_time: DateTime, flow_id: FlowID) -> Self { + Self { + activation_time, + flow_id, + } + } +} + +impl FlowTimeWheel { + pub fn nearest_activation_moment(&self) -> Option> { + self.flow_heap.peek().map(|ar| ar.0.activation_time) + } + + pub fn take_nearest_planned_flows(&mut self) -> Vec { + if self.flow_heap.is_empty() { + vec![] + } else { + let activation_moment = self.flow_heap.peek().unwrap().0.activation_time; + + let mut res: Vec<_> = Vec::new(); + while let Some(ar) = self.flow_heap.peek() { + if ar.0.activation_time > activation_moment { + break; + } + + if self.is_flow_activation_planned(ar.0.flow_id) { + res.push(ar.0.flow_id); + } + + self.flow_heap.pop(); + } + + self.clean_top_cancellations(); + + res + } + } + + // TODO: maybe round activation time by a reasonable interval, like a minute, so + // that scoring logic might be inserted + pub fn activate_at( + &mut self, + activation_time: DateTime, + flow_id: FlowID, + ) -> Result<(), InternalError> { + match self.flow_activation_times_by_id.get(&flow_id) { + Some(earlier_activation_time) => { + if activation_time < *earlier_activation_time { + self.unplan_flow(flow_id); + self.plan_flow(FlowRecord::new(activation_time, flow_id)); + } + Ok(()) + } + None => { + self.plan_flow(FlowRecord::new(activation_time, flow_id)); + Ok(()) + } 
+ } + } + + pub fn is_flow_activation_planned(&self, flow_id: FlowID) -> bool { + self.flow_activation_times_by_id.contains_key(&flow_id) + } + + pub fn cancel_flow_activation( + &mut self, + flow_id: FlowID, + ) -> Result<(), TimeWheelCancelActivationError> { + if self.flow_activation_times_by_id.contains_key(&flow_id) { + self.unplan_flow(flow_id); + Ok(()) + } else { + Err(TimeWheelCancelActivationError::FlowNotPlanned( + TimeWheelFlowNotPlannedError { flow_id }, + )) + } + } + + fn plan_flow(&mut self, flow_record: FlowRecord) { + self.flow_activation_times_by_id + .insert(flow_record.flow_id, flow_record.activation_time); + + self.flow_heap.push(Reverse(flow_record)); + } + + fn unplan_flow(&mut self, flow_id: FlowID) { + self.flow_activation_times_by_id.remove(&flow_id); + self.clean_top_cancellations(); + } + + fn clean_top_cancellations(&mut self) { + while let Some(ar) = self.flow_heap.peek() { + if self.is_flow_activation_planned(ar.0.flow_id) { + break; + } + + self.flow_heap.pop(); + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Error, Debug)] +pub(crate) enum TimeWheelCancelActivationError { + #[error(transparent)] + FlowNotPlanned(TimeWheelFlowNotPlannedError), +} + +#[derive(Error, Debug)] +#[error("Flow '{flow_id}' not found planned in the time wheel")] +pub(crate) struct TimeWheelFlowNotPlannedError { + flow_id: FlowID, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[cfg(test)] +mod tests { + use chrono::Duration; + + use super::*; + + const FLOW_ID_1: u64 = 115; + const FLOW_ID_2: u64 = 116; + const FLOW_ID_3: u64 = 117; + const FLOW_ID_4: u64 = 118; + const FLOW_ID_5: u64 = 119; + + #[test] + fn test_sequential_scheduling() { + let mut timewheel = FlowTimeWheel::default(); + assert!(timewheel.nearest_activation_moment().is_none()); + + let now = Utc::now(); + let moment_1 = now + Duration::seconds(10); + let moment_2 = now + 
Duration::seconds(20); + let moment_3 = now + Duration::seconds(30); + + schedule_flow(&mut timewheel, moment_1, FLOW_ID_1); + schedule_flow(&mut timewheel, moment_1, FLOW_ID_2); + schedule_flow(&mut timewheel, moment_2, FLOW_ID_3); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_4); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_5); + + check_next_time_slot(&mut timewheel, moment_1, &[FLOW_ID_1, FLOW_ID_2]); + check_next_time_slot(&mut timewheel, moment_2, &[FLOW_ID_3]); + check_next_time_slot(&mut timewheel, moment_3, &[FLOW_ID_4, FLOW_ID_5]); + } + + #[test] + fn test_random_order_scheduling() { + let mut timewheel = FlowTimeWheel::default(); + assert!(timewheel.nearest_activation_moment().is_none()); + + let now = Utc::now(); + let moment_1 = now + Duration::seconds(10); + let moment_2 = now + Duration::seconds(20); + let moment_3 = now + Duration::seconds(30); + + schedule_flow(&mut timewheel, moment_2, FLOW_ID_3); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_5); + schedule_flow(&mut timewheel, moment_1, FLOW_ID_1); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_4); + schedule_flow(&mut timewheel, moment_1, FLOW_ID_2); + + check_next_time_slot(&mut timewheel, moment_1, &[FLOW_ID_1, FLOW_ID_2]); + check_next_time_slot(&mut timewheel, moment_2, &[FLOW_ID_3]); + check_next_time_slot(&mut timewheel, moment_3, &[FLOW_ID_4, FLOW_ID_5]); + } + + #[test] + fn test_cancellations() { + let mut timewheel = FlowTimeWheel::default(); + assert!(timewheel.nearest_activation_moment().is_none()); + + let now = Utc::now(); + let moment_1 = now + Duration::seconds(10); + let moment_2 = now + Duration::seconds(20); + let moment_3 = now + Duration::seconds(30); + + schedule_flow(&mut timewheel, moment_1, FLOW_ID_1); + schedule_flow(&mut timewheel, moment_1, FLOW_ID_2); + schedule_flow(&mut timewheel, moment_2, FLOW_ID_3); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_4); + schedule_flow(&mut timewheel, moment_3, FLOW_ID_5); + + timewheel + 
.cancel_flow_activation(FlowID::new(FLOW_ID_1)) + .unwrap(); + timewheel + .cancel_flow_activation(FlowID::new(FLOW_ID_3)) + .unwrap(); + timewheel + .cancel_flow_activation(FlowID::new(FLOW_ID_5)) + .unwrap(); + + check_next_time_slot(&mut timewheel, moment_1, &[FLOW_ID_2]); + check_next_time_slot(&mut timewheel, moment_3, &[FLOW_ID_4]); + assert!(timewheel.nearest_activation_moment().is_none()); + } + + fn schedule_flow(timewheel: &mut FlowTimeWheel, moment: DateTime, flow_id: u64) { + timewheel.activate_at(moment, FlowID::new(flow_id)).unwrap(); + } + + fn check_next_time_slot( + timewheel: &mut FlowTimeWheel, + moment: DateTime, + flow_ids: &[u64], + ) { + assert_eq!(timewheel.nearest_activation_moment().unwrap(), moment); + assert_eq!( + timewheel.take_nearest_planned_flows(), + flow_ids + .iter() + .map(|id| FlowID::new(*id)) + .collect::>() + ); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/services/flow/mod.rs b/src/infra/flow-system-inmem/src/services/flow/mod.rs new file mode 100644 index 0000000000..b1cecf5b10 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow/mod.rs @@ -0,0 +1,15 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +mod active_configs_state; +mod flow_service_inmem; +mod flow_time_wheel; +mod pending_flows_state; + +pub use flow_service_inmem::*; diff --git a/src/infra/flow-system-inmem/src/services/flow/pending_flows_state.rs b/src/infra/flow-system-inmem/src/services/flow/pending_flows_state.rs new file mode 100644 index 0000000000..df53f9f014 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow/pending_flows_state.rs @@ -0,0 +1,86 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::collections::HashMap; + +use kamu_flow_system::*; +use kamu_task_system::*; +use opendatafabric::DatasetID; + +use crate::dataset_flow_key::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Default)] +pub(crate) struct PendingFlowsState { + pending_dataset_flows: HashMap, + pending_system_flows: HashMap, + pending_flows_by_tasks: HashMap, +} + +impl PendingFlowsState { + pub fn add_pending_flow(&mut self, flow_key: FlowKey, flow_id: FlowID) { + match flow_key { + FlowKey::Dataset(flow_key) => { + self.pending_dataset_flows.insert(flow_key, flow_id); + } + FlowKey::System(flow_key) => { + self.pending_system_flows + .insert(flow_key.flow_type, flow_id); + } + } + } + + pub fn track_flow_task(&mut self, flow_id: FlowID, task_id: TaskID) { + self.pending_flows_by_tasks.insert(task_id, flow_id); + } + + pub fn drop_pending_flow(&mut self, flow_key: &FlowKey) -> Option { + match flow_key { + FlowKey::Dataset(flow_key) => { + self.drop_dataset_pending_flow(&flow_key.dataset_id, flow_key.flow_type) + } + FlowKey::System(flow_key) => self.pending_system_flows.remove(&flow_key.flow_type), + } + } + + pub fn 
drop_dataset_pending_flow( + &mut self, + dataset_id: &DatasetID, + flow_type: DatasetFlowType, + ) -> Option { + self.pending_dataset_flows + .remove(BorrowedFlowKeyDataset::new(dataset_id, flow_type).as_trait()) + } + + pub fn untrack_flow_by_task(&mut self, task_id: TaskID) { + self.pending_flows_by_tasks.remove(&task_id); + } + + pub fn try_get_pending_flow(&self, flow_key: &FlowKey) -> Option { + match flow_key { + FlowKey::Dataset(flow_key) => self + .pending_dataset_flows + .get( + BorrowedFlowKeyDataset::new(&flow_key.dataset_id, flow_key.flow_type) + .as_trait(), + ) + .cloned(), + FlowKey::System(flow_key) => { + self.pending_system_flows.get(&flow_key.flow_type).cloned() + } + } + } + + pub fn try_get_flow_id_by_task(&self, task_id: TaskID) -> Option { + self.pending_flows_by_tasks.get(&task_id).cloned() + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/services/flow_configuration/flow_configuration_service_inmem.rs b/src/infra/flow-system-inmem/src/services/flow_configuration/flow_configuration_service_inmem.rs new file mode 100644 index 0000000000..cc91d8ca67 --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/flow_configuration/flow_configuration_service_inmem.rs @@ -0,0 +1,180 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::Arc; + +use chrono::{DateTime, Utc}; +use dill::*; +use event_bus::{AsyncEventHandler, EventBus}; +use futures::StreamExt; +use kamu_core::events::DatasetEventDeleted; +use kamu_core::SystemTimeSource; +use kamu_flow_system::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct FlowConfigurationServiceInMemory { + event_store: Arc, + time_source: Arc, + event_bus: Arc, +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[component(pub)] +#[interface(dyn FlowConfigurationService)] +#[interface(dyn AsyncEventHandler)] +#[scope(Singleton)] +impl FlowConfigurationServiceInMemory { + pub fn new( + event_store: Arc, + time_source: Arc, + event_bus: Arc, + ) -> Self { + Self { + event_store, + time_source, + event_bus, + } + } + + async fn publish_flow_configuration_modified( + &self, + state: &FlowConfigurationState, + request_time: DateTime, + ) -> Result<(), InternalError> { + let event = FlowConfigurationEventModified { + event_time: request_time, + flow_key: state.flow_key.clone(), + paused: !state.is_active(), + rule: state.rule.clone(), + }; + self.event_bus.dispatch_event(event).await + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl FlowConfigurationService for FlowConfigurationServiceInMemory { + /// Find current schedule, which may or may not be associated with the given + /// dataset + #[tracing::instrument(level = "info", skip_all, fields(?flow_key))] + async fn find_configuration( + &self, + flow_key: FlowKey, + ) -> Result, FindFlowConfigurationError> { + let maybe_flow_configuration = + FlowConfiguration::try_load(flow_key, self.event_store.as_ref()).await?; + Ok(maybe_flow_configuration.map(|fcfg| fcfg.into())) + } + + /// Set or modify dataset update schedule + #[tracing::instrument(level = "info", skip_all, fields(?flow_key, %paused, ?rule))] + 
async fn set_configuration( + &self, + request_time: DateTime, + flow_key: FlowKey, + paused: bool, + rule: FlowConfigurationRule, + ) -> Result { + let maybe_flow_configuration = + FlowConfiguration::try_load(flow_key.clone(), self.event_store.as_ref()).await?; + + match maybe_flow_configuration { + // Modification + Some(mut flow_configuration) => { + flow_configuration + .modify_configuration(self.time_source.now(), paused, rule) + .int_err()?; + + flow_configuration + .save(self.event_store.as_ref()) + .await + .int_err()?; + + self.publish_flow_configuration_modified(&flow_configuration, request_time) + .await?; + + Ok(flow_configuration.into()) + } + // New configuration + None => { + let mut flow_configuration = + FlowConfiguration::new(self.time_source.now(), flow_key.clone(), paused, rule); + + flow_configuration + .save(self.event_store.as_ref()) + .await + .int_err()?; + + self.publish_flow_configuration_modified(&flow_configuration, request_time) + .await?; + + Ok(flow_configuration.into()) + } + } + } + + /// Lists all enabled configurations + fn list_enabled_configurations(&self) -> FlowConfigurationStateStream { + // Note: terribly ineffecient - walks over events multiple times + Box::pin(async_stream::try_stream! 
{ + for system_flow_type in SystemFlowType::all() { + let flow_key = (*system_flow_type).into(); + let maybe_flow_configuration = FlowConfiguration::try_load(flow_key, self.event_store.as_ref()).await.int_err()?; + if let Some(flow_configuration) = maybe_flow_configuration && flow_configuration.is_active() { + yield flow_configuration.into(); + } + } + + let dataset_ids: Vec<_> = self.event_store.list_all_dataset_ids().collect().await; + for dataset_id in dataset_ids { + for dataset_flow_type in DatasetFlowType::all() { + let maybe_flow_configuration = FlowConfiguration::try_load(FlowKeyDataset::new(dataset_id.clone(), *dataset_flow_type).into(), self.event_store.as_ref()).await.int_err()?; + if let Some(flow_configuration) = maybe_flow_configuration && flow_configuration.is_active() { + yield flow_configuration.into(); + } + } + } + }) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +impl AsyncEventHandler for FlowConfigurationServiceInMemory { + #[tracing::instrument(level = "debug", skip_all, fields(?event))] + async fn handle(&self, event: &DatasetEventDeleted) -> Result<(), InternalError> { + for flow_type in DatasetFlowType::all() { + let maybe_flow_configuration = FlowConfiguration::try_load( + FlowKeyDataset::new(event.dataset_id.clone(), *flow_type).into(), + self.event_store.as_ref(), + ) + .await + .int_err()?; + + if let Some(mut flow_configuration) = maybe_flow_configuration { + flow_configuration + .notify_dataset_removed(self.time_source.now()) + .int_err()?; + + flow_configuration + .save(self.event_store.as_ref()) + .await + .int_err()?; + } + } + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/src/services/flow_configuration/mod.rs b/src/infra/flow-system-inmem/src/services/flow_configuration/mod.rs new file mode 100644 index 0000000000..ae56f54ae3 --- /dev/null +++ 
b/src/infra/flow-system-inmem/src/services/flow_configuration/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow_configuration_service_inmem; + +pub use flow_configuration_service_inmem::*; diff --git a/src/infra/flow-system-inmem/src/services/mod.rs b/src/infra/flow-system-inmem/src/services/mod.rs new file mode 100644 index 0000000000..0ba9707f2c --- /dev/null +++ b/src/infra/flow-system-inmem/src/services/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod flow; +mod flow_configuration; + +pub use flow::*; +pub use flow_configuration::*; diff --git a/src/infra/flow-system-inmem/tests/mod.rs b/src/infra/flow-system-inmem/tests/mod.rs new file mode 100644 index 0000000000..de6b0bc96f --- /dev/null +++ b/src/infra/flow-system-inmem/tests/mod.rs @@ -0,0 +1,12 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +#![feature(assert_matches)] + +mod tests; diff --git a/src/infra/flow-system-inmem/tests/tests/mod.rs b/src/infra/flow-system-inmem/tests/tests/mod.rs new file mode 100644 index 0000000000..5f120fa8b7 --- /dev/null +++ b/src/infra/flow-system-inmem/tests/tests/mod.rs @@ -0,0 +1,14 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod test_flow_configuration_service_inmem; + +// TODO: Windows scheduling is still unstable, need to debug it later +#[cfg(not(windows))] +mod test_flow_service_inmem; diff --git a/src/infra/flow-system-inmem/tests/tests/test_flow_configuration_service_inmem.rs b/src/infra/flow-system-inmem/tests/tests/test_flow_configuration_service_inmem.rs new file mode 100644 index 0000000000..7f93d5d3b3 --- /dev/null +++ b/src/infra/flow-system-inmem/tests/tests/test_flow_configuration_service_inmem.rs @@ -0,0 +1,459 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::assert_matches::assert_matches; +use std::collections::HashMap; +use std::sync::Arc; + +use chrono::{Duration, Utc}; +use event_bus::EventBus; +use futures::TryStreamExt; +use kamu::testing::MetadataFactory; +use kamu::*; +use kamu_core::*; +use kamu_flow_system::*; +use kamu_flow_system_inmem::*; +use opendatafabric::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_visibility() { + let harness = FlowConfigurationHarness::new(); + assert!(harness.list_enabled_configurations().await.is_empty()); + + let gc_schedule: Schedule = Duration::minutes(30).into(); + harness + .set_system_flow_schedule(SystemFlowType::GC, gc_schedule.clone()) + .await; + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id = harness.create_root_dataset("bar").await; + + let foo_ingest_schedule: Schedule = Duration::days(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Ingest, + foo_ingest_schedule.clone(), + ) + .await; + + let foo_compaction_schedule: Schedule = Duration::weeks(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Compaction, + foo_compaction_schedule.clone(), + ) + .await; + + let bar_ingest_schedule: Schedule = Duration::hours(3).into(); + harness + .set_dataset_flow_schedule( + bar_id.clone(), + DatasetFlowType::Ingest, + bar_ingest_schedule.clone(), + ) + .await; + + let configs = harness.list_enabled_configurations().await; + assert_eq!(4, configs.len()); + + harness.expect_system_flow_schedule(&configs, SystemFlowType::GC, &gc_schedule); + + for (dataset_id, dataset_flow_type, schedule) in [ + ( + foo_id.clone(), + DatasetFlowType::Ingest, + &foo_ingest_schedule, + ), + ( + foo_id, + DatasetFlowType::Compaction, + &foo_compaction_schedule, + ), + (bar_id, DatasetFlowType::Ingest, &bar_ingest_schedule), + ] { + harness.expect_dataset_flow_schedule(&configs, dataset_id, 
dataset_flow_type, &schedule); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_pause_resume() { + let harness = FlowConfigurationHarness::new(); + assert!(harness.list_enabled_configurations().await.is_empty()); + + // Make a dataset and configure daily ingestion schedule + let foo_id = harness.create_root_dataset("foo").await; + let foo_ingest_schedule: Schedule = Duration::days(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Ingest, + foo_ingest_schedule.clone(), + ) + .await; + + // It should be visible in the list of enabled configs + let configs = harness.list_enabled_configurations().await; + assert_eq!(1, configs.len()); + harness.expect_dataset_flow_schedule( + &configs, + foo_id.clone(), + DatasetFlowType::Ingest, + &foo_ingest_schedule, + ); + + // Now, pause this flow configuration + harness + .pause_dataset_flow(foo_id.clone(), DatasetFlowType::Ingest) + .await; + + // It should disappear from the list of enabled configs + let configs = harness.list_enabled_configurations().await; + assert_eq!(0, configs.len()); + + // Still we should see it's state as paused in the repository directly + let flow_config_state = harness + .get_dataset_flow_config_from_store(foo_id.clone(), DatasetFlowType::Ingest) + .await; + assert_eq!( + flow_config_state.status, + FlowConfigurationStatus::PausedTemporarily + ); + assert_eq!( + flow_config_state.rule, + FlowConfigurationRule::Schedule(foo_ingest_schedule.clone()) + ); + + // Now, resume the configuraton + harness + .resume_dataset_flow(foo_id.clone(), DatasetFlowType::Ingest) + .await; + + // It should be visible in the list of active configs again + let configs = harness.list_enabled_configurations().await; + assert_eq!(1, configs.len()); + harness.expect_dataset_flow_schedule( + &configs, + foo_id, + DatasetFlowType::Ingest, + &foo_ingest_schedule, + ); +} + 
+///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_modify() { + let harness = FlowConfigurationHarness::new(); + assert!(harness.list_enabled_configurations().await.is_empty()); + + // Make a dataset and configure daily ingestion schedule + let foo_id = harness.create_root_dataset("foo").await; + let foo_ingest_schedule: Schedule = Duration::days(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Ingest, + foo_ingest_schedule.clone(), + ) + .await; + + // It should be visible in the list of enabled configs + let configs = harness.list_enabled_configurations().await; + assert_eq!(1, configs.len()); + harness.expect_dataset_flow_schedule( + &configs, + foo_id.clone(), + DatasetFlowType::Ingest, + &foo_ingest_schedule, + ); + + // Now make the schedule weekly + let foo_ingest_schedule_2: Schedule = Duration::weeks(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Ingest, + foo_ingest_schedule_2.clone(), + ) + .await; + + // Observe the updated config + let configs = harness.list_enabled_configurations().await; + assert_eq!(1, configs.len()); + harness.expect_dataset_flow_schedule( + &configs, + foo_id.clone(), + DatasetFlowType::Ingest, + &foo_ingest_schedule_2, + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_dataset_deleted() { + let harness = FlowConfigurationHarness::new(); + assert!(harness.list_enabled_configurations().await.is_empty()); + + // Make a dataset and configure daily ingestion schedule + let foo_id = harness.create_root_dataset("foo").await; + let foo_ingest_schedule: Schedule = Duration::days(1).into(); + harness + .set_dataset_flow_schedule( + foo_id.clone(), + DatasetFlowType::Ingest, + foo_ingest_schedule.clone(), + ) + .await; + + // It should be visible in the list of enabled configs 
+ let configs = harness.list_enabled_configurations().await; + assert_eq!(1, configs.len()); + harness.expect_dataset_flow_schedule( + &configs, + foo_id.clone(), + DatasetFlowType::Ingest, + &foo_ingest_schedule, + ); + + // Now, delete the dataset + harness.delete_dataset(&foo_id).await; + + // The dataset should not be visible in the list of enabled configs + let configs = harness.list_enabled_configurations().await; + assert_eq!(0, configs.len()); + + // Still we should see it's state as permanently stopped in the repository + let flow_config_state = harness + .get_dataset_flow_config_from_store(foo_id, DatasetFlowType::Ingest) + .await; + assert_eq!( + flow_config_state.status, + FlowConfigurationStatus::StoppedPermanently + ); + assert_eq!( + flow_config_state.rule, + FlowConfigurationRule::Schedule(foo_ingest_schedule) + ); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +struct FlowConfigurationHarness { + _tmp_dir: tempfile::TempDir, + catalog: dill::Catalog, + dataset_repo: Arc, + flow_configuration_service: Arc, + flow_configuration_event_store: Arc, +} + +impl FlowConfigurationHarness { + fn new() -> Self { + let tmp_dir = tempfile::tempdir().unwrap(); + let datasets_dir = tmp_dir.path().join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + use dill::Component; + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .add::() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(datasets_dir) + .with_multi_tenant(false), + ) + .bind::() + .add_value(CurrentAccountSubject::new_test()) + .add::() + .add::() + .build(); + + let flow_configuration_service = catalog.get_one::().unwrap(); + let flow_configuration_event_store = catalog + .get_one::() + .unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); + + Self { + _tmp_dir: tmp_dir, + catalog, + flow_configuration_service, + flow_configuration_event_store, + dataset_repo, + } + } + + async fn 
list_enabled_configurations(&self) -> HashMap { + let active_configs: Vec<_> = self + .flow_configuration_service + .list_enabled_configurations() + .try_collect::>() + .await + .unwrap(); + + let mut res = HashMap::new(); + for active_config in active_configs { + res.insert(active_config.flow_key.clone(), active_config); + } + res + } + + async fn set_system_flow_schedule(&self, system_flow_type: SystemFlowType, schedule: Schedule) { + self.flow_configuration_service + .set_configuration( + Utc::now(), + system_flow_type.into(), + false, + FlowConfigurationRule::Schedule(schedule), + ) + .await + .unwrap(); + } + + async fn set_dataset_flow_schedule( + &self, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + schedule: Schedule, + ) { + self.flow_configuration_service + .set_configuration( + Utc::now(), + FlowKeyDataset::new(dataset_id, dataset_flow_type).into(), + false, + FlowConfigurationRule::Schedule(schedule), + ) + .await + .unwrap(); + } + + fn expect_system_flow_schedule( + &self, + enabled_configurations: &HashMap, + system_flow_type: SystemFlowType, + expected_schedule: &Schedule, + ) { + assert_matches!( + enabled_configurations.get(&(system_flow_type.into())), + Some(FlowConfigurationState { + flow_key: _, + status: FlowConfigurationStatus::Active, + rule: FlowConfigurationRule::Schedule(actual_schedule) + }) if actual_schedule == expected_schedule + ); + } + + fn expect_dataset_flow_schedule( + &self, + enabled_configurations: &HashMap, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + expected_schedule: &Schedule, + ) { + assert_matches!( + enabled_configurations.get(&(FlowKeyDataset::new(dataset_id, dataset_flow_type).into())), + Some(FlowConfigurationState { + flow_key: _, + status: FlowConfigurationStatus::Active, + rule: FlowConfigurationRule::Schedule(actual_schedule) + }) if actual_schedule == expected_schedule + ); + } + + async fn pause_dataset_flow(&self, dataset_id: DatasetID, dataset_flow_type: 
DatasetFlowType) { + let flow_key: FlowKey = FlowKeyDataset::new(dataset_id, dataset_flow_type).into(); + let current_config = self + .flow_configuration_service + .find_configuration(flow_key.clone()) + .await + .unwrap() + .unwrap(); + + self.flow_configuration_service + .set_configuration(Utc::now(), flow_key, true, current_config.rule) + .await + .unwrap(); + } + + async fn resume_dataset_flow(&self, dataset_id: DatasetID, dataset_flow_type: DatasetFlowType) { + let flow_key: FlowKey = FlowKeyDataset::new(dataset_id, dataset_flow_type).into(); + let current_config = self + .flow_configuration_service + .find_configuration(flow_key.clone()) + .await + .unwrap() + .unwrap(); + + self.flow_configuration_service + .set_configuration(Utc::now(), flow_key, false, current_config.rule) + .await + .unwrap(); + } + + async fn get_dataset_flow_config_from_store( + &self, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + ) -> FlowConfigurationState { + let flow_key: FlowKey = FlowKeyDataset::new(dataset_id, dataset_flow_type).into(); + let flow_configuration = + FlowConfiguration::load(flow_key, self.flow_configuration_event_store.as_ref()) + .await + .unwrap(); + flow_configuration.into() + } + + async fn create_root_dataset(&self, dataset_name: &str) -> DatasetID { + let result = self + .dataset_repo + .create_dataset_from_snapshot( + None, + MetadataFactory::dataset_snapshot() + .name(DatasetName::new_unchecked(dataset_name)) + .kind(DatasetKind::Root) + .push_event(MetadataFactory::set_polling_source().build()) + .build(), + ) + .await + .unwrap(); + + result.dataset_handle.id + } + + async fn delete_dataset(&self, dataset_id: &DatasetID) { + // Eagerly push dependency graph initialization before deletes. 
+ // It's ignored, if requested 2nd time + let dependency_graph_service = self + .catalog + .get_one::() + .unwrap(); + let dependency_graph_repository = + DependencyGraphRepositoryInMemory::new(self.dataset_repo.clone()); + dependency_graph_service + .eager_initialization(&dependency_graph_repository) + .await + .unwrap(); + + // Do the actual deletion + self.dataset_repo + .delete_dataset(&(dataset_id.as_local_ref())) + .await + .unwrap(); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/flow-system-inmem/tests/tests/test_flow_service_inmem.rs b/src/infra/flow-system-inmem/tests/tests/test_flow_service_inmem.rs new file mode 100644 index 0000000000..37df488f65 --- /dev/null +++ b/src/infra/flow-system-inmem/tests/tests/test_flow_service_inmem.rs @@ -0,0 +1,1183 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +use chrono::{DateTime, Duration, DurationRound, Utc}; +use dill::*; +use event_bus::{AsyncEventHandler, EventBus}; +use kamu::testing::MetadataFactory; +use kamu::*; +use kamu_core::*; +use kamu_flow_system::*; +use kamu_flow_system_inmem::*; +use kamu_task_system::*; +use kamu_task_system_inmem::{TaskSchedulerInMemory, TaskSystemEventStoreInMemory}; +use opendatafabric::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_read_initial_config_and_queue_properly() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + harness + .set_dataset_flow_schedule( + Utc::now(), + foo_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(60).into(), + ) + .await; + + let bar_id = harness.create_root_dataset("bar").await; + harness + .set_dataset_flow_schedule( + Utc::now(), + bar_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(90).into(), + ) + .await; + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! 
{ + res = harness.flow_service.run(start_time) => res.int_err(), + _ = tokio::time::sleep(std::time::Duration::from_millis(120)) => Ok(()), + } + .unwrap(); + + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + let foo_flow_key = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(3, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let foo_moment = state.snapshots[1].0; + let bar_moment = state.snapshots[2].0; + + assert_eq!(start_time, start_moment); + assert_eq!((foo_moment - start_moment), Duration::milliseconds(60)); // planned time for "foo" + assert_eq!((bar_moment - start_moment), Duration::milliseconds(90)); // planned time for "bar" + + assert_flow_test_checks(&[ + // Snapshot 0: after initial queueing + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 1: period passed for 'foo', but not yet for 'bar' + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + (&bar_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 2: period passed for 'foo' and for 'bar' + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_manual_trigger() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id = harness.create_root_dataset("bar").await; + + // Note: only "foo" has auto-schedule, "bar" hasn't + harness + .set_dataset_flow_schedule( + 
Utc::now(), + foo_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(60).into(), + ) + .await; + + let foo_flow_key: FlowKey = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key: FlowKey = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! { + res = harness.flow_service.run(start_time) => res.int_err(), + _ = async { + // Sleep < "foo" period + tokio::time::sleep(std::time::Duration::from_millis(40)).await; + let new_time = start_time + Duration::milliseconds(40); + harness.trigger_manual_flow(new_time, foo_flow_key.clone()).await; // "foo" pending already + harness.trigger_manual_flow(new_time, bar_flow_key.clone()).await; // "bar" not queued, starts soon + + // Wake up after foo scheduling + tokio::time::sleep(std::time::Duration::from_millis(20)).await; + let new_time = new_time + Duration::milliseconds(20); + harness.trigger_manual_flow(new_time, foo_flow_key.clone()).await; // "foo" pending already, even running + harness.trigger_manual_flow(new_time, bar_flow_key.clone()).await; // "bar" pending already, event running + + // Make sure nothing got scheduled in near time + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + + } => Ok(()), + } + .unwrap(); + + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(3, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let bar_moment = state.snapshots[1].0; + let foo_moment = state.snapshots[2].0; + + assert_eq!(start_moment, start_time); + assert_eq!((bar_moment - start_moment), Duration::milliseconds(40)); // next slot after 40ms trigger with 10ms align + assert_eq!((foo_moment - start_moment), 
Duration::milliseconds(60)); // 60ms as planned + + assert_flow_test_checks(&[ + // Snapshot 0: after initial queueing, no "bar", only "foo" + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![(&foo_flow_key, FlowStatus::Queued, None)], + }, + // Snapshot 1: "bar" had manual trigger + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 2: period passed for 'foo' + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_dataset_flow_configuration_paused_resumed_modified() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id: DatasetID = harness.create_root_dataset("bar").await; + harness + .set_dataset_flow_schedule( + Utc::now(), + foo_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(50).into(), + ) + .await; + harness + .set_dataset_flow_schedule( + Utc::now(), + bar_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(40).into(), + ) + .await; + + let foo_flow_key: FlowKey = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key: FlowKey = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! 
{ + res = harness.flow_service.run(start_time) => res.int_err(), + _ = async { + // Sleep < "foo"/"bar" period + tokio::time::sleep(std::time::Duration::from_millis(25)).await; + harness.pause_dataset_flow(start_time + Duration::milliseconds(25), foo_id.clone(), DatasetFlowType::Ingest).await; + harness.pause_dataset_flow(start_time + Duration::milliseconds(25), bar_id.clone(), DatasetFlowType::Ingest).await; + test_flow_listener + .snapshot_flows(start_time + Duration::milliseconds(25)) + .await; + + // Wake up after initially planned "bar" and "foo" scheduling + tokio::time::sleep(std::time::Duration::from_millis(30)).await; + harness.resume_dataset_flow(start_time + Duration::milliseconds(55), foo_id.clone(), DatasetFlowType::Ingest).await; + harness.set_dataset_flow_schedule(start_time + Duration::milliseconds(55), bar_id.clone(), DatasetFlowType::Ingest, Duration::milliseconds(30).into()).await; + + test_flow_listener + .snapshot_flows(start_time + Duration::milliseconds(55)) + .await; + + // "foo" will get rescheduled in 50 ms, "bar" in 30ms, leave extra for stabilization + tokio::time::sleep(std::time::Duration::from_millis(70)).await; + + } => Ok(()), + } + .unwrap(); + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(5, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let pause_moment = state.snapshots[1].0; + let resume_moment = state.snapshots[2].0; + let bar_sch_moment = state.snapshots[3].0; + let foo_sch_moment = state.snapshots[4].0; + + assert_eq!(start_moment, start_time); + assert_eq!((pause_moment - start_moment), Duration::milliseconds(25)); + assert_eq!((resume_moment - start_moment), Duration::milliseconds(55)); + assert_eq!((bar_sch_moment - start_moment), Duration::milliseconds(90)); + assert_eq!((foo_sch_moment - start_moment), Duration::milliseconds(110)); + + assert_flow_test_checks(&[ + // Snapshot 0: after initial queueing + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![ + 
(&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 1: "foo" paused, "bar" paused + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![ + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ], + }, + // Snapshot 2: "foo" resumed, "bar" resumed + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Queued, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ], + }, + // Snapshot 3: "bar" scheduled + FlowTestCheck { + snapshot: &state.snapshots[3].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ], + }, + // Snapshot 4: "foo" scheduled + FlowTestCheck { + snapshot: &state.snapshots[4].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_dataset_deleted() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id = harness.create_root_dataset("bar").await; + + harness + .set_dataset_flow_schedule( + Utc::now(), + foo_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(50).into(), + ) + .await; + harness + 
.set_dataset_flow_schedule( + Utc::now(), + bar_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(70).into(), + ) + .await; + + let foo_flow_key: FlowKey = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key: FlowKey = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Flow listener will collect snapshots at important moments of time + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! { + res = harness.flow_service.run(start_time) => res.int_err(), + _ = async { + // Sleep < "foo" period + tokio::time::sleep(std::time::Duration::from_millis(25)).await; + harness.delete_dataset(&foo_id).await; + test_flow_listener + .snapshot_flows(start_time + Duration::milliseconds(25)) + .await; + + // Wake up after bar scheduling + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + harness.delete_dataset(&bar_id).await; + test_flow_listener + .snapshot_flows(start_time + Duration::milliseconds(75)) + .await; + + // Make sure nothing got scheduled in near time + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + + } => Ok(()), + } + .unwrap(); + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(4, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let foo_del_moment = state.snapshots[1].0; + let bar_sch_moment = state.snapshots[2].0; + let bar_del_moment = state.snapshots[3].0; + + assert_eq!(start_moment, start_time); + assert_eq!((foo_del_moment - start_moment), Duration::milliseconds(25)); + assert_eq!((bar_sch_moment - start_moment), Duration::milliseconds(70)); + assert_eq!((bar_del_moment - start_moment), Duration::milliseconds(75)); + + assert_flow_test_checks(&[ + // Snapshot 0: after initial 
queueing + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 1: "foo" delete moment + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![ + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + (&bar_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 2: period passed for 'bar' + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + (&bar_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 3: "bar" delete moment + FlowTestCheck { + snapshot: &state.snapshots[3].1, + patterns: vec![ + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Aborted), + ), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_task_completions_trigger_next_loop_on_success() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id = harness.create_root_dataset("bar").await; + let baz_id = harness.create_root_dataset("baz").await; + + for dataset_id in [&foo_id, &bar_id, &baz_id] { + harness + .set_dataset_flow_schedule( + Utc::now(), + dataset_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(40).into(), + ) + .await; + } + + // Enforce dependency graph initialization + harness.eager_dependencies_graph_init().await; + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Flow listener will collect snapshots at important moments of time + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + // Obtain access to event bus + let event_bus = 
harness.catalog.get_one::().unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! { + res = harness.flow_service.run(start_time) => res.int_err(), + _ = async { + // Each of 3 datasets should be scheduled after this time + let mut next_time = start_time + Duration::milliseconds(60); + tokio::time::sleep(std::time::Duration::from_millis(60)).await; + + // Plan different task execution outcomes for each dataset + let mut planned_outcomes = HashMap::new(); + planned_outcomes.insert(&foo_id, TaskOutcome::Success); + planned_outcomes.insert(&bar_id, TaskOutcome::Failed); + planned_outcomes.insert(&baz_id, TaskOutcome::Cancelled); + + // Determine which task state belongs to which dataset + let scheduled_tasks = harness.take_scheduled_tasks().await; + let mut scheduled_tasks_by_dataset_id = HashMap::new(); + for scheduled_task in scheduled_tasks { + let task_dataset_id = match &scheduled_task.logical_plan { + LogicalPlan::UpdateDataset(lp) => lp.dataset_id.clone(), + _ => unreachable!() + }; + scheduled_tasks_by_dataset_id.insert(task_dataset_id, scheduled_task); + }; + + // Send task finished event for each dataset with certain interval + for dataset_id in [&foo_id, &bar_id, &baz_id] { + let dataset_task = scheduled_tasks_by_dataset_id.get(dataset_id).unwrap(); + event_bus.dispatch_event(TaskEventFinished { + event_time: next_time, + task_id: dataset_task.task_id, + outcome: *(planned_outcomes.get(dataset_id).unwrap()) + }).await.unwrap(); + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + next_time += Duration::milliseconds(10); + } + + // Let the succeeded dataset to schedule another update. 
40ms + 20ms max waiting + tokio::time::sleep(std::time::Duration::from_millis(70)).await; + + } => Ok(()), + } + .unwrap(); + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(6, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let schedule_moment = state.snapshots[1].0; + let finish_foo_moment = state.snapshots[2].0; + let finish_bar_moment = state.snapshots[3].0; + let finish_baz_moment = state.snapshots[4].0; + let reschedule_foo_moment = state.snapshots[5].0; + + assert_eq!(start_moment, start_time); + assert_eq!((schedule_moment - start_moment), Duration::milliseconds(40)); + assert_eq!( + (finish_foo_moment - start_moment), + Duration::milliseconds(60) + ); + assert_eq!( + (finish_bar_moment - start_moment), + Duration::milliseconds(70) + ); + assert_eq!( + (finish_baz_moment - start_moment), + Duration::milliseconds(80) + ); + assert_eq!( + (reschedule_foo_moment - start_moment), + Duration::milliseconds(100) + ); + + let foo_flow_key: FlowKey = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key: FlowKey = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + let baz_flow_key: FlowKey = FlowKeyDataset::new(baz_id.clone(), DatasetFlowType::Ingest).into(); + + assert_flow_test_checks(&[ + // Snapshot 0: after initial queueing + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + (&bar_flow_key, FlowStatus::Queued, None), + (&baz_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 1: all 3 are scheduled + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + (&bar_flow_key, FlowStatus::Scheduled, None), + (&baz_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 2: "foo" finished, enqueued for round 2 + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + ( +
&foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + (&bar_flow_key, FlowStatus::Scheduled, None), + (&baz_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 3: "bar" finished with Fail + FlowTestCheck { + snapshot: &state.snapshots[3].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Failed), + ), + (&baz_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 4: "baz" finished with Cancel + FlowTestCheck { + snapshot: &state.snapshots[4].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Failed), + ), + ( + &baz_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Cancelled), + ), + ], + }, + // Snapshot 5: "foo" scheduled, round 2 + FlowTestCheck { + snapshot: &state.snapshots[5].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + ( + &bar_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Failed), + ), + ( + &baz_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Cancelled), + ), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_update_success_triggers_update_of_derived_datasets() { + let harness = FlowHarness::new(); + + let foo_id = harness.create_root_dataset("foo").await; + let bar_id = harness + .create_derived_dataset("bar", vec![foo_id.clone()]) + .await; + let baz_id = harness + .create_derived_dataset("baz", vec![foo_id.clone()]) + .await; + + harness + .set_dataset_flow_schedule( + Utc::now(), + foo_id.clone(), + DatasetFlowType::Ingest, + Duration::milliseconds(30).into(), + ) + 
.await; + + for dataset_id in [&bar_id, &baz_id] { + harness + .set_dataset_flow_start_condition( + Utc::now(), + dataset_id.clone(), + DatasetFlowType::Ingest, + StartConditionConfiguration { + throttling_period: None, + minimal_data_batch: None, + }, + ) + .await; + } + + // Enforce dependency graph initialization + harness.eager_dependencies_graph_init().await; + + // Remember start time + let start_time = Utc::now() + .duration_round(Duration::milliseconds(SCHEDULING_ALIGNMENT_MS)) + .unwrap(); + + // Flow listener will collect snapshots at important moments of time + let test_flow_listener = harness.catalog.get_one::().unwrap(); + + // Obtain access to event bus + let event_bus = harness.catalog.get_one::().unwrap(); + + // Run scheduler concurrently with manual triggers script + let _ = tokio::select! { + res = harness.flow_service.run(start_time) => res.int_err(), + _ = async { + // "foo" is definitely scheduled now + let next_time = start_time + Duration::milliseconds(50); + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + // Extract dataset tasks + let scheduled_tasks = harness.take_scheduled_tasks().await; + assert_eq!(1, scheduled_tasks.len()); + let task_dataset_id = match &scheduled_tasks[0].logical_plan { + LogicalPlan::UpdateDataset(lp) => lp.dataset_id.clone(), + _ => unreachable!() + }; + assert_eq!(task_dataset_id, foo_id); + + // Send finished for this task + event_bus.dispatch_event(TaskEventFinished { + event_time: next_time, + task_id: scheduled_tasks[0].task_id, + outcome: TaskOutcome::Success, + }).await.unwrap(); + + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + } => Ok(()) + }; + + let state = test_flow_listener.state.lock().unwrap(); + assert_eq!(5, state.snapshots.len()); + + let start_moment = state.snapshots[0].0; + let schedule_foo_moment = state.snapshots[1].0; + let finish_foo_moment = state.snapshots[2].0; + let schedule_bar_baz_moment = state.snapshots[3].0; + let reschedule_foo_moment =
state.snapshots[4].0; + + assert_eq!(start_moment, start_time); + assert_eq!( + (schedule_foo_moment - start_moment), + Duration::milliseconds(30) + ); + assert_eq!( + (finish_foo_moment - start_moment), + Duration::milliseconds(50) + ); + assert_eq!( + (schedule_bar_baz_moment - start_moment), + Duration::milliseconds(50) + ); + assert_eq!( + (reschedule_foo_moment - start_moment), + Duration::milliseconds(80) + ); + + let foo_flow_key: FlowKey = FlowKeyDataset::new(foo_id.clone(), DatasetFlowType::Ingest).into(); + let bar_flow_key: FlowKey = FlowKeyDataset::new(bar_id.clone(), DatasetFlowType::Ingest).into(); + let baz_flow_key: FlowKey = FlowKeyDataset::new(baz_id.clone(), DatasetFlowType::Ingest).into(); + + assert_flow_test_checks(&[ + // Snapshot 0: after initial queueing + FlowTestCheck { + snapshot: &state.snapshots[0].1, + patterns: vec![(&foo_flow_key, FlowStatus::Queued, None)], + }, + // Snapshot 1: "foo" scheduled + FlowTestCheck { + snapshot: &state.snapshots[1].1, + patterns: vec![(&foo_flow_key, FlowStatus::Scheduled, None)], + }, + // Snapshot 2: "foo" finished + FlowTestCheck { + snapshot: &state.snapshots[2].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + (&bar_flow_key, FlowStatus::Queued, None), + (&baz_flow_key, FlowStatus::Queued, None), + ], + }, + // Snapshot 3: "bar" & "baz" scheduled + FlowTestCheck { + snapshot: &state.snapshots[3].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Queued, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + (&bar_flow_key, FlowStatus::Scheduled, None), + (&baz_flow_key, FlowStatus::Scheduled, None), + ], + }, + // Snapshot 4: "foo" rescheduled + FlowTestCheck { + snapshot: &state.snapshots[4].1, + patterns: vec![ + (&foo_flow_key, FlowStatus::Scheduled, None), + ( + &foo_flow_key, + FlowStatus::Finished, + Some(FlowOutcome::Success), + ), + (&bar_flow_key, 
FlowStatus::Scheduled, None), + (&baz_flow_key, FlowStatus::Scheduled, None), + ], + }, + ]); +} + +///////////////////////////////////////////////////////////////////////////////////////// + +// TODO next: +// - derived more than 1 level +// - throttling derived +// - cancelling queued/scheduled flow (at flow level, not at task level) + +///////////////////////////////////////////////////////////////////////////////////////// + +struct FlowTestCheck<'a> { + snapshot: &'a HashMap>, + patterns: Vec<(&'a FlowKey, FlowStatus, Option)>, +} + +fn assert_flow_test_checks<'a>(flow_test_checks: &[FlowTestCheck<'a>]) { + for test_check in flow_test_checks { + let mut pattern_idx_per_key = HashMap::new(); + + let snapshot_total_flows: usize = test_check.snapshot.values().map(|v| v.len()).sum(); + assert_eq!(snapshot_total_flows, test_check.patterns.len()); + + for pattern in test_check.patterns.iter() { + let flow_states = test_check.snapshot.get(pattern.0).unwrap(); + + let index = if let Some(index) = pattern_idx_per_key.get_mut(pattern.0) { + *index += 1; + *index + } else { + pattern_idx_per_key.insert(pattern.0, 0); + 0 + }; + + let flow_state = flow_states.get(index).unwrap(); + + assert_eq!(flow_state.status(), pattern.1); + assert_eq!(flow_state.outcome, pattern.2); + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +struct TestFlowSystemListener { + flow_service: Arc, + state: Arc>, +} + +#[derive(Default)] +struct TestFlowSystemListenerState { + snapshots: Vec<(DateTime, HashMap>)>, +} + +#[component(pub)] +#[scope(Singleton)] +#[interface(dyn AsyncEventHandler)] +impl TestFlowSystemListener { + fn new(flow_service: Arc) -> Self { + Self { + flow_service, + state: Arc::new(Mutex::new(TestFlowSystemListenerState::default())), + } + } + + async fn snapshot_flows(&self, event_time: DateTime) { + use futures::TryStreamExt; + let flows: Vec<_> = self + .flow_service + .list_all_flows() + .unwrap() + .try_collect() + 
.await + .unwrap(); + + let mut flow_states_map: HashMap> = HashMap::new(); + for flow in flows { + flow_states_map + .entry(flow.flow_key.clone()) + .and_modify(|flows| flows.push(flow.clone())) + .or_insert(vec![flow]); + } + + let mut state = self.state.lock().unwrap(); + state.snapshots.push((event_time, flow_states_map)); + } +} + +#[async_trait::async_trait] +impl AsyncEventHandler for TestFlowSystemListener { + async fn handle(&self, event: &FlowServiceEvent) -> Result<(), InternalError> { + self.snapshot_flows(event.event_time()).await; + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +const SCHEDULING_ALIGNMENT_MS: i64 = 10; + +///////////////////////////////////////////////////////////////////////////////////////// + +struct FlowHarness { + _tmp_dir: tempfile::TempDir, + catalog: dill::Catalog, + dataset_repo: Arc, + flow_configuration_service: Arc, + flow_service: Arc, + task_scheduler: Arc, +} + +impl FlowHarness { + fn new() -> Self { + let tmp_dir = tempfile::tempdir().unwrap(); + let datasets_dir = tmp_dir.path().join("datasets"); + std::fs::create_dir(&datasets_dir).unwrap(); + + let catalog = dill::CatalogBuilder::new() + .add::() + .add_value(FlowServiceRunConfig::new(Duration::milliseconds( + SCHEDULING_ALIGNMENT_MS, + ))) + .add::() + .add::() + .add::() + .add::() + .add::() + .add_builder( + DatasetRepositoryLocalFs::builder() + .with_root(datasets_dir) + .with_multi_tenant(false), + ) + .bind::() + .add_value(CurrentAccountSubject::new_test()) + .add::() + .add::() + .add::() + .add::() + .add::() + .build(); + + let flow_service = catalog.get_one::().unwrap(); + let flow_configuration_service = catalog.get_one::().unwrap(); + let dataset_repo = catalog.get_one::().unwrap(); + let task_scheduler = catalog.get_one::().unwrap(); + + Self { + _tmp_dir: tmp_dir, + catalog, + flow_service, + flow_configuration_service, + dataset_repo, + task_scheduler, + } + } + + async fn 
create_root_dataset(&self, dataset_name: &str) -> DatasetID { + let result = self + .dataset_repo + .create_dataset_from_snapshot( + None, + MetadataFactory::dataset_snapshot() + .name(DatasetName::new_unchecked(dataset_name)) + .kind(DatasetKind::Root) + .push_event(MetadataFactory::set_polling_source().build()) + .build(), + ) + .await + .unwrap(); + + result.dataset_handle.id + } + + async fn create_derived_dataset( + &self, + dataset_name: &str, + input_ids: Vec, + ) -> DatasetID { + let mut input_aliases = Vec::new(); + for input_id in input_ids { + let input_hdl = self + .dataset_repo + .resolve_dataset_ref(&input_id.as_local_ref()) + .await + .unwrap(); + input_aliases.push(input_hdl.alias); + } + + let create_result = self + .dataset_repo + .create_dataset_from_snapshot( + None, + MetadataFactory::dataset_snapshot() + .name(DatasetName::new_unchecked(dataset_name)) + .kind(DatasetKind::Derivative) + .push_event(MetadataFactory::set_transform_aliases(input_aliases).build()) + .build(), + ) + .await + .unwrap(); + create_result.dataset_handle.id + } + + async fn eager_dependencies_graph_init(&self) { + let dependency_graph_service = self + .catalog + .get_one::() + .unwrap(); + let dependency_graph_repository = + DependencyGraphRepositoryInMemory::new(self.dataset_repo.clone()); + dependency_graph_service + .eager_initialization(&dependency_graph_repository) + .await + .unwrap(); + } + + async fn delete_dataset(&self, dataset_id: &DatasetID) { + // Eagerly push dependency graph initialization before deletes. 
+ // It's ignored, if requested 2nd time + self.eager_dependencies_graph_init().await; + + // Do the actual deletion + self.dataset_repo + .delete_dataset(&(dataset_id.as_local_ref())) + .await + .unwrap(); + } + + async fn set_dataset_flow_schedule( + &self, + request_time: DateTime, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + schedule: Schedule, + ) { + self.flow_configuration_service + .set_configuration( + request_time, + FlowKeyDataset::new(dataset_id, dataset_flow_type).into(), + false, + FlowConfigurationRule::Schedule(schedule), + ) + .await + .unwrap(); + } + + async fn set_dataset_flow_start_condition( + &self, + request_time: DateTime, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + start_condition: StartConditionConfiguration, + ) { + self.flow_configuration_service + .set_configuration( + request_time, + FlowKeyDataset::new(dataset_id, dataset_flow_type).into(), + false, + FlowConfigurationRule::StartCondition(start_condition), + ) + .await + .unwrap(); + } + + async fn pause_dataset_flow( + &self, + request_time: DateTime, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + ) { + let flow_key: FlowKey = FlowKeyDataset::new(dataset_id, dataset_flow_type).into(); + let current_config = self + .flow_configuration_service + .find_configuration(flow_key.clone()) + .await + .unwrap() + .unwrap(); + + self.flow_configuration_service + .set_configuration(request_time, flow_key, true, current_config.rule) + .await + .unwrap(); + } + + async fn resume_dataset_flow( + &self, + request_time: DateTime, + dataset_id: DatasetID, + dataset_flow_type: DatasetFlowType, + ) { + let flow_key: FlowKey = FlowKeyDataset::new(dataset_id, dataset_flow_type).into(); + let current_config = self + .flow_configuration_service + .find_configuration(flow_key.clone()) + .await + .unwrap() + .unwrap(); + + self.flow_configuration_service + .set_configuration(request_time, flow_key, false, current_config.rule) + .await + .unwrap(); + 
} + + async fn trigger_manual_flow(&self, trigger_time: DateTime, flow_key: FlowKey) { + self.flow_service + .trigger_manual_flow( + trigger_time, + flow_key, + FAKE_ACCOUNT_ID.to_string(), + AccountName::new_unchecked(auth::DEFAULT_ACCOUNT_NAME), + ) + .await + .unwrap(); + } + + async fn take_scheduled_tasks(&self) -> Vec { + let mut task_states: Vec<_> = Vec::new(); + while let Some(task_id) = self.task_scheduler.try_take().await.unwrap() { + let task_state = self.task_scheduler.get_task(task_id).await.unwrap(); + task_states.push(task_state); + } + task_states + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/infra/task-system-inmem/Cargo.toml b/src/infra/task-system-inmem/Cargo.toml index 3bb4cc7f76..deed4f690b 100644 --- a/src/infra/task-system-inmem/Cargo.toml +++ b/src/infra/task-system-inmem/Cargo.toml @@ -19,16 +19,18 @@ doctest = false [dependencies] opendatafabric = { workspace = true } +event-bus = { workspace = true } kamu-core = { workspace = true } kamu-task-system = { workspace = true } async-stream = "0.3" async-trait = { version = "0.1", default-features = false } chrono = { version = "0.4", default-features = false } -dill = "0.7" +dill = "0.8" futures = "0.3" thiserror = { version = "1", default-features = false } tokio = { version = "1", default-features = false, features=[] } +tokio-stream = { version = "0.1", default-features = false } tracing = { version = "0.1", default-features = false } url = { version = "2", default-features = false, features = ["serde"] } diff --git a/src/infra/task-system-inmem/src/lib.rs b/src/infra/task-system-inmem/src/lib.rs index 4ad86638bc..780f22f130 100644 --- a/src/infra/task-system-inmem/src/lib.rs +++ b/src/infra/task-system-inmem/src/lib.rs @@ -8,6 +8,7 @@ // by the Apache License, Version 2.0. 
#![feature(error_generic_member_access)] +#![feature(hash_set_entry)] // Re-exports pub use kamu_task_system as domain; diff --git a/src/infra/task-system-inmem/src/task_executor_inmem.rs b/src/infra/task-system-inmem/src/task_executor_inmem.rs index 139691f611..d9db587ddb 100644 --- a/src/infra/task-system-inmem/src/task_executor_inmem.rs +++ b/src/infra/task-system-inmem/src/task_executor_inmem.rs @@ -10,29 +10,51 @@ use std::sync::Arc; use dill::*; -use kamu_core::{PullOptions, PullService}; +use event_bus::EventBus; +use kamu_core::{PullOptions, PullService, SystemTimeSource}; use kamu_task_system::*; pub struct TaskExecutorInMemory { task_sched: Arc, event_store: Arc, + event_bus: Arc, + time_source: Arc, catalog: Catalog, } #[component(pub)] +#[interface(dyn TaskExecutor)] #[scope(Singleton)] impl TaskExecutorInMemory { pub fn new( task_sched: Arc, event_store: Arc, + event_bus: Arc, + time_source: Arc, catalog: Catalog, ) -> Self { Self { task_sched, event_store, + event_bus, + time_source, catalog, } } + + async fn publish_task_finished( + &self, + task_id: TaskID, + outcome: TaskOutcome, + ) -> Result<(), InternalError> { + self.event_bus + .dispatch_event(TaskEventFinished { + event_time: self.time_source.now(), + task_id, + outcome, + }) + .await + } } #[async_trait::async_trait] @@ -51,10 +73,9 @@ impl TaskExecutor for TaskExecutorInMemory { "Executing task", ); - let pull_svc = self.catalog.get_one::().int_err()?; - let outcome = match &task.logical_plan { LogicalPlan::UpdateDataset(upd) => { + let pull_svc = self.catalog.get_one::().int_err()?; let res = pull_svc .pull(&upd.dataset_id.as_any_ref(), PullOptions::default(), None) .await; @@ -85,8 +106,10 @@ impl TaskExecutor for TaskExecutorInMemory { // Refresh the task in case it was updated concurrently (e.g. 
late cancellation) task.update(self.event_store.as_ref()).await.int_err()?; - task.finish(outcome).int_err()?; + task.finish(self.time_source.now(), outcome).int_err()?; task.save(self.event_store.as_ref()).await.int_err()?; + + self.publish_task_finished(task.task_id, outcome).await?; } } } diff --git a/src/infra/task-system-inmem/src/task_scheduler_inmem.rs b/src/infra/task-system-inmem/src/task_scheduler_inmem.rs index dc828a0f5f..dbfb8d997f 100644 --- a/src/infra/task-system-inmem/src/task_scheduler_inmem.rs +++ b/src/infra/task-system-inmem/src/task_scheduler_inmem.rs @@ -12,12 +12,14 @@ use std::sync::{Arc, Mutex}; use dill::*; use futures::TryStreamExt; +use kamu_core::SystemTimeSource; use kamu_task_system::*; use opendatafabric::DatasetID; pub struct TaskSchedulerInMemory { state: Arc>, event_store: Arc, + time_source: Arc, } #[derive(Default)] @@ -26,12 +28,17 @@ struct State { } #[component(pub)] +#[interface(dyn TaskScheduler)] #[scope(Singleton)] impl TaskSchedulerInMemory { - pub fn new(event_store: Arc) -> Self { + pub fn new( + event_store: Arc, + time_source: Arc, + ) -> Self { Self { state: Arc::new(Mutex::new(State::default())), event_store, + time_source, } } } @@ -40,7 +47,11 @@ impl TaskSchedulerInMemory { impl TaskScheduler for TaskSchedulerInMemory { #[tracing::instrument(level = "info", skip_all, fields(?logical_plan))] async fn create_task(&self, logical_plan: LogicalPlan) -> Result { - let mut task = Task::new(self.event_store.new_task_id(), logical_plan); + let mut task = Task::new( + self.time_source.now(), + self.event_store.new_task_id(), + logical_plan, + ); task.save(self.event_store.as_ref()).await.int_err()?; let queue_len = { @@ -69,7 +80,7 @@ impl TaskScheduler for TaskSchedulerInMemory { let mut task = Task::load(task_id, self.event_store.as_ref()).await?; if task.can_cancel() { - task.cancel().int_err()?; + task.cancel(self.time_source.now()).int_err()?; task.save(self.event_store.as_ref()).await.int_err()?; let mut state = 
self.state.lock().unwrap(); @@ -80,11 +91,14 @@ impl TaskScheduler for TaskSchedulerInMemory { } #[tracing::instrument(level = "info", skip_all, fields(%dataset_id))] - fn list_tasks_by_dataset(&self, dataset_id: &DatasetID) -> TaskStateStream { + fn list_tasks_by_dataset( + &self, + dataset_id: &DatasetID, + ) -> Result { let dataset_id = dataset_id.clone(); // TODO: This requires a lot more thinking on how to make this performant - Box::pin(async_stream::try_stream! { + Ok(Box::pin(async_stream::try_stream! { let relevant_tasks: Vec<_> = self .event_store .get_tasks_by_dataset(&dataset_id) @@ -96,7 +110,7 @@ impl TaskScheduler for TaskSchedulerInMemory { yield task.into(); } - }) + })) } // TODO: Use signaling instead of a loop @@ -126,7 +140,7 @@ impl TaskScheduler for TaskSchedulerInMemory { let mut task = Task::load(task_id, self.event_store.as_ref()) .await .int_err()?; - task.run().int_err()?; + task.run(self.time_source.now()).int_err()?; task.save(self.event_store.as_ref()).await.int_err()?; tracing::info!( diff --git a/src/infra/task-system-inmem/src/task_system_event_store_inmem.rs b/src/infra/task-system-inmem/src/task_system_event_store_inmem.rs index 8d1bb27f12..12ab0827a8 100644 --- a/src/infra/task-system-inmem/src/task_system_event_store_inmem.rs +++ b/src/infra/task-system-inmem/src/task_system_event_store_inmem.rs @@ -8,19 +8,18 @@ // by the Apache License, Version 2.0. 
use std::collections::hash_map::{Entry, HashMap}; -use std::sync::{Arc, Mutex}; use dill::*; use kamu_task_system::*; use opendatafabric::DatasetID; pub struct TaskSystemEventStoreInMemory { - state: Arc>, + inner: EventStoreInMemory, } #[derive(Default)] struct State { - events: Vec, + events: Vec, tasks_by_dataset: HashMap>, last_task_id: Option, } @@ -38,20 +37,35 @@ impl State { } } +impl EventStoreState for State { + fn events_count(&self) -> usize { + self.events.len() + } + + fn get_events(&self) -> &[::Event] { + &self.events + } + + fn add_event(&mut self, event: ::Event) { + self.events.push(event); + } +} + #[component(pub)] +#[interface(dyn TaskSystemEventStore)] #[scope(Singleton)] impl TaskSystemEventStoreInMemory { pub fn new() -> Self { Self { - state: Arc::new(Mutex::new(State::default())), + inner: EventStoreInMemory::new(), } } fn update_index_by_dataset( tasks_by_dataset: &mut HashMap>, - event: &TaskSystemEvent, + event: &TaskEvent, ) { - if let TaskSystemEvent::TaskCreated(e) = &event { + if let TaskEvent::TaskCreated(e) = &event { if let Some(dataset_id) = e.logical_plan.dataset_id() { let entries = match tasks_by_dataset.entry(dataset_id.clone()) { Entry::Occupied(v) => v.into_mut(), @@ -66,67 +80,38 @@ impl TaskSystemEventStoreInMemory { #[async_trait::async_trait] impl EventStore for TaskSystemEventStoreInMemory { async fn len(&self) -> Result { - Ok(self.state.lock().unwrap().events.len()) + self.inner.len().await } fn get_events<'a>( &'a self, task_id: &TaskID, opts: GetEventsOpts, - ) -> EventStream<'a, TaskSystemEvent> { - let task_id = task_id.clone(); - - // TODO: This should be a buffered stream so we don't lock per event - Box::pin(async_stream::try_stream! 
{ - let mut seen = opts.from.map(|id| (id.into_inner() + 1) as usize).unwrap_or(0); - - loop { - let next = { - let s = self.state.lock().unwrap(); - - let to = opts.to.map(|id| (id.into_inner() + 1) as usize).unwrap_or(s.events.len()); - - s.events[..to] - .iter() - .enumerate() - .skip(seen) - .filter(|(_, e)| e.task_id() == task_id) - .map(|(i, e)| (i, e.clone())) - .next() - }; - - match next { - None => break, - Some((i, event)) => { - seen = i + 1; - yield (EventID::new(i as u64), event) - } - } - } - }) + ) -> EventStream<'a, TaskEvent> { + self.inner.get_events(task_id, opts) } - // TODO: concurrency async fn save_events( &self, - _task_id: &TaskID, - events: Vec, + task_id: &TaskID, + events: Vec, ) -> Result { - let mut s = self.state.lock().unwrap(); - - for event in events { - Self::update_index_by_dataset(&mut s.tasks_by_dataset, &event); - s.events.push(event); + { + let state = self.inner.as_state(); + let mut g = state.lock().unwrap(); + for event in &events { + Self::update_index_by_dataset(&mut g.tasks_by_dataset, &event); + } } - Ok(EventID::new((s.events.len() - 1) as u64)) + self.inner.save_events(task_id, events).await } } #[async_trait::async_trait] impl TaskSystemEventStore for TaskSystemEventStoreInMemory { fn new_task_id(&self) -> TaskID { - self.state.lock().unwrap().next_task_id() + self.inner.as_state().lock().unwrap().next_task_id() } fn get_tasks_by_dataset<'a>(&'a self, dataset_id: &DatasetID) -> TaskIDStream<'a> { @@ -135,8 +120,9 @@ impl TaskSystemEventStore for TaskSystemEventStoreInMemory { // TODO: This should be a buffered stream so we don't lock per record Box::pin(async_stream::try_stream! 
{ let mut pos = { - let s = self.state.lock().unwrap(); - s.tasks_by_dataset.get(&dataset_id).map(|tasks| tasks.len()).unwrap_or(0) + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.tasks_by_dataset.get(&dataset_id).map(|tasks| tasks.len()).unwrap_or(0) }; loop { @@ -147,8 +133,9 @@ impl TaskSystemEventStore for TaskSystemEventStoreInMemory { pos -= 1; let next = { - let s = self.state.lock().unwrap(); - s.tasks_by_dataset + let state = self.inner.as_state(); + let g = state.lock().unwrap(); + g.tasks_by_dataset .get(&dataset_id) .and_then(|tasks| tasks.get(pos).cloned()) }; diff --git a/src/infra/task-system-inmem/tests/tests/test_event_store_inmem.rs b/src/infra/task-system-inmem/tests/tests/test_event_store_inmem.rs index fe6565d439..5d00a5bbf5 100644 --- a/src/infra/task-system-inmem/tests/tests/test_event_store_inmem.rs +++ b/src/infra/task-system-inmem/tests/tests/test_event_store_inmem.rs @@ -42,7 +42,7 @@ async fn test_event_store_get_streams() { let task_id_2 = TaskID::new(321); let dataset_id = DatasetID::from_pub_key_ed25519(b"foo"); - let event_1 = TaskCreated { + let event_1 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_1, logical_plan: Probe { @@ -52,7 +52,7 @@ async fn test_event_store_get_streams() { .into(), }; - let event_2 = TaskCreated { + let event_2 = TaskEventCreated { event_time: Utc::now(), task_id: task_id_2, logical_plan: Probe { @@ -62,7 +62,7 @@ async fn test_event_store_get_streams() { .into(), }; - let event_3 = TaskFinished { + let event_3 = TaskEventFinished { event_time: Utc::now(), task_id: task_id_1, outcome: TaskOutcome::Cancelled, @@ -70,7 +70,7 @@ async fn test_event_store_get_streams() { event_store .save_events( - &task_id_1, // Cheating a bit + &task_id_1, // Cheating a bit, vec![ event_1.clone().into(), event_2.clone().into(), diff --git a/src/infra/task-system-inmem/tests/tests/test_task_aggregate.rs b/src/infra/task-system-inmem/tests/tests/test_task_aggregate.rs index 
6ffae5bc52..cb936ae888 100644 --- a/src/infra/task-system-inmem/tests/tests/test_task_aggregate.rs +++ b/src/infra/task-system-inmem/tests/tests/test_task_aggregate.rs @@ -9,6 +9,7 @@ use std::assert_matches::assert_matches; +use chrono::Utc; use kamu_task_system_inmem::domain::*; use kamu_task_system_inmem::*; @@ -16,7 +17,11 @@ use kamu_task_system_inmem::*; async fn test_task_agg_create_new() { let event_store = TaskSystemEventStoreInMemory::new(); - let mut task = Task::new(event_store.new_task_id(), Probe::default().into()); + let mut task = Task::new( + Utc::now(), + event_store.new_task_id(), + Probe::default().into(), + ); assert_eq!(event_store.len().await.unwrap(), 0); @@ -36,17 +41,17 @@ async fn test_task_save_load_update() { let event_store = TaskSystemEventStoreInMemory::new(); let task_id = event_store.new_task_id(); - let mut task = Task::new(task_id, Probe::default().into()); + let mut task = Task::new(Utc::now(), task_id, Probe::default().into()); task.save(&event_store).await.unwrap(); - task.run().unwrap(); - task.cancel().unwrap(); + task.run(Utc::now()).unwrap(); + task.cancel(Utc::now()).unwrap(); task.save(&event_store).await.unwrap(); let cancel_event = *task.last_stored_event().unwrap(); assert_eq!(event_store.len().await.unwrap(), 3); - task.finish(TaskOutcome::Cancelled).unwrap(); + task.finish(Utc::now(), TaskOutcome::Cancelled).unwrap(); task.save(&event_store).await.unwrap(); assert_eq!(event_store.len().await.unwrap(), 4); @@ -77,8 +82,12 @@ async fn test_task_save_load_update() { async fn test_task_agg_illegal_transition() { let event_store = TaskSystemEventStoreInMemory::new(); - let mut task = Task::new(event_store.new_task_id(), Probe::default().into()); - task.finish(TaskOutcome::Cancelled).unwrap(); + let mut task = Task::new( + Utc::now(), + event_store.new_task_id(), + Probe::default().into(), + ); + task.finish(Utc::now(), TaskOutcome::Cancelled).unwrap(); - assert_matches!(task.run(), Err(ProjectionError { .. 
})); + assert_matches!(task.run(Utc::now(),), Err(ProjectionError { .. })); } diff --git a/src/infra/task-system-inmem/tests/tests/test_task_scheduler_inmem.rs b/src/infra/task-system-inmem/tests/tests/test_task_scheduler_inmem.rs index 6e478b97a7..e8440a8a17 100644 --- a/src/infra/task-system-inmem/tests/tests/test_task_scheduler_inmem.rs +++ b/src/infra/task-system-inmem/tests/tests/test_task_scheduler_inmem.rs @@ -10,13 +10,16 @@ use std::assert_matches::assert_matches; use std::sync::Arc; +use chrono::Utc; +use kamu_core::SystemTimeSourceStub; use kamu_task_system_inmem::domain::*; use kamu_task_system_inmem::*; #[test_log::test(tokio::test)] async fn test_creates_task() { let event_store = Arc::new(TaskSystemEventStoreInMemory::new()); - let task_sched = TaskSchedulerInMemory::new(event_store); + let time_source = Arc::new(SystemTimeSourceStub::new_set(Utc::now())); + let task_sched = TaskSchedulerInMemory::new(event_store, time_source); let logical_plan_expected: LogicalPlan = Probe { ..Probe::default() }.into(); @@ -40,7 +43,8 @@ async fn test_creates_task() { #[test_log::test(tokio::test)] async fn test_queues_tasks() { let event_store = Arc::new(TaskSystemEventStoreInMemory::new()); - let task_sched = TaskSchedulerInMemory::new(event_store); + let time_source = Arc::new(SystemTimeSourceStub::new_set(Utc::now())); + let task_sched = TaskSchedulerInMemory::new(event_store, time_source); let task_id_1 = task_sched .create_task(Probe { ..Probe::default() }.into()) @@ -62,7 +66,8 @@ async fn test_queues_tasks() { #[test_log::test(tokio::test)] async fn test_task_cancellation() { let event_store = Arc::new(TaskSystemEventStoreInMemory::new()); - let task_sched = TaskSchedulerInMemory::new(event_store); + let time_source = Arc::new(SystemTimeSourceStub::new_set(Utc::now())); + let task_sched = TaskSchedulerInMemory::new(event_store, time_source); let task_id_1 = task_sched .create_task(Probe { ..Probe::default() }.into()) diff --git 
a/src/utils/container-runtime/Cargo.toml b/src/utils/container-runtime/Cargo.toml index c456280391..d0c3ce9c3d 100644 --- a/src/utils/container-runtime/Cargo.toml +++ b/src/utils/container-runtime/Cargo.toml @@ -20,7 +20,7 @@ doctest = false [dependencies] async-trait = "0.1" cfg-if = "1" -dill = "0.7" +dill = "0.8" libc = "0.2" rand = "0.8" regex = "1" diff --git a/src/utils/enum-variants/src/lib.rs b/src/utils/enum-variants/src/lib.rs index c83d8e4e8d..cc2aa40b23 100644 --- a/src/utils/enum-variants/src/lib.rs +++ b/src/utils/enum-variants/src/lib.rs @@ -104,3 +104,5 @@ macro_rules! impl_enum_variant { } }; } + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-bus/Cargo.toml b/src/utils/event-bus/Cargo.toml new file mode 100644 index 0000000000..d887e6060f --- /dev/null +++ b/src/utils/event-bus/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "event-bus" +description = "Simple in-memory event bus" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +readme = { workspace = true } +license-file = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +publish = { workspace = true } + + +[lib] +doctest = false + + +[dependencies] +internal-error = { workspace = true } + +async-trait = "0.1" +dill = "0.8" +futures = "0.3" + +[dev-dependencies] +env_logger = "0.10" +test-group = { version = "1" } +test-log = { version = "0.2", features = ["trace"] } +tokio = { version = "1", default-features = false, features=["rt", "macros"] } +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +thiserror = { version = "1", default-features = false } diff --git a/src/utils/event-bus/src/event_bus.rs b/src/utils/event-bus/src/event_bus.rs new file mode 100644 index 0000000000..ea480746d4 --- /dev/null +++ b/src/utils/event-bus/src/event_bus.rs @@ -0,0 +1,187 
@@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use std::any::{Any, TypeId}; +use std::collections::HashMap; +use std::future::Future; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; + +use dill::{component, scope, Catalog, Singleton}; +use internal_error::InternalError; + +use crate::{AsyncEventHandler, EventHandler}; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct EventBus { + catalog: Arc, + state: Mutex, +} + +#[component(pub)] +#[scope(Singleton)] +impl EventBus { + pub fn new(catalog: Arc) -> EventBus { + Self { + catalog, + state: Mutex::new(State::new()), + } + } + + pub fn subscribe_async_closure(&self, callback: H) + where + TEvent: 'static + Clone, + H: Fn(Arc) -> HFut + Send + Sync + 'static, + HFut: std::future::Future> + Send + 'static, + { + let mut state = self.state.lock().unwrap(); + + let event_handlers = state.take_closure_handlers_for::(); + + let async_closure_handler = Arc::new( + move |event: Arc| + -> Pin> + Send>> { + Box::pin(callback(event)) + }, + ); + + event_handlers.0.push(async_closure_handler); + } + + pub async fn dispatch_event(&self, event: TEvent) -> Result<(), InternalError> + where + TEvent: 'static + Clone, + { + self.sync_dispatch(&event)?; + self.async_dispatch(&event).await?; + self.async_closures_dispatch(&event).await?; + + Ok(()) + } + + fn sync_dispatch(&self, event: &TEvent) -> Result<(), InternalError> { + let sync_handlers = self + .catalog + .get::>>() + .unwrap(); + + for sync_handler in sync_handlers { + sync_handler.handle(event)?; + } + + Ok(()) + } + + async fn async_dispatch( + &self, + event: &TEvent, + ) -> Result<(), InternalError> { 
+ let async_handlers = self + .catalog + .get::>>() + .unwrap(); + + let async_handler_futures: Vec<_> = async_handlers + .iter() + .map(|handler| handler.handle(event)) + .collect(); + + let results = futures::future::join_all(async_handler_futures).await; + results.into_iter().try_for_each(|res| res)?; + + Ok(()) + } + + async fn async_closures_dispatch( + &self, + event: &TEvent, + ) -> Result<(), InternalError> { + let maybe_closure_handlers: Option> = { + let state = self.state.lock().unwrap(); + let maybe_event_handlers = state.get_closure_handlers_for::(); + maybe_event_handlers.map(|handlers| handlers.clone()) + }; + + if let Some(closure_handlers) = maybe_closure_handlers { + let event_arc = Arc::new(event.clone()); + let closure_handler_futures: Vec<_> = closure_handlers + .0 + .iter() + .map(|handler| (*handler).call((event_arc.clone(),))) + .collect(); + + let results = futures::future::join_all(closure_handler_futures).await; + results.into_iter().try_for_each(|res| res)?; + } + + Ok(()) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +struct State { + closure_handlers_by_event_type: HashMap>, +} + +impl State { + pub fn new() -> Self { + Self { + closure_handlers_by_event_type: HashMap::new(), + } + } + + pub fn get_closure_handlers_for<'a, TEvent: 'static + Clone>( + &'a self, + ) -> Option<&'a EventClosureHandlers> { + if let Some(event_handlers) = self + .closure_handlers_by_event_type + .get(&TypeId::of::()) + { + let event_handlers = event_handlers + .downcast_ref::>() + .unwrap(); + Some(event_handlers) + } else { + None + } + } + + pub fn take_closure_handlers_for( + &mut self, + ) -> &mut EventClosureHandlers { + self.closure_handlers_by_event_type + .entry(TypeId::of::()) + .or_insert(Box::new(EventClosureHandlers::::default())) + .downcast_mut::>() + .unwrap() + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +type AsyncEventClosure = 
Arc< + dyn Fn(Arc) -> Pin> + Send>> + + Send + + Sync, +>; + +///////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Clone)] +struct EventClosureHandlers(Vec>); + +impl Default for EventClosureHandlers { + fn default() -> Self { + Self(vec![]) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-bus/src/event_handler.rs b/src/utils/event-bus/src/event_handler.rs new file mode 100644 index 0000000000..f380e4867d --- /dev/null +++ b/src/utils/event-bus/src/event_handler.rs @@ -0,0 +1,25 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use internal_error::InternalError; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub trait EventHandler: Sync + Send { + fn handle(&self, event: &TEvent) -> Result<(), InternalError>; +} + +///////////////////////////////////////////////////////////////////////////////////////// + +#[async_trait::async_trait] +pub trait AsyncEventHandler: Sync + Send { + async fn handle(&self, event: &TEvent) -> Result<(), InternalError>; +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-bus/src/lib.rs b/src/utils/event-bus/src/lib.rs new file mode 100644 index 0000000000..3c60511e6e --- /dev/null +++ b/src/utils/event-bus/src/lib.rs @@ -0,0 +1,16 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +#![feature(fn_traits)] + +mod event_bus; +mod event_handler; + +pub use event_bus::*; +pub use event_handler::*; diff --git a/src/utils/event-bus/tests/mod.rs b/src/utils/event-bus/tests/mod.rs new file mode 100644 index 0000000000..6246597fc9 --- /dev/null +++ b/src/utils/event-bus/tests/mod.rs @@ -0,0 +1,10 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +mod test_event_bus; diff --git a/src/utils/event-bus/tests/test_event_bus.rs b/src/utils/event-bus/tests/test_event_bus.rs new file mode 100644 index 0000000000..c46b76879a --- /dev/null +++ b/src/utils/event-bus/tests/test_event_bus.rs @@ -0,0 +1,129 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::sync::{Arc, Mutex}; + +use dill::*; +use event_bus::{AsyncEventHandler, EventBus, EventHandler}; +use internal_error::{ErrorIntoInternal, InternalError}; + +//////////////////////////////////////////////////////////////////////////////////////// + +#[derive(Copy, Clone)] +struct Event {} + +//////////////////////////////////////////////////////////////////////////////////////// + +struct TestSyncHandler { + invoked: Arc>, +} + +#[component(pub)] +#[interface(dyn EventHandler)] +#[scope(Singleton)] +impl TestSyncHandler { + fn new() -> Self { + Self { + invoked: Arc::new(Mutex::new(false)), + } + } + + fn was_invoked(&self) -> bool { + *self.invoked.lock().unwrap() + } +} + +impl EventHandler for TestSyncHandler { + fn handle(&self, _: &Event) -> Result<(), InternalError> { + let mut invoked = self.invoked.lock().unwrap(); + *invoked = true; + Ok(()) + } +} + +#[test_log::test(tokio::test)] +async fn test_bus_sync_handler() { + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .build(); + let event_bus = catalog.get_one::().unwrap(); + + event_bus.dispatch_event(Event {}).await.unwrap(); + + let handler = catalog.get_one::().unwrap(); + assert!(handler.was_invoked()); +} + +//////////////////////////////////////////////////////////////////////////////////////// + +struct TestAsyncHandler { + invoked: Arc>, +} + +#[component(pub)] +#[interface(dyn AsyncEventHandler)] +#[scope(Singleton)] +impl TestAsyncHandler { + fn new() -> Self { + Self { + invoked: Arc::new(Mutex::new(false)), + } + } + + fn was_invoked(&self) -> bool { + *self.invoked.lock().unwrap() + } +} + +#[async_trait::async_trait] +impl AsyncEventHandler for TestAsyncHandler { + async fn handle(&self, _: &Event) -> Result<(), InternalError> { + let mut invoked = self.invoked.lock().unwrap(); + *invoked = true; + Ok(()) + } +} + +#[test_log::test(tokio::test)] +async fn test_bus_async_handler() { + let catalog = dill::CatalogBuilder::new() + .add::() + .add::() + .build(); + let 
event_bus = catalog.get_one::().unwrap(); + + event_bus.dispatch_event(Event {}).await.unwrap(); + + let handler = catalog.get_one::().unwrap(); + assert!(handler.was_invoked()); +} + +//////////////////////////////////////////////////////////////////////////////////////// + +#[test_log::test(tokio::test)] +async fn test_bus_async_closure() { + let catalog = dill::CatalogBuilder::new().add::().build(); + + let event_bus = catalog.get_one::().unwrap(); + + #[derive(thiserror::Error, Debug)] + #[error("Test error")] + struct TestError {} + + event_bus.subscribe_async_closure(|_: Arc| async move { + let error = TestError {}; + Err(error.int_err()) + }); + + let res = event_bus.dispatch_event(Event {}).await; + assert!(res.is_err()); +} + +//////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-sourcing-macros/src/lib.rs b/src/utils/event-sourcing-macros/src/lib.rs index fd8b139fab..572d6efbd4 100644 --- a/src/utils/event-sourcing-macros/src/lib.rs +++ b/src/utils/event-sourcing-macros/src/lib.rs @@ -71,6 +71,15 @@ pub fn derive_aggregate(tokens: proc_macro::TokenStream) -> proc_macro::TokenStr Ok(Self(agg)) } + #[inline] + pub async fn try_load( + query: <#proj_type as ::event_sourcing::Projection>::Query, + event_store: &#store_type, + ) -> Result, TryLoadError<#proj_type>> { + let maybe_agg = ::event_sourcing::Aggregate::try_load(query, event_store).await?; + Ok(maybe_agg.map(|agg| Self(agg))) + } + #[inline] pub async fn load_ext( query: <#proj_type as ::event_sourcing::Projection>::Query, diff --git a/src/utils/event-sourcing/Cargo.toml b/src/utils/event-sourcing/Cargo.toml index d5c25da9b6..2c427ccd30 100644 --- a/src/utils/event-sourcing/Cargo.toml +++ b/src/utils/event-sourcing/Cargo.toml @@ -21,6 +21,7 @@ doctest = false event-sourcing-macros = { workspace = true } internal-error = { workspace = true } +async-stream = "0.3" async-trait = { version = "0.1", default-features = false } chrono = { 
version = "0.4", default-features = false } thiserror = { version = "1", default-features = false } diff --git a/src/utils/event-sourcing/src/aggregate.rs b/src/utils/event-sourcing/src/aggregate.rs index 4e9b237752..18fed98045 100644 --- a/src/utils/event-sourcing/src/aggregate.rs +++ b/src/utils/event-sourcing/src/aggregate.rs @@ -99,6 +99,23 @@ where Self::load_ext(query, event_store, LoadOpts::default()).await } + /// Attempt initializing an aggregate from event history, but allow the not + /// found case + #[inline] + pub async fn try_load( + query: Proj::Query, + event_store: &Store, + ) -> Result, TryLoadError> { + match Self::load_ext(query, event_store, LoadOpts::default()).await { + Ok(a) => Ok(Some(a)), + Err(e) => match e { + LoadError::NotFound(_) => Ok(None), + LoadError::Internal(e) => Err(TryLoadError::Internal(e)), + LoadError::ProjectionError(e) => Err(TryLoadError::ProjectionError(e)), + }, + } + } + /// Same as [EventStore::load()] but with extra control knobs #[tracing::instrument( level = "debug", @@ -334,6 +351,14 @@ pub enum LoadError { Internal(#[from] InternalError), } +#[derive(thiserror::Error, Debug)] +pub enum TryLoadError { + #[error(transparent)] + ProjectionError(ProjectionError), + #[error(transparent)] + Internal(#[from] InternalError), +} + impl From for LoadError { fn from(value: GetEventsError) -> Self { match value { diff --git a/src/utils/event-sourcing/src/event_store_inmem.rs b/src/utils/event-sourcing/src/event_store_inmem.rs new file mode 100644 index 0000000000..079bafd93d --- /dev/null +++ b/src/utils/event-sourcing/src/event_store_inmem.rs @@ -0,0 +1,133 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use std::marker::PhantomData; +use std::sync::{Arc, Mutex}; + +use internal_error::InternalError; + +use crate::*; + +///////////////////////////////////////////////////////////////////////////////////////// + +pub struct EventStoreInMemory> { + state: Arc>, + _proj: PhantomData, +} + +impl> EventStoreInMemory { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(State::default())), + _proj: PhantomData, + } + } + + pub fn as_state(&self) -> Arc> { + self.state.clone() + } +} + +#[async_trait::async_trait] +impl> EventStore + for EventStoreInMemory +{ + async fn len(&self) -> Result { + Ok(self.state.lock().unwrap().events_count()) + } + + fn get_events<'a>( + &'a self, + query: &Proj::Query, + opts: GetEventsOpts, + ) -> EventStream<'a, Proj::Event> { + let query = query.clone(); + + // TODO: This should be a buffered stream so we don't lock per event + Box::pin(async_stream::try_stream! { + let mut seen = opts.from.map(|id| (id.into_inner() + 1) as usize).unwrap_or(0); + + loop { + let next = { + let g = self.state.lock().unwrap(); + + let to = opts.to.map(|id| (id.into_inner() + 1) as usize).unwrap_or(g.events_count()); + + g.get_events()[..to] + .iter() + .enumerate() + .skip(seen) + .filter(|(_, e)| e.matches_query(&query)) + .map(|(i, e)| (i, e.clone())) + .next() + }; + + match next { + None => break, + Some((i, event)) => { + seen = i + 1; + yield (EventID::new(i as u64), event) + } + } + } + }) + } + + // TODO: concurrency + async fn save_events( + &self, + _: &Proj::Query, + events: Vec, + ) -> Result { + let mut g = self.state.lock().unwrap(); + for event in events { + g.add_event(event); + } + + Ok(EventID::new((g.events_count() - 1) as u64)) + } +} + +///////////////////////////////////////////////////////////////////////////////////////// + +pub trait EventStoreState: Default + Sync + Send { + fn events_count(&self) -> usize; + + fn get_events(&self) -> &[Proj::Event]; + + fn add_event(&mut self, event: Proj::Event); +} + 
+///////////////////////////////////////////////////////////////////////////////////////// + +pub struct EventStoreStateImpl { + events: Vec, +} + +impl Default for EventStoreStateImpl { + fn default() -> Self { + Self { events: Vec::new() } + } +} + +impl EventStoreState for EventStoreStateImpl { + fn events_count(&self) -> usize { + self.events.len() + } + + fn get_events(&self) -> &[Proj::Event] { + &self.events + } + + fn add_event(&mut self, event: Proj::Event) { + self.events.push(event); + } +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-sourcing/src/lib.rs b/src/utils/event-sourcing/src/lib.rs index 47acb61ac8..a4916375bf 100644 --- a/src/utils/event-sourcing/src/lib.rs +++ b/src/utils/event-sourcing/src/lib.rs @@ -14,9 +14,13 @@ pub use internal_error::*; mod aggregate; mod event_id; mod event_store; +mod event_store_inmem; mod projection; +mod projection_event; pub use aggregate::*; pub use event_id::*; pub use event_store::*; +pub use event_store_inmem::*; pub use projection::*; +pub use projection_event::*; diff --git a/src/utils/event-sourcing/src/projection.rs b/src/utils/event-sourcing/src/projection.rs index f642b12f0f..58e8d7fcd8 100644 --- a/src/utils/event-sourcing/src/projection.rs +++ b/src/utils/event-sourcing/src/projection.rs @@ -7,6 +7,8 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
+use crate::ProjectionEvent; + /// Projections reconstruct some state from a series of events #[async_trait::async_trait] #[allow(drop_bounds)] @@ -14,15 +16,14 @@ pub trait Projection where Self: Sized + Send + Sync + 'static, Self::Query: Sized + Send + Sync + 'static, - Self::Event: Sized + Send + Sync + 'static, Self: Clone, Self::Query: Clone, - Self::Event: Clone, Self: std::fmt::Debug, Self::Query: std::fmt::Debug, - Self::Event: std::fmt::Debug, + + Self::Event: ProjectionEvent, { /// Type of the query this projection uses to filter events in the event /// store diff --git a/src/utils/event-sourcing/src/projection_event.rs b/src/utils/event-sourcing/src/projection_event.rs new file mode 100644 index 0000000000..b3311b7d65 --- /dev/null +++ b/src/utils/event-sourcing/src/projection_event.rs @@ -0,0 +1,16 @@ +// Copyright Kamu Data, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +///////////////////////////////////////////////////////////////////////////////////////// + +pub trait ProjectionEvent: Sized + std::fmt::Debug + Clone + Sync + Send + 'static { + fn matches_query(&self, query: &Query) -> bool; +} + +///////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/utils/event-sourcing/tests/test_aggregate.rs b/src/utils/event-sourcing/tests/test_aggregate.rs index 11feafb2aa..68f228299e 100644 --- a/src/utils/event-sourcing/tests/test_aggregate.rs +++ b/src/utils/event-sourcing/tests/test_aggregate.rs @@ -17,6 +17,12 @@ enum CalcEvents { Sub(i32), } +impl ProjectionEvent<()> for CalcEvents { + fn matches_query(&self, _: &()) -> bool { + true + } +} + #[derive(Debug, Clone)] struct CalcState(i32);