From 98c7eab613579d22d4c1c0d8a3c379f9df9747c9 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 13:57:05 +0530
Subject: [PATCH 01/11] Stream system test logs to disk during each docker
 phase

Write build, solver, and fieldcompare output incrementally to
system-tests-stdout.log and system-tests-stderr.log, and add phase
logs (system-tests-build.log, -solver.log, -fieldcompare.log) so
failures can be debugged before a test finishes. Extend CI artifact
globs for the new log files.

Fixes the log-file part of #790.
---
 .../generate_reference_results_workflow.yml   |   3 +
 .github/workflows/run_testsuite_workflow.yml  |   3 +
 tools/tests/systemtests/Systemtest.py         | 287 +++++++++++-------
 3 files changed, 186 insertions(+), 107 deletions(-)

diff --git a/.github/workflows/generate_reference_results_workflow.yml b/.github/workflows/generate_reference_results_workflow.yml
index 6ac87f392..9e712126b 100644
--- a/.github/workflows/generate_reference_results_workflow.yml
+++ b/.github/workflows/generate_reference_results_workflow.yml
@@ -98,4 +98,7 @@ jobs:
         path: |
           runs/*/system-tests-stdout.log
           runs/*/system-tests-stderr.log
+          runs/*/system-tests-build.log
+          runs/*/system-tests-solver.log
+          runs/*/system-tests-fieldcompare.log
           runs/*/*/system-tests_*.log
diff --git a/.github/workflows/run_testsuite_workflow.yml b/.github/workflows/run_testsuite_workflow.yml
index 085c5286e..d96df2c5f 100644
--- a/.github/workflows/run_testsuite_workflow.yml
+++ b/.github/workflows/run_testsuite_workflow.yml
@@ -92,6 +92,9 @@ jobs:
         path: |
           runs/*/system-tests-stdout.log
           runs/*/system-tests-stderr.log
+          runs/*/system-tests-build.log
+          runs/*/system-tests-solver.log
+          runs/*/system-tests-fieldcompare.log
           runs/*/*/system-tests_*.log
     - name: tidy up the docker
       run: |
diff --git a/tools/tests/systemtests/Systemtest.py b/tools/tests/systemtests/Systemtest.py
index f438eac03..36998ae6a 100644
--- a/tools/tests/systemtests/Systemtest.py
+++ b/tools/tests/systemtests/Systemtest.py
@@ -1,4 +1,5 @@
 import subprocess
+import threading
 from typing import List, Dict, Optional, Tuple
 from jinja2 import Environment, FileSystemLoader
 from dataclasses import dataclass, field
@@ -24,6 +25,46 @@
 
 DIFF_RESULTS_DIR = "diff-results"
 
+COMBINED_STDOUT_LOG = "system-tests-stdout.log"
+COMBINED_STDERR_LOG = "system-tests-stderr.log"
+PHASE_LOG_FILES = {
+    "build": "system-tests-build.log",
+    "solver": "system-tests-solver.log",
+    "fieldcompare": "system-tests-fieldcompare.log",
+}
+
+
+class _SystemtestLogSink:
+    """Writes subprocess output incrementally to combined and phase log files."""
+
+    def __init__(self, system_test_dir: Path):
+        self._system_test_dir = system_test_dir
+        self._lock = threading.Lock()
+        self._stdout_path = system_test_dir / COMBINED_STDOUT_LOG
+        self._stderr_path = system_test_dir / COMBINED_STDERR_LOG
+        self._stdout_path.write_text("", encoding="utf-8")
+        self._stderr_path.write_text("", encoding="utf-8")
+
+    def begin_phase(self, phase: str) -> None:
+        phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
+        phase_path.write_text(f"=== {phase} ===\n", encoding="utf-8")
+
+    def append_stdout(self, line: str, phase: str) -> None:
+        with self._lock:
+            with self._stdout_path.open("a", encoding="utf-8") as log_file:
+                log_file.write(line + "\n")
+            phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
+            with phase_path.open("a", encoding="utf-8") as log_file:
+                log_file.write(line + "\n")
+
+    def append_stderr(self, line: str, phase: str) -> None:
+        with self._lock:
+            with self._stderr_path.open("a", encoding="utf-8") as log_file:
+                log_file.write(line + "\n")
+            phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
+            with phase_path.open("a", encoding="utf-8") as log_file:
+                log_file.write(f"[stderr] {line}\n")
+
 
 def slugify(value, allow_unicode=False):
     """
@@ -399,6 +440,94 @@ def __unpack_reference_results(self) -> Tuple[bool, str]:
             logging.error(error_message)
             return False, error_message
 
+    def __init_run_logs(self) -> None:
+        self._log_sink = _SystemtestLogSink(self.system_test_dir)
+
+    def _run_docker_compose_subprocess(
+        self,
+        command: List[str],
+        phase: str,
+        timeout: int,
+    ) -> Tuple[int, List[str], List[str]]:
+        """
+        Run a docker compose command, streaming stdout/stderr to log files as they arrive.
+        """
+        stdout_data: List[str] = []
+        stderr_data: List[str] = []
+        log_sink = getattr(self, "_log_sink", None)
+        if log_sink is not None:
+            log_sink.begin_phase(phase)
+        logging.info(f"Docker compose {phase} for {self}")
+
+        try:
+            process = subprocess.Popen(
+                command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                bufsize=1,
+                start_new_session=True,
+                cwd=self.system_test_dir,
+            )
+        except Exception as e:
+            logging.critical(f"Error starting docker compose {phase} command: {e}")
+            return 1, stdout_data, stderr_data
+
+        def read_stream(stream, is_stderr: bool) -> None:
+            if stream is None:
+                return
+            for line in stream:
+                line = line.rstrip("\n\r")
+                if is_stderr:
+                    stderr_data.append(line)
+                    if log_sink is not None:
+                        log_sink.append_stderr(line, phase)
+                else:
+                    stdout_data.append(line)
+                    if log_sink is not None:
+                        log_sink.append_stdout(line, phase)
+            stream.close()
+
+        stdout_thread = threading.Thread(
+            target=read_stream, args=(process.stdout, False), daemon=True)
+        stderr_thread = threading.Thread(
+            target=read_stream, args=(process.stderr, True), daemon=True)
+        stdout_thread.start()
+        stderr_thread.start()
+
+        try:
+            exit_code = process.wait(timeout=timeout)
+        except KeyboardInterrupt as k:
+            process.kill()
+            stdout_thread.join(timeout=SHORT_TIMEOUT)
+            stderr_thread.join(timeout=SHORT_TIMEOUT)
+            raise KeyboardInterrupt from k
+        except subprocess.TimeoutExpired:
+            logging.critical(
+                f"Systemtest {self} timed out during docker compose {phase} "
+                f"after {timeout}s. Killing the process.")
+            process.kill()
+            try:
+                process.wait(timeout=SHORT_TIMEOUT)
+            except subprocess.TimeoutExpired:
+                pass
+            exit_code = process.returncode if process.returncode is not None else 1
+        except Exception as e:
+            logging.critical(
+                f"Systemtest {self} had serious issues during docker compose {phase}: {e}")
+            process.kill()
+            try:
+                process.wait(timeout=SHORT_TIMEOUT)
+            except subprocess.TimeoutExpired:
+                pass
+            exit_code = process.returncode if process.returncode is not None else 1
+
+        stdout_thread.join(timeout=SHORT_TIMEOUT)
+        stderr_thread.join(timeout=SHORT_TIMEOUT)
+        if exit_code is None:
+            exit_code = process.poll() or 1
+        return exit_code, stdout_data, stderr_data
+
     def _run_field_compare(self):
         """
         Executes the field comparison step after unpacking reference results.
@@ -410,47 +539,31 @@ def _run_field_compare(self):
         time_start = time.perf_counter()
         unpack_success, unpack_error_message = self.__unpack_reference_results()
         if not unpack_success:
+            log_sink = getattr(self, "_log_sink", None)
+            if log_sink is not None:
+                log_sink.begin_phase("fieldcompare")
+                log_sink.append_stderr(unpack_error_message, "fieldcompare")
             elapsed_time = time.perf_counter() - time_start
             return FieldCompareResult(1, [], [unpack_error_message], self, elapsed_time)
         docker_compose_content = self.__get_field_compare_compose_file()
-        stdout_data = []
-        stderr_data = []
 
         with open(self.system_test_dir / "docker-compose.field_compare.yaml", 'w') as file:
             file.write(docker_compose_content)
-        try:
-            # Execute docker-compose command
-            process = subprocess.Popen(['docker',
-                                        'compose',
-                                        '--file',
-                                        'docker-compose.field_compare.yaml',
-                                        'up',
-                                        '--exit-code-from',
-                                        'field-compare'],
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       start_new_session=True,
-                                       cwd=self.system_test_dir)
-
-            try:
-                stdout, stderr = process.communicate(timeout=self.timeout)
-            except KeyboardInterrupt as k:
-                process.kill()
-                raise KeyboardInterrupt from k
-            except Exception as e:
-                logging.critical(
-                    f"Systemtest {self} had serious issues executing the docker compose command about to kill the docker compose command. Please check the logs! {e}")
-                process.kill()
-                process.communicate(timeout=SHORT_TIMEOUT)
-            stdout_data.extend(stdout.decode().splitlines())
-            stderr_data.extend(stderr.decode().splitlines())
-            process.poll()
-            elapsed_time = time.perf_counter() - time_start
-            return FieldCompareResult(process.returncode, stdout_data, stderr_data, self, elapsed_time)
-        except Exception as e:
-            logging.CRITICAL("Error executing docker compose command:", e)
-            elapsed_time = time.perf_counter() - time_start
-            return FieldCompareResult(1, stdout_data, stderr_data, self, elapsed_time)
+        exit_code, stdout_data, stderr_data = self._run_docker_compose_subprocess(
+            [
+                'docker',
+                'compose',
+                '--file',
+                'docker-compose.field_compare.yaml',
+                'up',
+                '--exit-code-from',
+                'field-compare',
+            ],
+            "fieldcompare",
+            self.timeout,
+        )
+        elapsed_time = time.perf_counter() - time_start
+        return FieldCompareResult(exit_code, stdout_data, stderr_data, self, elapsed_time)
 
     def __archive_fieldcompare_diffs(self) -> None:
         """
@@ -502,41 +615,20 @@ def _build_docker(self):
         with open(self.system_test_dir / "docker-compose.tutorial.yaml", 'w') as file:
             file.write(docker_compose_content)
 
-        stdout_data = []
-        stderr_data = []
-
-        try:
-            # Execute docker-compose command
-            process = subprocess.Popen(['docker',
-                                        'compose',
-                                        '--file',
-                                        'docker-compose.tutorial.yaml',
-                                        'build'],
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       start_new_session=True,
-                                       cwd=self.system_test_dir)
-
-            try:
-                stdout, stderr = process.communicate(timeout=GLOBAL_TIMEOUT)
-            except KeyboardInterrupt as k:
-                process.kill()
-                # process.send_signal(9)
-                raise KeyboardInterrupt from k
-            except Exception as e:
-                logging.critical(
-                    f"systemtest {self} had serious issues building the docker images via the `docker compose build` command. About to kill the docker compose command. Please check the logs! {e}")
-                process.communicate(timeout=SHORT_TIMEOUT)
-                process.kill()
-
-            stdout_data.extend(stdout.decode().splitlines())
-            stderr_data.extend(stderr.decode().splitlines())
-            elapsed_time = time.perf_counter() - time_start
-            return DockerComposeResult(process.returncode, stdout_data, stderr_data, self, elapsed_time)
-        except Exception as e:
-            logging.critical(f"Error executing docker compose build command: {e}")
-            elapsed_time = time.perf_counter() - time_start
-            return DockerComposeResult(1, stdout_data, stderr_data, self, elapsed_time)
+        exit_code, stdout_data, stderr_data = self._run_docker_compose_subprocess(
+            [
+                'docker',
+                'compose',
+                '--file',
+                'docker-compose.tutorial.yaml',
+                'build',
+                '--progress=plain',
+            ],
+            "build",
+            GLOBAL_TIMEOUT,
+        )
+        elapsed_time = time.perf_counter() - time_start
+        return DockerComposeResult(exit_code, stdout_data, stderr_data, self, elapsed_time)
 
     def _run_tutorial(self):
         """
@@ -547,49 +639,27 @@ def _run_tutorial(self):
         """
         logging.debug(f"Running tutorial {self}")
         time_start = time.perf_counter()
-        stdout_data = []
-        stderr_data = []
-        try:
-            # Execute docker-compose command
-            process = subprocess.Popen(['docker',
-                                        'compose',
-                                        '--file',
-                                        'docker-compose.tutorial.yaml',
-                                        'up'],
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE,
-                                       start_new_session=True,
-                                       cwd=self.system_test_dir)
-
-            try:
-                stdout, stderr = process.communicate(timeout=self.timeout)
-            except KeyboardInterrupt as k:
-                process.kill()
-                # process.send_signal(9)
-                raise KeyboardInterrupt from k
-            except Exception as e:
-                logging.critical(
-                    f"Systemtest {self} had serious issues executing the docker compose command about to kill the docker compose command. Please check the logs! {e}")
-                process.kill()
-                stdout, stderr = process.communicate(timeout=SHORT_TIMEOUT)
-                process.kill()
-
-            stdout_data.extend(stdout.decode().splitlines())
-            stderr_data.extend(stderr.decode().splitlines())
-            elapsed_time = time.perf_counter() - time_start
-            return DockerComposeResult(process.returncode, stdout_data, stderr_data, self, elapsed_time)
-        except Exception as e:
-            logging.critical(f"Error executing docker compose up command: {e}")
-            elapsed_time = time.perf_counter() - time_start
-            return DockerComposeResult(1, stdout_data, stderr_data, self, elapsed_time)
+        exit_code, stdout_data, stderr_data = self._run_docker_compose_subprocess(
+            [
+                'docker',
+                'compose',
+                '--file',
+                'docker-compose.tutorial.yaml',
+                'up',
+            ],
+            "solver",
+            self.timeout,
+        )
+        elapsed_time = time.perf_counter() - time_start
+        return DockerComposeResult(exit_code, stdout_data, stderr_data, self, elapsed_time)
 
     def __repr__(self):
         return f"{self.tutorial.name} {self.case_combination}"
 
     def __write_logs(self, stdout_data: List[str], stderr_data: List[str]):
-        with open(self.system_test_dir / "system-tests-stdout.log", 'w') as stdout_file:
+        with open(self.system_test_dir / COMBINED_STDOUT_LOG, 'w', encoding="utf-8") as stdout_file:
             stdout_file.write("\n".join(stdout_data))
-        with open(self.system_test_dir / "system-tests-stderr.log", 'w') as stderr_file:
+        with open(self.system_test_dir / COMBINED_STDERR_LOG, 'w', encoding="utf-8") as stderr_file:
             stderr_file.write("\n".join(stderr_data))
 
     def __apply_max_time_override(self):
@@ -632,6 +702,7 @@ def run(self, run_directory: Path):
         Runs the system test by generating the Docker Compose file, copying everything into a run folder, and executing docker-compose up.
         """
         self.__prepare_for_run(run_directory)
+        self.__init_run_logs()
         std_out: List[str] = []
         std_err: List[str] = []
 
@@ -697,6 +768,7 @@ def run_for_reference_results(self, run_directory: Path):
         Runs the system test by generating the Docker Compose files to generate the reference results
         """
         self.__prepare_for_run(run_directory)
+        self.__init_run_logs()
         std_out: List[str] = []
         std_err: List[str] = []
         docker_build_result = self._build_docker()
@@ -744,6 +816,7 @@ def run_only_build(self, run_directory: Path):
         Runs only the build commmand, for example to preheat the caches of the docker builder.
         """
         self.__prepare_for_run(run_directory)
+        self.__init_run_logs()
         std_out: List[str] = []
         std_err: List[str] = []
         docker_build_result = self._build_docker()

From d143329056b70ba9b1ec89698c1433ab1c75abb5 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 13:57:12 +0530
Subject: [PATCH 02/11] Improve system test runner console readability

Drop INFO prefix on routine log lines, put test index at the start of
start/finish messages, and add blank lines between tests for easier
scanning in CI and local runs.

Addresses the console formatting part of #790.
---
 tools/tests/systemtests.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index 6debe18d0..36fb88f9d 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -10,6 +10,15 @@
 from paths import PRECICE_TUTORIAL_DIR, PRECICE_TESTS_RUN_DIR, PRECICE_TESTS_DIR
 
 
+class _ConsoleLogFormatter(logging.Formatter):
+    """Omit level prefix for INFO/DEBUG; keep it for warnings and errors."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        if record.levelno >= logging.WARNING:
+            return f"{record.levelname}: {record.getMessage()}"
+        return record.getMessage()
+
+
 def main():
     parser = argparse.ArgumentParser(description='systemtest')
 
@@ -29,8 +38,10 @@ def main():
     # Parse the command-line arguments
     args = parser.parse_args()
 
-    # Configure logging based on the provided log level
-    logging.basicConfig(level=args.log_level, format='%(levelname)s: %(message)s')
+    # Configure logging: no "INFO:" prefix on routine messages (#790)
+    handler = logging.StreamHandler()
+    handler.setFormatter(_ConsoleLogFormatter())
+    logging.basicConfig(level=args.log_level, handlers=[handler])
 
     print(f"Using log-level: {args.log_level}")
 
@@ -72,17 +83,21 @@ def main():
     if not systemtests_to_run:
         raise RuntimeError("Did not find any Systemtests to execute.")
 
-    logging.info(f"About to run the following systemtest in the directory {run_directory}:\n {systemtests_to_run}")
+    total = len(systemtests_to_run)
+    logging.info(
+        f"About to run {total} systemtest(s) in the directory {run_directory}:\n {systemtests_to_run}")
 
     results = []
     for number, systemtest in enumerate(systemtests_to_run, start=1):
-        logging.info(f"Started running {systemtest},  {number}/{len(systemtests_to_run)}")
+        print()
+        logging.info(f"[{number}/{total}] Started running {systemtest}")
         t = time.perf_counter()
         result = systemtest.run(run_directory)
         elapsed_time = time.perf_counter() - t
-        logging.info(f"Running {systemtest} took {elapsed_time:^.1f} seconds")
+        logging.info(f"[{number}/{total}] Finished {systemtest} in {elapsed_time:.1f}s")
         results.append(result)
 
+    print()
     system_test_success = True
     for result in results:
         if not result.success:

From afb2c139c306993958772a359d41f36778e1ab66 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 14:24:01 +0530
Subject: [PATCH 03/11] Improve system test console visuals (#790)

---
 tools/tests/systemtests.py | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index 36fb88f9d..e1f2ce466 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -7,6 +7,8 @@
 from metadata_parser.metdata import Tutorials, Case
 import logging
 import time
+import os
+import sys
 from paths import PRECICE_TUTORIAL_DIR, PRECICE_TESTS_RUN_DIR, PRECICE_TESTS_DIR
 
 
@@ -45,6 +47,23 @@ def main():
 
     print(f"Using log-level: {args.log_level}")
 
+    gh_actions = os.environ.get("GITHUB_ACTIONS", "").lower() == "true"
+    ansi_colors = sys.stdout.isatty() and os.environ.get("TERM", "") not in {"", "dumb"}
+
+    def _style(text: str, color_code: int | None) -> str:
+        if not ansi_colors or color_code is None:
+            return text
+        return f"\x1b[{color_code}m{text}\x1b[0m"
+
+    def _group_start(title: str) -> None:
+        # Only apply folding markers on GitHub actions.
+        if gh_actions:
+            print(f"::group::{title}")
+
+    def _group_end() -> None:
+        if gh_actions:
+            print("::endgroup::")
+
     systemtests_to_run = []
     available_tutorials = Tutorials.from_path(PRECICE_TUTORIAL_DIR)
 
@@ -90,11 +109,22 @@ def main():
     results = []
     for number, systemtest in enumerate(systemtests_to_run, start=1):
         print()
-        logging.info(f"[{number}/{total}] Started running {systemtest}")
+        test_header = f"[{number}/{total}] {systemtest}"
+        _group_start(test_header)
+        print("=" * 80)
+        logging.info(f"Started {systemtest}")
         t = time.perf_counter()
         result = systemtest.run(run_directory)
         elapsed_time = time.perf_counter() - t
-        logging.info(f"[{number}/{total}] Finished {systemtest} in {elapsed_time:.1f}s")
+
+        if result.success:
+            status_label = _style("✅ PASS", 32)
+        else:
+            status_label = _style("❌ FAIL", 31)
+
+        logging.info(f"Finished {systemtest} in {elapsed_time:.1f}s [{status_label}]")
+        _group_end()
+        print("=" * 80)
         results.append(result)
 
     print()

From af979fa1e3ee4697c263d6ea562856fe4a004b11 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 14:25:12 +0530
Subject: [PATCH 04/11] Restore [n/m] prefix in Started/Finished visuals

---
 tools/tests/systemtests.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index e1f2ce466..0db940fe7 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -112,7 +112,7 @@ def _group_end() -> None:
         test_header = f"[{number}/{total}] {systemtest}"
         _group_start(test_header)
         print("=" * 80)
-        logging.info(f"Started {systemtest}")
+        logging.info(f"[{number}/{total}] Started {systemtest}")
         t = time.perf_counter()
         result = systemtest.run(run_directory)
         elapsed_time = time.perf_counter() - t
@@ -122,7 +122,9 @@ def _group_end() -> None:
         else:
             status_label = _style("❌ FAIL", 31)
 
-        logging.info(f"Finished {systemtest} in {elapsed_time:.1f}s [{status_label}]")
+        logging.info(
+            f"[{number}/{total}] Finished {systemtest} in {elapsed_time:.1f}s [{status_label}]"
+        )
         _group_end()
         print("=" * 80)
         results.append(result)

From f2cdc66f47bdb9fe7a032639cd5438ac595b00f5 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 16:09:17 +0530
Subject: [PATCH 05/11] Add changelog entry for system test log streaming
 (#790)

---
 changelog-entries/790.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog-entries/790.md

diff --git a/changelog-entries/790.md b/changelog-entries/790.md
new file mode 100644
index 000000000..1e2cb54f7
--- /dev/null
+++ b/changelog-entries/790.md
@@ -0,0 +1 @@
+- System tests now write logs during each run (build, solver, fieldcompare), including `system-tests-build.log`, `system-tests-solver.log`, and `system-tests-fieldcompare.log`, so failures can be debugged before a test finishes. Console output is easier to scan in local runs and GitHub Actions (fixes [#790](https://github.com/precice/tutorials/issues/790)).

From c1d4b332e73b0b64996b61ef9bf7a8e27f167d47 Mon Sep 17 00:00:00 2001
From: Pranjal <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 16:21:08 +0530
Subject: [PATCH 06/11] Add changelog entry for version 801

---
 changelog-entries/{790.md => 801.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename changelog-entries/{790.md => 801.md} (100%)

diff --git a/changelog-entries/790.md b/changelog-entries/801.md
similarity index 100%
rename from changelog-entries/790.md
rename to changelog-entries/801.md

From 2fcdc4f4af86624883dc992646b901c71c05dc31 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 20:59:34 +0530
Subject: [PATCH 07/11] Update system test log documentation and workflow hints
 Document combined and per-stage system test logs in README and job summary
 hints, and clarify workflow artifact step names to reflect staged log files.

---
 .../generate_reference_results_workflow.yml   |  2 +-
 .github/workflows/run_testsuite_workflow.yml  |  2 +-
 tools/tests/README.md                         | 42 +++++++++++++++----
 tools/tests/systemtests/Systemtest.py         |  2 +-
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/generate_reference_results_workflow.yml b/.github/workflows/generate_reference_results_workflow.yml
index 9e712126b..6211a64fa 100644
--- a/.github/workflows/generate_reference_results_workflow.yml
+++ b/.github/workflows/generate_reference_results_workflow.yml
@@ -91,7 +91,7 @@ jobs:
         name: system_tests_run_${{ github.run_id }}_${{ github.run_attempt }}_reference_full
         path: |
           runs/*
-    - name: Archive main log files
+    - name: Archive system test logs (combined + per-stage)
       uses: actions/upload-artifact@v7
       with:
         name: system_tests_run_${{ github.run_id }}_${{ github.run_attempt }}_reference_logs
diff --git a/.github/workflows/run_testsuite_workflow.yml b/.github/workflows/run_testsuite_workflow.yml
index d96df2c5f..853a94e89 100644
--- a/.github/workflows/run_testsuite_workflow.yml
+++ b/.github/workflows/run_testsuite_workflow.yml
@@ -85,7 +85,7 @@ jobs:
         name: system_tests_run_${{ github.run_id }}_${{ github.run_attempt }}_full
         path: |
           runs/*
-    - name: Archive main log files
+    - name: Archive system test logs (combined + per-stage)
       uses: actions/upload-artifact@v7
       with:
         name: system_tests_run_${{ github.run_id }}_${{ github.run_attempt }}_logs
diff --git a/tools/tests/README.md b/tools/tests/README.md
index ba4891f6b..0c3c0ca21 100644
--- a/tools/tests/README.md
+++ b/tools/tests/README.md
@@ -36,7 +36,7 @@ Workflow for the preCICE v3 release testing:
 
 6. Download the build artifacts from Summary > runs.
 
-    - In there, you may want to check the `system-tests-stdout.log` and `system-tests-stderr.log` files.
+    - In there, you may want to inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`).
     - The produced results are in `precice-exports/`, the reference results in `reference-results-unpacked`.
     - Compare using, e.g., ParaView or [fieldcompare](https://gitlab.com/dglaeser/fieldcompare): `fieldcompare dir precice-exports/ reference/`. The `--diff` option will give you `precice-exports/diff_*.vtu` files, while you can also try different tolerances with `-rtol` and `-atol`.
 
@@ -45,7 +45,7 @@ Workflow for the preCICE v3 release testing:
 To test a certain test-suite defined in `tests.yaml`, use:
 
 ```bash
-python3 systemtests.py --suites=fenics-adapter,<someothersuite>
+python3 systemtests.py --suites=fenics_test,<someothersuite>
 ```
 
 To discover all tests, use `python print_test_suites.py`.
@@ -59,13 +59,13 @@ Go to Actions > [Run Testsuite (manual)](https://github.com/precice/tutorials/ac
 After bringing these changes to `master`, the manual triggering option should be visible on the top right. Until that happens, we can only trigger this workflow manually from the [GitHub CLI](https://github.blog/changelog/2021-04-15-github-cli-1-9-enables-you-to-work-with-github-actions-from-your-terminal/):
 
 ```shell
-gh workflow run run_testsuite_manual.yml -f suites=fenics-adapter --ref=develop
+gh workflow run run_testsuite_manual.yml -f suites=fenics_test --ref=develop
 ```
 
 Another example, to use the latest releases and enable debug information of the tests:
 
 ```shell
-gh workflow run run_testsuite_manual.yml -f suites=fenics-adapter -f build_args="PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0,SU2_VERSION:7.5.1,SU2_ADAPTER_REF:64d4aff,DEALII_ADAPTER_REF:02c5d18,TUTORIALS_REF:340b447" -f log_level=DEBUG --ref=develop
+gh workflow run run_testsuite_manual.yml -f suites=fenics_test -f build_args="PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0,SU2_VERSION:7.5.1,SU2_ADAPTER_REF:64d4aff,DEALII_ADAPTER_REF:02c5d18,TUTORIALS_REF:340b447" -f log_level=DEBUG --ref=develop
 ```
 
 where the `*_REF` should be a specific [commit-ish](https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefcommit-ishacommit-ishalsocommittish).
@@ -75,7 +75,7 @@ Example output:
 ```text
 Run cd tools/tests
   cd tools/tests
-  python systemtests.py --build_args=PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0 --suites=fenics-adapter --log-level=DEBUG
+  python systemtests.py --build_args=PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0 --suites=fenics_test --log-level=DEBUG
   cd ../../
   shell: /usr/bin/bash -e {0}
 INFO: About to run the following systemtest in the directory /home/precice/runners_root/actions-runner-tutorial/_work/tutorials/tutorials/runs:
@@ -105,7 +105,7 @@ In this case, building and running seems to work out, but the tests fail because
 
 The easiest way to debug a systemtest run is first to have a look at the output written into the action on GitHub.
 If this does not provide enough hints, the next step is to download the generated `system_tests_run_<run_id>_<run_attempt>` artifact. Note that by default this will only be generated if the systemtests fail.
-Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains two log files: `system-tests-stderr.log` and `system-tests-stdout.log`. This can be a starting point for a further investigation. When fieldcompare runs with `--diff`, it writes VTK diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
+Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains combined logs (`system-tests-stderr.log`, `system-tests-stdout.log`) and per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`). These are a good starting point for further investigation. When fieldcompare runs with `--diff`, it writes VTK diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
 
 ## Adding new tests
 
@@ -185,6 +185,25 @@ User-facing tools:
   - `print_case_combinations.py`: Prints all possible combinations of tutorial cases, using the `metadata.yaml` files.
   - `build_docker_images.py`: Build the Docker images for each test
   - `generate_reference_results.py`: Executes the system tests with the versions defined in `reference_versions.yaml` and generates the reference data archives, with the names described in `tests.yaml`. (should only be used by the CI Pipeline)
+  - `update_requirements_reference.py`: Regenerates `requirements-reference.txt` with pinned Python versions from `reference_versions.yaml` (for reproducibility, see issue #610).
+  - `validate_requirements_reference.py`: Validates that `requirements-reference.txt` exists and pyprecice version matches `reference_versions.yaml`.
+
+### requirements-reference.txt
+
+A lockfile-style list of pinned Python dependency versions (pyprecice, numpy, matplotlib, nutils, setuptools) for reproducible builds and distributions (see issue [#610](https://github.com/precice/tutorials/issues/610)). This file is a **reference manifest only**: tutorial `run.sh` scripts keep using their own `requirements.txt` (with loose constraints) to avoid merge back-and-forth; system tests use the Docker image’s venv. The reference file records “versions known to work” for a distribution.
+
+**Update at each release.** For best accuracy (match what CI uses), capture from the systemtest Docker image:
+
+```bash
+docker run --rm <python_bindings_or_fenics_image> pip freeze | python3 update_requirements_reference.py --from-freeze
+```
+
+Or regenerate from `reference_versions.yaml` only (pyprecice from PYTHON_BINDINGS_REF; others from script defaults):
+
+```bash
+cd tools/tests
+python3 update_requirements_reference.py
+```
 
 Implementation scripts:
 
@@ -316,7 +335,14 @@ Concrete tests are specified centrally in the file `tests.yaml`. For example:
 
 ```yaml
 test_suites:
-  openfoam-adapter:
+  openfoam_adapter_pr:
+    tutorials:
+      - path: flow-over-heated-plate
+        case_combination:
+          - fluid-openfoam
+          - solid-openfoam
+        reference_result: ./flow-over-heated-plate/reference-results/fluid-openfoam_solid-openfoam.tar.gz
+  openfoam_adapter_release:
     tutorials:
       - path: flow-over-heated-plate
         case_combination:
@@ -333,7 +359,7 @@ test_suites:
 
 The optional `timeout` field (in seconds) sets the maximum time for the solver run and fieldcompare phases of that specific case. If omitted, it defaults to `GLOBAL_TIMEOUT` (currently 900s, overridable via the `PRECICE_SYSTEMTESTS_TIMEOUT` environment variable).
 
-This defines the test suite `openfoam-adapter`, with a case combination to run.
+This defines two test suites, namely `openfoam_adapter_pr` and `openfoam_adapter_release`. Each of them defines which case combinations of which tutorials to run.
 
 ### Generate Reference Results
 
diff --git a/tools/tests/systemtests/Systemtest.py b/tools/tests/systemtests/Systemtest.py
index 36998ae6a..4991abd5f 100644
--- a/tools/tests/systemtests/Systemtest.py
+++ b/tools/tests/systemtests/Systemtest.py
@@ -160,7 +160,7 @@ def _get_length_of_name(results: List[SystemtestResult]) -> int:
         with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
             print("\n\n", file=f)
             print(
-                "In case a test fails, download the archive from the bottom of this page and look into each `system-tests-stdout.log` and `system-tests-stderr.log`. The time spent in each step might already give useful hints.",
+                "In case a test fails, download the archive from the bottom of this page and inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`). The phase runtimes might already give useful hints.",
                 file=f)
             print(
                 "See the [documentation](https://precice.org/dev-docs-system-tests.html#understanding-what-went-wrong).",

From 43ae44a2d5eadf801d518be60165cb445cbb9711 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Wed, 27 May 2026 21:05:04 +0530
Subject: [PATCH 08/11] Keep README changes focused on staged system test logs
 only

---
 tools/tests/README.md | 42 ++++++++----------------------------------
 1 file changed, 8 insertions(+), 34 deletions(-)

diff --git a/tools/tests/README.md b/tools/tests/README.md
index 0c3c0ca21..ba4891f6b 100644
--- a/tools/tests/README.md
+++ b/tools/tests/README.md
@@ -36,7 +36,7 @@ Workflow for the preCICE v3 release testing:
 
 6. Download the build artifacts from Summary > runs.
 
-    - In there, you may want to inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`).
+    - In there, you may want to check the `system-tests-stdout.log` and `system-tests-stderr.log` files.
     - The produced results are in `precice-exports/`, the reference results in `reference-results-unpacked`.
     - Compare using, e.g., ParaView or [fieldcompare](https://gitlab.com/dglaeser/fieldcompare): `fieldcompare dir precice-exports/ reference/`. The `--diff` option will give you `precice-exports/diff_*.vtu` files, while you can also try different tolerances with `-rtol` and `-atol`.
 
@@ -45,7 +45,7 @@ Workflow for the preCICE v3 release testing:
 To test a certain test-suite defined in `tests.yaml`, use:
 
 ```bash
-python3 systemtests.py --suites=fenics_test,<someothersuite>
+python3 systemtests.py --suites=fenics-adapter,<someothersuite>
 ```
 
 To discover all tests, use `python print_test_suites.py`.
@@ -59,13 +59,13 @@ Go to Actions > [Run Testsuite (manual)](https://github.com/precice/tutorials/ac
 After bringing these changes to `master`, the manual triggering option should be visible on the top right. Until that happens, we can only trigger this workflow manually from the [GitHub CLI](https://github.blog/changelog/2021-04-15-github-cli-1-9-enables-you-to-work-with-github-actions-from-your-terminal/):
 
 ```shell
-gh workflow run run_testsuite_manual.yml -f suites=fenics_test --ref=develop
+gh workflow run run_testsuite_manual.yml -f suites=fenics-adapter --ref=develop
 ```
 
 Another example, to use the latest releases and enable debug information of the tests:
 
 ```shell
-gh workflow run run_testsuite_manual.yml -f suites=fenics_test -f build_args="PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0,SU2_VERSION:7.5.1,SU2_ADAPTER_REF:64d4aff,DEALII_ADAPTER_REF:02c5d18,TUTORIALS_REF:340b447" -f log_level=DEBUG --ref=develop
+gh workflow run run_testsuite_manual.yml -f suites=fenics-adapter -f build_args="PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0,SU2_VERSION:7.5.1,SU2_ADAPTER_REF:64d4aff,DEALII_ADAPTER_REF:02c5d18,TUTORIALS_REF:340b447" -f log_level=DEBUG --ref=develop
 ```
 
 where the `*_REF` should be a specific [commit-ish](https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefcommit-ishacommit-ishalsocommittish).
@@ -75,7 +75,7 @@ Example output:
 ```text
 Run cd tools/tests
   cd tools/tests
-  python systemtests.py --build_args=PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0 --suites=fenics_test --log-level=DEBUG
+  python systemtests.py --build_args=PRECICE_REF:v3.1.1,PRECICE_PRESET:production-audit,OPENFOAM_ADAPTER_REF:v1.3.0,PYTHON_BINDINGS_REF:v3.1.0,FENICS_ADAPTER_REF:v2.1.0 --suites=fenics-adapter --log-level=DEBUG
   cd ../../
   shell: /usr/bin/bash -e {0}
 INFO: About to run the following systemtest in the directory /home/precice/runners_root/actions-runner-tutorial/_work/tutorials/tutorials/runs:
@@ -105,7 +105,7 @@ In this case, building and running seems to work out, but the tests fail because
 
 The easiest way to debug a systemtest run is first to have a look at the output written into the action on GitHub.
 If this does not provide enough hints, the next step is to download the generated `system_tests_run_<run_id>_<run_attempt>` artifact. Note that by default this will only be generated if the systemtests fail.
-Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains combined logs (`system-tests-stderr.log`, `system-tests-stdout.log`) and per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`). These are a good starting point for further investigation. When fieldcompare runs with `--diff`, it writes VTK diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
+Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains two log files: `system-tests-stderr.log` and `system-tests-stdout.log`. This can be a starting point for a further investigation. When fieldcompare runs with `--diff`, it writes VTK diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
 
 ## Adding new tests
 
@@ -185,25 +185,6 @@ User-facing tools:
   - `print_case_combinations.py`: Prints all possible combinations of tutorial cases, using the `metadata.yaml` files.
   - `build_docker_images.py`: Build the Docker images for each test
   - `generate_reference_results.py`: Executes the system tests with the versions defined in `reference_versions.yaml` and generates the reference data archives, with the names described in `tests.yaml`. (should only be used by the CI Pipeline)
-  - `update_requirements_reference.py`: Regenerates `requirements-reference.txt` with pinned Python versions from `reference_versions.yaml` (for reproducibility, see issue #610).
-  - `validate_requirements_reference.py`: Validates that `requirements-reference.txt` exists and pyprecice version matches `reference_versions.yaml`.
-
-### requirements-reference.txt
-
-A lockfile-style list of pinned Python dependency versions (pyprecice, numpy, matplotlib, nutils, setuptools) for reproducible builds and distributions (see issue [#610](https://github.com/precice/tutorials/issues/610)). This file is a **reference manifest only**: tutorial `run.sh` scripts keep using their own `requirements.txt` (with loose constraints) to avoid merge back-and-forth; system tests use the Docker image’s venv. The reference file records “versions known to work” for a distribution.
-
-**Update at each release.** For best accuracy (match what CI uses), capture from the systemtest Docker image:
-
-```bash
-docker run --rm <python_bindings_or_fenics_image> pip freeze | python3 update_requirements_reference.py --from-freeze
-```
-
-Or regenerate from `reference_versions.yaml` only (pyprecice from PYTHON_BINDINGS_REF; others from script defaults):
-
-```bash
-cd tools/tests
-python3 update_requirements_reference.py
-```
 
 Implementation scripts:
 
@@ -335,14 +316,7 @@ Concrete tests are specified centrally in the file `tests.yaml`. For example:
 
 ```yaml
 test_suites:
-  openfoam_adapter_pr:
-    tutorials:
-      - path: flow-over-heated-plate
-        case_combination:
-          - fluid-openfoam
-          - solid-openfoam
-        reference_result: ./flow-over-heated-plate/reference-results/fluid-openfoam_solid-openfoam.tar.gz
-  openfoam_adapter_release:
+  openfoam-adapter:
     tutorials:
       - path: flow-over-heated-plate
         case_combination:
@@ -359,7 +333,7 @@ test_suites:
 
 The optional `timeout` field (in seconds) sets the maximum time for the solver run and fieldcompare phases of that specific case. If omitted, it defaults to `GLOBAL_TIMEOUT` (currently 900s, overridable via the `PRECICE_SYSTEMTESTS_TIMEOUT` environment variable).
 
-This defines two test suites, namely `openfoam_adapter_pr` and `openfoam_adapter_release`. Each of them defines which case combinations of which tutorials to run.
+This defines the test suite `openfoam-adapter`, with a case combination to run.
 
 ### Generate Reference Results
 

From 66b032dc147b31687c7a71aa71539ec7994b3414 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Mon, 1 Jun 2026 14:25:00 +0530
Subject: [PATCH 09/11] Address review feedback on system test logging and
 console output

---
 .../generate_reference_results_workflow.yml   |  6 +-
 .github/workflows/run_testsuite_workflow.yml  |  6 +-
 changelog-entries/801.md                      |  2 +-
 tools/tests/README.md                         |  4 +-
 tools/tests/systemtests.py                    | 63 +++++++++++--------
 tools/tests/systemtests/Systemtest.py         | 56 +++++++++--------
 6 files changed, 76 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/generate_reference_results_workflow.yml b/.github/workflows/generate_reference_results_workflow.yml
index c7ca1bac0..30983dd7e 100644
--- a/.github/workflows/generate_reference_results_workflow.yml
+++ b/.github/workflows/generate_reference_results_workflow.yml
@@ -92,7 +92,7 @@ jobs:
         fi
         git commit -m "${{inputs.commit_msg}}"
         git push
-    - name: Archive system test logs (combined + per-stage)
+    - name: Archive system test logs
       if: ${{ always() }}
       uses: actions/upload-artifact@v7
       with:
@@ -101,8 +101,8 @@ jobs:
           runs/*/system-tests-stdout.log
           runs/*/system-tests-stderr.log
           runs/*/system-tests-build.log
-          runs/*/system-tests-solver.log
-          runs/*/system-tests-fieldcompare.log
+          runs/*/system-tests-run.log
+          runs/*/system-tests-compare.log
           runs/*/*/system-tests_*.log
     - name: Archive run files
       if: failure()
diff --git a/.github/workflows/run_testsuite_workflow.yml b/.github/workflows/run_testsuite_workflow.yml
index a0b8f028c..c3178fd65 100644
--- a/.github/workflows/run_testsuite_workflow.yml
+++ b/.github/workflows/run_testsuite_workflow.yml
@@ -78,7 +78,7 @@ jobs:
         cd tools/tests
         python systemtests.py --build_args=${{ inputs.build_args}} --suites=${{ inputs.suites}} --log-level=${{ inputs.log_level}}
         cd ../../
-    - name: Archive system test logs (combined + per-stage)
+    - name: Archive system test logs
       if: ${{ always() }}
       uses: actions/upload-artifact@v7
       with:
@@ -87,8 +87,8 @@ jobs:
           runs/*/system-tests-stdout.log
           runs/*/system-tests-stderr.log
           runs/*/system-tests-build.log
-          runs/*/system-tests-solver.log
-          runs/*/system-tests-fieldcompare.log
+          runs/*/system-tests-run.log
+          runs/*/system-tests-compare.log
           runs/*/*/system-tests_*.log
     - name: Archive run files
       if: ${{ failure() || inputs.upload_artifacts == 'TRUE' }}
diff --git a/changelog-entries/801.md b/changelog-entries/801.md
index 1e2cb54f7..745806f4a 100644
--- a/changelog-entries/801.md
+++ b/changelog-entries/801.md
@@ -1 +1 @@
-- System tests now write logs during each run (build, solver, fieldcompare), including `system-tests-build.log`, `system-tests-solver.log`, and `system-tests-fieldcompare.log`, so failures can be debugged before a test finishes. Console output is easier to scan in local runs and GitHub Actions (fixes [#790](https://github.com/precice/tutorials/issues/790)).
+- System tests now write logs during each run (build, run, compare), including `system-tests-build.log`, `system-tests-run.log`, and `system-tests-compare.log` [#801](https://github.com/precice/tutorials/pull/801).
diff --git a/tools/tests/README.md b/tools/tests/README.md
index 8a7df6e20..6f7e6cabf 100644
--- a/tools/tests/README.md
+++ b/tools/tests/README.md
@@ -36,7 +36,7 @@ Workflow for the preCICE v3 release testing:
 
 6. Download the build artifacts from Summary > runs.
 
-    - In there, you may want to check the `system-tests-stdout.log` and `system-tests-stderr.log` files.
+    - In there, inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-run.log`, `system-tests-compare.log`).
     - The produced results are in `precice-exports/`, the reference results in `reference-results-unpacked`.
     - Compare using, e.g., ParaView or [fieldcompare](https://gitlab.com/dglaeser/fieldcompare): `fieldcompare dir precice-exports/ reference/`. The `--diff` option will give you `precice-exports/diff_*.vtu` files, while you can also try different tolerances with `-rtol` and `-atol`.
 
@@ -105,7 +105,7 @@ In this case, building and running seems to work out, but the tests fail because
 
 The easiest way to debug a systemtest run is first to have a look at the output written into the action on GitHub.
 If this does not provide enough hints, the next step is to download the generated `system_tests_run_<run_id>_<run_attempt>` artifact. Note that by default this will only be generated if the systemtests fail.
-Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains two log files: `system-tests-stderr.log` and `system-tests-stdout.log`. This can be a starting point for a further investigation. When fieldcompare runs with `--diff`, it writes VTK (.vtu) diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
+Inside the archive, a test-specific subfolder like `flow-over-heated-plate_fluid-openfoam-solid-fenics_2023-11-19-211723` contains combined logs (`system-tests-stderr.log`, `system-tests-stdout.log`) and per-stage logs (`system-tests-build.log`, `system-tests-run.log`, `system-tests-compare.log`). These are a good starting point for further investigation. When fieldcompare runs with `--diff`, it writes VTK (.vtu) diff files under `precice-exports/`; if the comparison fails, those files are copied into a `diff-results/` subfolder in the same run directory (mirroring any subpaths under `precice-exports/`) so you can open them (e.g. in ParaView) to see where results differ from the reference. On successful comparisons, `diff-results/` is therefore absent.
 
 ## Adding new tests
 
diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index 0db940fe7..9544fb2f4 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -40,14 +40,13 @@ def main():
     # Parse the command-line arguments
     args = parser.parse_args()
 
-    # Configure logging: no "INFO:" prefix on routine messages (#790)
+    # Configure logging
     handler = logging.StreamHandler()
     handler.setFormatter(_ConsoleLogFormatter())
     logging.basicConfig(level=args.log_level, handlers=[handler])
 
-    print(f"Using log-level: {args.log_level}")
-
     gh_actions = os.environ.get("GITHUB_ACTIONS", "").lower() == "true"
+    # Skip ANSI colors when TERM is unset or "dumb" (minimal terminal, common in CI).
     ansi_colors = sys.stdout.isatty() and os.environ.get("TERM", "") not in {"", "dumb"}
 
     def _style(text: str, color_code: int | None) -> str:
@@ -56,7 +55,6 @@ def _style(text: str, color_code: int | None) -> str:
         return f"\x1b[{color_code}m{text}\x1b[0m"
 
     def _group_start(title: str) -> None:
-        # Only apply folding markers on GitHub actions.
         if gh_actions:
             print(f"::group::{title}")
 
@@ -65,6 +63,7 @@ def _group_end() -> None:
             print("::endgroup::")
 
     systemtests_to_run = []
+    test_suites_to_execute = []
     available_tutorials = Tutorials.from_path(PRECICE_TUTORIAL_DIR)
 
     build_args = SystemtestArguments.from_args(args.build_args)
@@ -73,7 +72,6 @@ def _group_end() -> None:
         test_suites_requested = args.suites.split(',')
         available_testsuites = TestSuites.from_yaml(
             PRECICE_TESTS_DIR / "tests.yaml", available_tutorials)
-        test_suites_to_execute = []
         for test_suite_requested in test_suites_requested:
             test_suite_found = available_testsuites.get_by_name(
                 test_suite_requested)
@@ -103,33 +101,48 @@ def _group_end() -> None:
         raise RuntimeError("Did not find any Systemtests to execute.")
 
     total = len(systemtests_to_run)
-    logging.info(
-        f"About to run {total} systemtest(s) in the directory {run_directory}:\n {systemtests_to_run}")
+
+    if test_suites_to_execute:
+        print("Selected test suite(s):", flush=True)
+        print(flush=True)
+        for test_suite in test_suites_to_execute:
+            print(f"- {test_suite.name}", flush=True)
+        print(flush=True)
+
+    print(f"About to run {total} test(s) in the directory {run_directory}:", flush=True)
+    print(flush=True)
+    for number, systemtest in enumerate(systemtests_to_run, start=1):
+        print(f"{number}. {systemtest}", flush=True)
+    print(flush=True)
+    print(f"Using log-level: {args.log_level}", flush=True)
 
     results = []
     for number, systemtest in enumerate(systemtests_to_run, start=1):
-        print()
+        print(flush=True)
         test_header = f"[{number}/{total}] {systemtest}"
         _group_start(test_header)
-        print("=" * 80)
-        logging.info(f"[{number}/{total}] Started {systemtest}")
-        t = time.perf_counter()
-        result = systemtest.run(run_directory)
-        elapsed_time = time.perf_counter() - t
-
-        if result.success:
-            status_label = _style("✅ PASS", 32)
-        else:
-            status_label = _style("❌ FAIL", 31)
+        if not gh_actions:
+            print("=" * 80, flush=True)
+        try:
+            logging.info(f"[{number}/{total}] Started {systemtest}")
+            t = time.perf_counter()
+            result = systemtest.run(run_directory)
+            elapsed_time = time.perf_counter() - t
+
+            if result.success:
+                status_label = _style("✅ PASS", 32)
+            else:
+                status_label = _style("❌ FAIL", 31)
 
-        logging.info(
-            f"[{number}/{total}] Finished {systemtest} in {elapsed_time:.1f}s [{status_label}]"
-        )
-        _group_end()
-        print("=" * 80)
-        results.append(result)
+            logging.info(
+                f"{status_label} Finished {systemtest} in {elapsed_time:.1f}s"
+            )
+            results.append(result)
+        finally:
+            _group_end()
+            if not gh_actions:
+                print("=" * 80, flush=True)
 
-    print()
     system_test_success = True
     for result in results:
         if not result.success:
diff --git a/tools/tests/systemtests/Systemtest.py b/tools/tests/systemtests/Systemtest.py
index 04293ccd4..492b480a5 100644
--- a/tools/tests/systemtests/Systemtest.py
+++ b/tools/tests/systemtests/Systemtest.py
@@ -27,15 +27,15 @@
 
 COMBINED_STDOUT_LOG = "system-tests-stdout.log"
 COMBINED_STDERR_LOG = "system-tests-stderr.log"
-PHASE_LOG_FILES = {
+STAGE_LOG_FILES = {
     "build": "system-tests-build.log",
-    "solver": "system-tests-solver.log",
-    "fieldcompare": "system-tests-fieldcompare.log",
+    "run": "system-tests-run.log",
+    "compare": "system-tests-compare.log",
 }
 
 
 class _SystemtestLogSink:
-    """Writes subprocess output incrementally to combined and phase log files."""
+    """Writes subprocess output incrementally to combined and stage log files."""
 
     def __init__(self, system_test_dir: Path):
         self._system_test_dir = system_test_dir
@@ -45,24 +45,24 @@ def __init__(self, system_test_dir: Path):
         self._stdout_path.write_text("", encoding="utf-8")
         self._stderr_path.write_text("", encoding="utf-8")
 
-    def begin_phase(self, phase: str) -> None:
-        phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
-        phase_path.write_text(f"=== {phase} ===\n", encoding="utf-8")
+    def begin_stage(self, stage: str) -> None:
+        stage_path = self._system_test_dir / STAGE_LOG_FILES[stage]
+        stage_path.write_text(f"=== {stage} ===\n", encoding="utf-8")
 
-    def append_stdout(self, line: str, phase: str) -> None:
+    def append_stdout(self, line: str, stage: str) -> None:
         with self._lock:
             with self._stdout_path.open("a", encoding="utf-8") as log_file:
                 log_file.write(line + "\n")
-            phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
-            with phase_path.open("a", encoding="utf-8") as log_file:
+            stage_path = self._system_test_dir / STAGE_LOG_FILES[stage]
+            with stage_path.open("a", encoding="utf-8") as log_file:
                 log_file.write(line + "\n")
 
-    def append_stderr(self, line: str, phase: str) -> None:
+    def append_stderr(self, line: str, stage: str) -> None:
         with self._lock:
             with self._stderr_path.open("a", encoding="utf-8") as log_file:
                 log_file.write(line + "\n")
-            phase_path = self._system_test_dir / PHASE_LOG_FILES[phase]
-            with phase_path.open("a", encoding="utf-8") as log_file:
+            stage_path = self._system_test_dir / STAGE_LOG_FILES[stage]
+            with stage_path.open("a", encoding="utf-8") as log_file:
                 log_file.write(f"[stderr] {line}\n")
 
 
@@ -121,6 +121,8 @@ def display_systemtestresults_as_table(results: List[SystemtestResult]):
     """
     Prints the result in a nice tabluated way to get an easy overview
     """
+    print()
+
     def _get_length_of_name(results: List[SystemtestResult]) -> int:
         return max(len(str(result.systemtest)) for result in results)
 
@@ -160,7 +162,7 @@ def _get_length_of_name(results: List[SystemtestResult]) -> int:
         with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
             print("\n\n", file=f)
             print(
-                "In case a test fails, download the archive from the bottom of this page and inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-solver.log`, `system-tests-fieldcompare.log`). The phase runtimes might already give useful hints.",
+                "In case a test fails, download the archive from the bottom of this page and inspect the combined logs (`system-tests-stdout.log`, `system-tests-stderr.log`) and the per-stage logs (`system-tests-build.log`, `system-tests-run.log`, `system-tests-compare.log`). The stage runtimes might already give useful hints.",
                 file=f)
             print(
                 "See the [documentation](https://precice.org/dev-docs-system-tests.html#understanding-what-went-wrong).",
@@ -446,7 +448,7 @@ def __init_run_logs(self) -> None:
     def _run_docker_compose_subprocess(
         self,
         command: List[str],
-        phase: str,
+        stage: str,
         timeout: int,
     ) -> Tuple[int, List[str], List[str]]:
         """
@@ -456,8 +458,8 @@ def _run_docker_compose_subprocess(
         stderr_data: List[str] = []
         log_sink = getattr(self, "_log_sink", None)
         if log_sink is not None:
-            log_sink.begin_phase(phase)
-        logging.info(f"Docker compose {phase} for {self}")
+            log_sink.begin_stage(stage)
+        logging.info(f"Docker compose {stage} for {self}")
 
         try:
             process = subprocess.Popen(
@@ -470,7 +472,7 @@ def _run_docker_compose_subprocess(
                 cwd=self.system_test_dir,
             )
         except Exception as e:
-            logging.critical(f"Error starting docker compose {phase} command: {e}")
+            logging.critical(f"Error starting docker compose {stage} command: {e}")
             return 1, stdout_data, stderr_data
 
         def read_stream(stream, is_stderr: bool) -> None:
@@ -481,11 +483,11 @@ def read_stream(stream, is_stderr: bool) -> None:
                 if is_stderr:
                     stderr_data.append(line)
                     if log_sink is not None:
-                        log_sink.append_stderr(line, phase)
+                        log_sink.append_stderr(line, stage)
                 else:
                     stdout_data.append(line)
                     if log_sink is not None:
-                        log_sink.append_stdout(line, phase)
+                        log_sink.append_stdout(line, stage)
             stream.close()
 
         stdout_thread = threading.Thread(
@@ -504,7 +506,7 @@ def read_stream(stream, is_stderr: bool) -> None:
             raise KeyboardInterrupt from k
         except subprocess.TimeoutExpired:
             logging.critical(
-                f"Systemtest {self} timed out during docker compose {phase} "
+                f"Systemtest {self} timed out during docker compose {stage} "
                 f"after {timeout}s. Killing the process.")
             process.kill()
             try:
@@ -514,7 +516,7 @@ def read_stream(stream, is_stderr: bool) -> None:
             exit_code = process.returncode if process.returncode is not None else 1
         except Exception as e:
             logging.critical(
-                f"Systemtest {self} had serious issues during docker compose {phase}: {e}")
+                f"Systemtest {self} had serious issues during docker compose {stage}: {e}")
             process.kill()
             try:
                 process.wait(timeout=SHORT_TIMEOUT)
@@ -570,8 +572,8 @@ def _run_field_compare(self):
         if not unpack_success:
             log_sink = getattr(self, "_log_sink", None)
             if log_sink is not None:
-                log_sink.begin_phase("fieldcompare")
-                log_sink.append_stderr(unpack_error_message, "fieldcompare")
+                log_sink.begin_stage("compare")
+                log_sink.append_stderr(unpack_error_message, "compare")
             elapsed_time = time.perf_counter() - time_start
             return FieldCompareResult(1, [], [unpack_error_message], self, elapsed_time)
         docker_compose_content = self.__get_field_compare_compose_file()
@@ -588,7 +590,7 @@ def _run_field_compare(self):
                 '--exit-code-from',
                 'field-compare',
             ],
-            "fieldcompare",
+            "compare",
             self.timeout,
         )
         elapsed_time = time.perf_counter() - time_start
@@ -648,10 +650,10 @@ def _build_docker(self):
             [
                 'docker',
                 'compose',
+                '--progress=plain',
                 '--file',
                 'docker-compose.tutorial.yaml',
                 'build',
-                '--progress=plain',
             ],
             "build",
             GLOBAL_TIMEOUT,
@@ -676,7 +678,7 @@ def _run_tutorial(self):
                 'docker-compose.tutorial.yaml',
                 'up',
             ],
-            "solver",
+            "run",
             self.timeout,
         )
         elapsed_time = time.perf_counter() - time_start

From 02b001b14db50f438f0c55d35d3d1ea5f0758523 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Mon, 1 Jun 2026 18:36:27 +0530
Subject: [PATCH 10/11] Tweak system test console output for GitHub Actions

Print PASS/FAIL after ::endgroup:: so results stay visible when groups
are collapsed, add a blank line before the per-test summary, and show
status symbols in the results table and job summary.
---
 tools/tests/systemtests.py            | 25 +++++++++++--------------
 tools/tests/systemtests/Systemtest.py |  8 ++++++--
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index 9544fb2f4..10531b33d 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -56,11 +56,11 @@ def _style(text: str, color_code: int | None) -> str:
 
     def _group_start(title: str) -> None:
         if gh_actions:
-            print(f"::group::{title}")
+            print(f"::group::{title}", flush=True)
 
     def _group_end() -> None:
         if gh_actions:
-            print("::endgroup::")
+            print("::endgroup::", flush=True)
 
     systemtests_to_run = []
     test_suites_to_execute = []
@@ -119,12 +119,11 @@ def _group_end() -> None:
     results = []
     for number, systemtest in enumerate(systemtests_to_run, start=1):
         print(flush=True)
-        test_header = f"[{number}/{total}] {systemtest}"
-        _group_start(test_header)
-        if not gh_actions:
-            print("=" * 80, flush=True)
+        started_header = f"[{number}/{total}] Started {systemtest}"
+        _group_start(started_header)
         try:
-            logging.info(f"[{number}/{total}] Started {systemtest}")
+            if not gh_actions:
+                logging.info(started_header)
             t = time.perf_counter()
             result = systemtest.run(run_directory)
             elapsed_time = time.perf_counter() - t
@@ -133,16 +132,14 @@ def _group_end() -> None:
                 status_label = _style("✅ PASS", 32)
             else:
                 status_label = _style("❌ FAIL", 31)
-
-            logging.info(
-                f"{status_label} Finished {systemtest} in {elapsed_time:.1f}s"
-            )
-            results.append(result)
         finally:
             _group_end()
-            if not gh_actions:
-                print("=" * 80, flush=True)
 
+        print(f"{status_label} Finished {systemtest} in {elapsed_time:.1f}s", flush=True)
+        print(f"[{number}/{total}] {systemtest}", flush=True)
+        results.append(result)
+
+    print(flush=True)
     system_test_success = True
     for result in results:
         if not result.success:
diff --git a/tools/tests/systemtests/Systemtest.py b/tools/tests/systemtests/Systemtest.py
index 492b480a5..cbee71c4e 100644
--- a/tools/tests/systemtests/Systemtest.py
+++ b/tools/tests/systemtests/Systemtest.py
@@ -117,6 +117,10 @@ class SystemtestResult:
     fieldcompare_time: float  # in seconds
 
 
+def _success_status_symbol(success: bool) -> str:
+    return "✅" if success else "❌"
+
+
 def display_systemtestresults_as_table(results: List[SystemtestResult]):
     """
     Prints the result in a nice tabluated way to get an easy overview
@@ -129,7 +133,7 @@ def _get_length_of_name(results: List[SystemtestResult]) -> int:
     max_name_length = _get_length_of_name(results)
 
     header = f"| {'systemtest':<{max_name_length + 2}} "\
-        f"| {'success':^7} "\
+        f"| {'status':^7} "\
         f"| {'building time [s]':^17} "\
         f"| {'solver time [s]':^15} "\
         f"| {'fieldcompare time [s]':^21} |"
@@ -148,7 +152,7 @@ def _get_length_of_name(results: List[SystemtestResult]) -> int:
 
     for result in results:
         row = f"| {str(result.systemtest):<{max_name_length + 2}} "\
-            f"| {result.success:^7} "\
+            f"| {_success_status_symbol(result.success):^7} "\
             f"| {result.build_time:^17.1f} "\
             f"| {result.solver_time:^15.1f} "\
             f"| {result.fieldcompare_time:^21.1f} |"

From 2bacf6f6f35f98aa8fcbbae4264a41658bc20504 Mon Sep 17 00:00:00 2001
From: PranjalManhgaye <manhgayepranjal@gmail.com>
Date: Mon, 1 Jun 2026 19:15:41 +0530
Subject: [PATCH 11/11] Restore blank lines between system tests on GitHub
 Actions

Add an empty line after each test's PASS/FAIL output so logs stay
readable now that === banners are removed.
---
 tools/tests/systemtests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/tests/systemtests.py b/tools/tests/systemtests.py
index 10531b33d..42d695a84 100644
--- a/tools/tests/systemtests.py
+++ b/tools/tests/systemtests.py
@@ -137,6 +137,7 @@ def _group_end() -> None:
 
         print(f"{status_label} Finished {systemtest} in {elapsed_time:.1f}s", flush=True)
         print(f"[{number}/{total}] {systemtest}", flush=True)
+        print(flush=True)
         results.append(result)
 
     print(flush=True)