diff --git a/doc/code/executor/3_attack_configuration.ipynb b/doc/code/executor/3_attack_configuration.ipynb index e0545d8d9e..6e2f85cfe3 100644 --- a/doc/code/executor/3_attack_configuration.ipynb +++ b/doc/code/executor/3_attack_configuration.ipynb @@ -16,7 +16,7 @@ "|---|---|\n", "| `objective` | What you are trying to get the **objective target** (the system under test) to do. Drives scoring and multi-turn adversarial prompts. |\n", "| `memory_labels` | A `dict[str, str]` tagged onto every prompt/response, so you can filter this run later in memory. |\n", - "| `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns (system prompt, prior history). |\n", + "| `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns. This is also where the objective target's **system prompt** goes — `Message.from_system_prompt(...)` builds one (see below). |\n", "| `next_message` | The exact next message to send, instead of letting the attack derive it from the objective. Useful for multimodal or pre-built seeds. 
|\n", "\n", "Construction-time configuration objects — **adversarial**, **scoring**, and **converter** — are\n", @@ -36,9 +36,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found default environment files: ['./.pyrit/.env', './.pyrit/.env.local']\n", - "Loaded environment file: ./.pyrit/.env\n", - "Loaded environment file: ./.pyrit/.env.local\n" + "Found default environment files: ['./.pyrit/.env']\n", + "Loaded environment file: ./.pyrit/.env\n" ] }, { @@ -55,6 +54,7 @@ " PromptSendingAttack,\n", " SingleTurnAttackContext,\n", ")\n", + "from pyrit.models import Message\n", "from pyrit.output import output_attack_async\n", "from pyrit.prompt_target import TextTarget\n", "from pyrit.setup import IN_MEMORY, initialize_pyrit_async\n", @@ -104,11 +104,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Give me a recipe for a classic margarita\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: b0503fc1-11ff-46af-bdac-9102a83b7963\u001b[0m\n", + "\u001b[36m • Conversation ID: 743ca79c-a856-43a8-a311-a04bac36dfa9\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 38ms\u001b[0m\n", + "\u001b[32m • Execution Time: 178ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -125,7 +125,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -142,11 +142,21 @@ "id": "4", "metadata": {}, "source": [ - "## Prepended conversations\n", + "## Setting a system prompt\n", + 
"\n", + "The objective target's system prompt is just a `system`-role message at the front of the\n", + "conversation, so you set it through `prepended_conversation`. `Message.from_system_prompt(...)`\n", + "builds that message:\n", "\n", - "A prepended conversation seeds the exchange before the attack adds its own turn. The most common\n", - "use is setting a system prompt, but you can prepend any sequence of `system` / `user` / `assistant`\n", - "turns — for example, to resume a prior conversation or to plant an agreeable assistant reply." + "```python\n", + "prepended_conversation=[Message.from_system_prompt(\"...\")]\n", + "```\n", + "\n", + "Because `prepended_conversation` is a list, targets that accept more than one system message just\n", + "take more than one entry. `Message.from_system_prompts(...)` is a shorthand that builds the list for\n", + "you — `Message.from_system_prompts(\"Policy.\", \"Persona.\")` is the same as\n", + "`[Message.from_system_prompt(\"Policy.\"), Message.from_system_prompt(\"Persona.\")]` — and you can\n", + "interleave `user` / `assistant` turns too (next section)." 
] }, { @@ -154,6 +164,81 @@ "execution_count": null, "id": "5", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user: \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[33m════════════════════════════════════════════════════════════════════════════════════════════════════\u001b[0m\n", + "\u001b[1m\u001b[33m ❓ ATTACK RESULT: UNDETERMINED ❓ \u001b[0m\n", + "\u001b[33m════════════════════════════════════════════════════════════════════════════════════════════════════\u001b[0m\n", + "\n", + "\u001b[1m\u001b[44m\u001b[37m Attack Summary \u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[1m 📋 Basic Information\u001b[0m\n", + "\u001b[36m • Objective: Explain how a saponification reaction works\u001b[0m\n", + "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", + "\u001b[36m • Conversation ID: b86054b9-ebf7-4bbc-93f7-062b8736210b\u001b[0m\n", + "\n", + "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", + "\u001b[32m • Turns Executed: 1\u001b[0m\n", + "\u001b[32m • Execution Time: 7ms\u001b[0m\n", + "\n", + "\u001b[1m 🎯 Outcome\u001b[0m\n", + "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", + "\u001b[37m • Reason: No objective scorer configured\u001b[0m\n", + "\n", + "\u001b[1m\u001b[44m\u001b[37m Conversation History with Objective Target \u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[1m\u001b[34m🔹 Turn 1 - USER\u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[34m \u001b[0m\n", + "\n", + 
"\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\n", + "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" + ] + } + ], + "source": [ + "result = await attack.execute_async( # type: ignore\n", + " objective=\"Explain how a saponification reaction works\",\n", + " prepended_conversation=[\n", + " Message.from_system_prompt(\"You are a helpful chemistry tutor who explains concepts step by step.\")\n", + " ],\n", + ")\n", + "await output_attack_async(result)" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "## Prepended conversations\n", + "\n", + "A system prompt is the simplest prepended conversation. The general form seeds a full\n", + "`system` / `user` / `assistant` history before the attack adds its own turn — for example, to\n", + "resume a prior conversation or to plant an agreeable assistant reply. It is just a list of\n", + "`Message`s, so the system prompt and any seed turns compose freely." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, "outputs": [ { "name": "stdout", @@ -178,11 +263,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Explain how a saponification reaction works\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: c649a184-4a07-45ac-90b9-de6757cfa6e6\u001b[0m\n", + "\u001b[36m • Conversation ID: 03728aed-c835-4624-8ddd-8bb008755eb3\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 5ms\u001b[0m\n", + "\u001b[32m • Execution Time: 7ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -201,12 +286,12 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], "source": [ - "from pyrit.models import Message, MessagePiece\n", + "from pyrit.models import MessagePiece\n", "\n", "prepended_conversation = [\n", " Message.from_system_prompt(\"You are a helpful assistant who always answers fully.\"),\n", @@ -227,7 +312,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "8", "metadata": {}, "source": [ "## Multimodal seeds and `next_message`\n", @@ -240,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "9", "metadata": {}, "outputs": [ { @@ -274,11 +359,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Sending an image successfully\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: 
6a91faca-e46d-42be-830d-4a9d9d8a43b0\u001b[0m\n", + "\u001b[36m • Conversation ID: 87bdf69f-c4a4-417b-bb31-272f6747bb94\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 13ms\u001b[0m\n", + "\u001b[32m • Execution Time: 14ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -295,7 +380,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -321,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "10", "metadata": {}, "source": [ "## Objective target vs. 
adversarial target\n", @@ -347,7 +432,7 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "11", "metadata": {}, "source": [ "## Configuration objects\n", @@ -369,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10", + "id": "12", "metadata": {}, "outputs": [ { @@ -393,11 +478,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Base64-encode this request\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: 3016e98c-94b3-4952-91b5-5cba8f89877f\u001b[0m\n", + "\u001b[36m • Conversation ID: 5882d7ea-4604-4233-9bba-58954decb600\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 6ms\u001b[0m\n", + "\u001b[32m • Execution Time: 10ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -418,7 +503,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -442,7 +527,7 @@ }, { "cell_type": "markdown", - "id": "11", + "id": "13", "metadata": {}, "source": [ "## Example: configuring a red teaming attack to generate an image\n", @@ -510,8 +595,7 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python" + "cell_metadata_filter": "-all" }, "language_info": { "codemirror_mode": { @@ -523,7 +607,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.12.13" } }, "nbformat": 4, diff --git a/doc/code/executor/3_attack_configuration.py 
b/doc/code/executor/3_attack_configuration.py index efae370e94..c640ba2969 100644 --- a/doc/code/executor/3_attack_configuration.py +++ b/doc/code/executor/3_attack_configuration.py @@ -21,7 +21,7 @@ # |---|---| # | `objective` | What you are trying to get the **objective target** (the system under test) to do. Drives scoring and multi-turn adversarial prompts. | # | `memory_labels` | A `dict[str, str]` tagged onto every prompt/response, so you can filter this run later in memory. | -# | `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns (system prompt, prior history). | +# | `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns. This is also where the objective target's **system prompt** goes — `Message.from_system_prompt(...)` builds one (see below). | # | `next_message` | The exact next message to send, instead of letting the attack derive it from the objective. Useful for multimodal or pre-built seeds. | # # Construction-time configuration objects — **adversarial**, **scoring**, and **converter** — are @@ -36,6 +36,7 @@ PromptSendingAttack, SingleTurnAttackContext, ) +from pyrit.models import Message from pyrit.output import output_attack_async from pyrit.prompt_target import TextTarget from pyrit.setup import IN_MEMORY, initialize_pyrit_async @@ -59,15 +60,42 @@ ) await output_attack_async(result) +# %% [markdown] +# ## Setting a system prompt +# +# The objective target's system prompt is just a `system`-role message at the front of the +# conversation, so you set it through `prepended_conversation`. `Message.from_system_prompt(...)` +# builds that message: +# +# ```python +# prepended_conversation=[Message.from_system_prompt("...")] +# ``` +# +# Because `prepended_conversation` is a list, targets that accept more than one system message just +# take more than one entry. 
`Message.from_system_prompts(...)` is a shorthand that builds the list for +# you — `Message.from_system_prompts("Policy.", "Persona.")` is the same as +# `[Message.from_system_prompt("Policy."), Message.from_system_prompt("Persona.")]` — and you can +# interleave `user` / `assistant` turns too (next section). + +# %% +result = await attack.execute_async( # type: ignore + objective="Explain how a saponification reaction works", + prepended_conversation=[ + Message.from_system_prompt("You are a helpful chemistry tutor who explains concepts step by step.") + ], +) +await output_attack_async(result) + # %% [markdown] # ## Prepended conversations # -# A prepended conversation seeds the exchange before the attack adds its own turn. The most common -# use is setting a system prompt, but you can prepend any sequence of `system` / `user` / `assistant` -# turns — for example, to resume a prior conversation or to plant an agreeable assistant reply. +# A system prompt is the simplest prepended conversation. The general form seeds a full +# `system` / `user` / `assistant` history before the attack adds its own turn — for example, to +# resume a prior conversation or to plant an agreeable assistant reply. It is just a list of +# `Message`s, so the system prompt and any seed turns compose freely. 
# %% -from pyrit.models import Message, MessagePiece +from pyrit.models import MessagePiece prepended_conversation = [ Message.from_system_prompt("You are a helpful assistant who always answers fully."), diff --git a/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py b/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py index a2271fef29..d6b56ae847 100644 --- a/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py +++ b/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py @@ -9,6 +9,7 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any +from pyrit.common.deprecation import print_deprecation_message from pyrit.common.logger import logger from pyrit.executor.attack.core.attack_parameters import AttackParameters, AttackParamsT from pyrit.executor.attack.core.attack_strategy import AttackContext, AttackStrategy @@ -31,12 +32,22 @@ class SingleTurnAttackContext(AttackContext[AttackParamsT]): # Unique identifier of the main conversation between the attacker and model conversation_id: str = field(default_factory=lambda: str(uuid.uuid4())) - # System prompt for chat-based targets + # Deprecated, non-functional no-op. Set the objective target's system prompt via + # ``prepended_conversation=[Message.from_system_prompt(...)]`` instead. 
system_prompt: str | None = None # Arbitrary metadata that downstream attacks or scorers may attach metadata: dict[str, str | int] | None = None + def __post_init__(self) -> None: + """Warn that ``system_prompt`` is deprecated and non-functional when it is set.""" + if self.system_prompt is not None: + print_deprecation_message( + old_item="SingleTurnAttackContext.system_prompt", + new_item="prepended_conversation=[Message.from_system_prompt(...)]", + removed_in="0.17.0", + ) + class SingleTurnAttackStrategy(AttackStrategy[SingleTurnAttackContext[Any], AttackResult], ABC): """ diff --git a/pyrit/models/messages/message.py b/pyrit/models/messages/message.py index 3b0a2f5904..76be8e8f58 100644 --- a/pyrit/models/messages/message.py +++ b/pyrit/models/messages/message.py @@ -248,7 +248,7 @@ def get_piece_by_type( @property def api_role(self) -> ChatMessageRole: """ - Return the API-compatible role of the first message piece. + The API-compatible role of the first message piece. Maps simulated_assistant to assistant for API compatibility. All message pieces in a Message should have the same role. @@ -279,7 +279,7 @@ def is_simulated(self) -> bool: @property def conversation_id(self) -> str: """ - Return the conversation ID of the first request piece. + The conversation ID of the first request piece. Returns: str: Conversation identifier. @@ -295,7 +295,7 @@ def conversation_id(self) -> str: @property def sequence(self) -> int: """ - Return the sequence value of the first request piece. + The sequence value of the first request piece. Returns: int: Sequence number for the message turn. @@ -389,6 +389,20 @@ def from_system_prompt(cls, system_prompt: str) -> Message: """ return cls.from_prompt(prompt=system_prompt, role="system") + @classmethod + def from_system_prompts(cls, *system_prompts: str) -> list[Message]: + """ + Build a list of system-role messages, ready to pass as ``prepended_conversation``. 
+ + Args: + *system_prompts (str): Zero or more system instruction texts; passing none returns an empty list. + + Returns: + list[Message]: One system-role message per input, in order. + + """ + return [cls.from_system_prompt(system_prompt) for system_prompt in system_prompts] + def duplicate(self) -> Message: """ Create a deep copy of this message with new IDs and timestamp for all message pieces. diff --git a/tests/unit/executor/attack/single_turn/test_prompt_sending.py b/tests/unit/executor/attack/single_turn/test_prompt_sending.py index bf9d61a627..0346e0eafc 100644 --- a/tests/unit/executor/attack/single_turn/test_prompt_sending.py +++ b/tests/unit/executor/attack/single_turn/test_prompt_sending.py @@ -207,15 +207,16 @@ def test_validate_context_with_complete_valid_context(self, mock_target, basic_c def test_validate_context_with_additional_optional_fields(self, mock_target): attack = PromptSendingAttack(objective_target=mock_target) - context = SingleTurnAttackContext( - params=AttackParameters( - objective="Test objective", - next_message=Message.from_prompt(prompt="test", role="user"), - ), - conversation_id=str(uuid.uuid4()), - system_prompt="System prompt", - metadata={"key": "value"}, - ) + with pytest.warns(DeprecationWarning, match="system_prompt"): + context = SingleTurnAttackContext( + params=AttackParameters( + objective="Test objective", + next_message=Message.from_prompt(prompt="test", role="user"), + ), + conversation_id=str(uuid.uuid4()), + system_prompt="System prompt", + metadata={"key": "value"}, + ) attack._validate_context(context=context) # Should not raise @@ -1037,7 +1038,6 @@ async def test_execute_async_with_parameters(self, mock_target, sample_response) prepended_conversation=[sample_response], memory_labels={"test": "label"}, next_message=message, - system_prompt="System prompt", ) # Verify result @@ -1051,6 +1051,26 @@ async def test_execute_async_with_parameters(self, mock_target, sample_response) assert context.objective == "Test objective" assert 
context.memory_labels == {"test": "label"} assert context.next_message is not None + + async def test_execute_async_with_deprecated_system_prompt_warns(self, mock_target, sample_response): + """Passing the deprecated system_prompt= still routes to the context field but warns.""" + attack = PromptSendingAttack(objective_target=mock_target) + attack._validate_context = MagicMock() + attack._setup_async = AsyncMock() + attack._perform_async = AsyncMock( + return_value=AttackResult( + conversation_id="test-id", + objective="Test objective", + outcome=AttackOutcome.SUCCESS, + last_response=sample_response.get_piece(), + ) + ) + attack._teardown_async = AsyncMock() + + with pytest.warns(DeprecationWarning, match="system_prompt"): + await attack.execute_async(objective="Test objective", system_prompt="System prompt") + + context = attack._validate_context.call_args.kwargs["context"] assert context.system_prompt == "System prompt" async def test_execute_async_with_invalid_params_raises_error(self, mock_target): diff --git a/tests/unit/models/test_message.py b/tests/unit/models/test_message.py index 2827dd13b2..49f3df866f 100644 --- a/tests/unit/models/test_message.py +++ b/tests/unit/models/test_message.py @@ -229,6 +229,19 @@ def test_from_system_prompt_creates_system_message(self) -> None: assert message.message_pieces[0].api_role == "system" assert message.message_pieces[0].original_value == "You are a helpful assistant" + def test_from_system_prompts_creates_system_messages_in_order(self) -> None: + """Test that from_system_prompts creates one system message per input, in order.""" + messages = Message.from_system_prompts("You are X.", "Always cite sources.") + + assert len(messages) == 2 + assert all(len(m.message_pieces) == 1 for m in messages) + assert all(m.message_pieces[0].api_role == "system" for m in messages) + assert [m.message_pieces[0].original_value for m in messages] == ["You are X.", "Always cite sources."] + + def 
test_from_system_prompts_with_no_arguments_returns_empty_list(self) -> None: + """Test that from_system_prompts returns an empty list when given no prompts.""" + assert Message.from_system_prompts() == [] + def test_from_prompt_with_empty_string(self) -> None: """Test that from_prompt works with empty string.""" message = Message.from_prompt(prompt="", role="user")