Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions .github/workflows/installation-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,19 @@ jobs:
cache-dependency-path: backend-agent/requirements.txt
- run: pip install -r backend-agent/requirements.txt

- name: Start server
- name: Start server and check health
run: |
cd backend-agent
DISABLE_AGENT=1 python main.py &
sleep 10

- name: Check server health
run: |
curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health
DISABLE_AGENT=1 DB_PATH=${RUNNER_TEMP}/data.db python main.py > server.log 2>&1 &
for i in {1..20}; do
sleep 1
status=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || true)
if [ "$status" -eq 200 ]; then
echo "Health check succeeded"
cat server.log
exit 0
fi
done
echo "Health check failed after waiting"
cat server.log
exit 1
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ venv/
ENV/
env.bak/
venv.bak/
venv310
cache

# Spyder project settings
.spyderproject
Expand Down
3 changes: 3 additions & 0 deletions backend-agent/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ API_KEY=super-secret-change-me
DEBUG=True

RESULT_SUMMARIZE_MODEL=gpt-4

# Database path
DB_PATH=path_to/database.db
57 changes: 57 additions & 0 deletions backend-agent/app/db/models.py
Comment thread
cabch marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()


class TargetModel(db.Model):
    """An LLM that can be targeted by the attacks in the suite."""
    __tablename__ = 'target_models'

    id = db.Column(db.Integer, primary_key=True)
    # Unique human-readable identifier of the model.
    name = db.Column(db.String, nullable=False, unique=True)
    # Optional free-form description.
    description = db.Column(db.String)


class Attack(db.Model):
    """A kind of attack that can be run against a target model."""
    __tablename__ = 'attacks'

    id = db.Column(db.Integer, primary_key=True)
    # Unique attack identifier (e.g. 'gptfuzz', 'artprompt').
    name = db.Column(db.String, unique=True, nullable=False)
    # Relative importance when aggregating scores; defaults to 1 on
    # both the Python side and the server side.
    weight = db.Column(db.Integer, default=1, server_default="1", nullable=False)  # noqa: E501


class SubAttack(db.Model):
    """A sub-attack that is part of a larger Attack."""
    __tablename__ = 'sub_attacks'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String, nullable=False)
    description = db.Column(db.String)
    # Parent attack this sub-attack belongs to.
    attack_id = db.Column(db.Integer, db.ForeignKey('attacks.id'), nullable=False)  # noqa: E501


class AttackResult(db.Model):
    """Outcome of a single attack run against one target model."""
    __tablename__ = 'attack_results'

    id = db.Column(db.Integer, primary_key=True)
    # NOTE(review): despite the name, this references target_models.id.
    attack_model_id = db.Column(db.Integer, db.ForeignKey('target_models.id'), nullable=False)  # noqa: E501
    attack_id = db.Column(db.Integer, db.ForeignKey('attacks.id'), nullable=False)  # noqa: E501
    # Whether at least one attack attempt succeeded.
    success = db.Column(db.Boolean, nullable=False)
    vulnerability_type = db.Column(db.String, nullable=True)
    # Free-form per-attack payload stored as a JSON column.
    details = db.Column(db.JSON, nullable=True)


class ModelAttackScore(db.Model):
    """Aggregated success statistics of one attack on one target model,
    i.e. the running totals of attempts and successes."""
    __tablename__ = 'model_attack_scores'

    id = db.Column(db.Integer, primary_key=True)
    # NOTE(review): despite the name, this references target_models.id.
    attack_model_id = db.Column(db.Integer, db.ForeignKey('target_models.id'), nullable=False)  # noqa: E501
    attack_id = db.Column(db.Integer, db.ForeignKey('attacks.id'), nullable=False)  # noqa: E501
    total_number_of_attack = db.Column(db.Integer, nullable=False)
    total_success = db.Column(db.Integer, nullable=False)

    # Exactly one aggregate row per (target model, attack) pair.
    __table_args__ = (
        db.UniqueConstraint('attack_model_id', 'attack_id', name='uix_model_attack'),  # noqa: E501
    )


db.configure_mappers()
91 changes: 91 additions & 0 deletions backend-agent/app/db/utils.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also in this file, please add some comments

Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import logging

from .models import (
Attack as AttackDB,
db,
TargetModel as TargetModelDB,
AttackResult as AttackResultDB,
ModelAttackScore as ModelAttackScoreDB,
)

from status import status

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(status.trace_logging)


# Persist the outcome of a single attack run into the database.
def save_to_db(attack_results: AttackResultDB) -> list[AttackResultDB]:
    """
    Persist the attack result into the database.

    Creates the referenced TargetModel and Attack rows on demand, adds
    an AttackResult record, and creates or updates the aggregate
    ModelAttackScore for the (target model, attack) pair.  All writes
    are committed in one transaction; on error the session is rolled
    back.

    NOTE(review): the annotation says AttackResultDB, but the argument
    is read through ``.attack`` / ``.success`` / ``.vulnerability_type``
    / ``.details`` attributes (the in-memory AttackResult dataclass) —
    confirm with callers and fix the annotation accordingly.

    Returns the list of records that were added (result + aggregate),
    or an empty list if nothing was saved.
    """
    inserted_records = []

    # Extract the fields to persist; names are normalized to lowercase
    # so lookups are case-insensitive.  Guard against NULLable fields:
    # vulnerability_type and details may be None (see models.py).
    attack_name = attack_results.attack.lower()
    success = attack_results.success
    vulnerability_type = (attack_results.vulnerability_type or '').lower()
    details = attack_results.details or {}  # JSON column
    target_name = details.get('target_model', '').lower()

    # Without a target model name the result cannot be attributed to
    # any model, so skip saving instead of crashing.
    if not target_name:
        logger.info("Skipping result: missing target model name.")
        return []

    # Get-or-create the target model.
    target_model = TargetModelDB.query.filter_by(name=target_name).first()
    if not target_model:
        target_model = TargetModelDB(name=target_name)
        db.session.add(target_model)
        db.session.flush()  # assign target_model.id before referencing it

    # Get-or-create the attack, with the default weight of 1.
    attack = AttackDB.query.filter_by(name=attack_name).first()
    if not attack:
        attack = AttackDB(name=attack_name, weight=1)
        db.session.add(attack)
        db.session.flush()  # assign attack.id before referencing it

    # Record the individual attack result.
    db_record = AttackResultDB(
        attack_model_id=target_model.id,
        attack_id=attack.id,
        success=success,
        vulnerability_type=vulnerability_type,
        details=details,
    )
    db.session.add(db_record)
    inserted_records.append(db_record)

    # Create or update the aggregate score for this (model, attack)
    # pair; totals accumulate across runs.
    model_attack_score = ModelAttackScoreDB.query.filter_by(
        attack_model_id=target_model.id,
        attack_id=attack.id
    ).first()
    if not model_attack_score:
        model_attack_score = ModelAttackScoreDB(
            attack_model_id=target_model.id,
            attack_id=attack.id,
            total_number_of_attack=details.get('total_attacks', 0),
            total_success=details.get('number_successful_attacks', 0)
        )
    else:
        model_attack_score.total_number_of_attack += details.get('total_attacks', 0)  # noqa: E501
        model_attack_score.total_success += details.get('number_successful_attacks', 0)  # noqa: E501
    db.session.add(model_attack_score)
    inserted_records.append(model_attack_score)

    # Commit the session to save all changes to the database,
    # or roll back if an error occurs.
    try:
        db.session.commit()
        logger.info("Results successfully saved to the database.")
        return inserted_records
    except Exception as e:
        db.session.rollback()
        logger.error("Error while saving to the database: %s", e)
        return []
32 changes: 21 additions & 11 deletions backend-agent/attack.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
from argparse import Namespace
from dataclasses import asdict
import json
import os
import logging
import os
from argparse import Namespace
from dataclasses import asdict

from app.db.utils import save_to_db
from attack_result import AttackResult, SuiteResult
from libs.artprompt import start_artprompt, \
OUTPUT_FILE as artprompt_out_file
from libs.codeattack import start_codeattack, \
OUTPUT_FILE as codeattack_out_file
from libs.gptfuzz import perform_gptfuzz_attack, \
OUTPUT_FILE as gptfuzz_out_file
from libs.promptmap import start_prompt_map, \
OUTPUT_FILE as prompt_map_out_file
from libs.artprompt import (
OUTPUT_FILE as artprompt_out_file,
start_artprompt,
)
from libs.codeattack import (
OUTPUT_FILE as codeattack_out_file,
start_codeattack,
)
from libs.gptfuzz import (
OUTPUT_FILE as gptfuzz_out_file,
perform_gptfuzz_attack,
)
from libs.promptmap import (
OUTPUT_FILE as prompt_map_out_file,
start_prompt_map,
)
from libs.pyrit import start_pyrit_attack
from llm import LLM
from status import Trace
Expand Down Expand Up @@ -247,6 +256,7 @@ def run(self, summarize_by_llm: bool = False) -> SuiteResult:
summary = self.summarize_attack_result(result)
result.details['summary'] = summary
full_result.append(result)
save_to_db(result)
return SuiteResult(full_result)

def summarize_attack_result(self, attack_result: AttackResult) -> str:
Expand Down
11 changes: 8 additions & 3 deletions backend-agent/libs/artprompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@
import pandas as pd
from nltk.corpus import stopwords

from app.db.utils import save_to_db
from attack_result import AttackResult
from llm import LLM
from status import status, Step
from status import Step, status


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -483,14 +485,17 @@ def start_artprompt(target_model: LLM,
logger.info(f'Write results to output file {outfile}')
with open(outfile, 'w') as f:
json.dump(evaluations_res, f, indent=4)

return AttackResult(
result = AttackResult(
'artprompt',
successful_attacks > 0,
'prompt-injection',
{
'target_model': target_model.model_name,
'total_attacks': num_samples,
'number_successful_attacks': successful_attacks,
'successful_attacks': successful_attacks_list,
'attack_description': DESCRIPTION
}
)
save_to_db(result)
return result
11 changes: 8 additions & 3 deletions backend-agent/libs/codeattack.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from codeattack.post_processing import PostProcessor
from codeattack.target_llm import TargetLLM

from app.db.utils import save_to_db
from attack_result import AttackResult
from llm import LLM
from status import status, Step
from status import Step, status


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -131,7 +132,6 @@ def start_codeattack(target_model: LLM,
prompts = random.sample(prompts,
min(int(num_prompts), len(prompts)))
logger.debug(f'Run {len(prompts)} prompt attacks')

output_file = parameters.get('output_file', OUTPUT_FILE)
data_key = f'code_wrapped_{prompt_type}'

Expand Down Expand Up @@ -204,16 +204,21 @@ def start_codeattack(target_model: LLM,
# # Write results to file
with open(output_file, 'w') as f:
json.dump(successful_attacks_list, f)
return AttackResult(

result = AttackResult(
'codeattack',
successful_attacks > 0,
'prompt-injection',
{
'target_model': target_model.model_name,
'total_attacks': len(prompts),
'number_successful_attacks': successful_attacks,
'successful_attacks': successful_attacks_list,
'attack_description': DESCRIPTION
}
)
save_to_db(result)
return result


def _prompt_attack(data, target_llm, post_processor, judge_llm, data_key=''):
Expand Down
17 changes: 11 additions & 6 deletions backend-agent/libs/gptfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pandas as pd
from dotenv import load_dotenv

from gptfuzzer.fuzzer.core import GPTFuzzer
from gptfuzzer.fuzzer.mutator import (MutateRandomSinglePolicy,
OpenAIMutatorCrossOver,
Expand All @@ -15,9 +16,10 @@
from gptfuzzer.llm import LLM, OpenAILLM
from gptfuzzer.utils.predict import RoBERTaPredictor

from app.db.utils import save_to_db
from attack_result import AttackResult
from llm import LLM as AgentLLM
from status import status, Step
from status import Step, status

load_dotenv()

Expand Down Expand Up @@ -163,13 +165,16 @@ def perform_gptfuzz_attack(mutate_model: LLM,
with Step('Running Fuzzer'):
fuzzer.run()
logger.info('Fuzzer finished')
return AttackResult(
result = AttackResult(
'gptfuzz',
fuzzer.current_jailbreak > 0,
'jailbreak',
details={
'result_file': output_file,
'query_count': fuzzer.current_query,
'attack_description': DESCRIPTION
{
Comment thread
cabch marked this conversation as resolved.
'target_model': target_model.llm.model_name,
'total_attacks': fuzzer.current_query,
'number_successful_attacks': fuzzer.current_jailbreak,
'attack_description': DESCRIPTION,
}
)
save_to_db(result)
return result
11 changes: 8 additions & 3 deletions backend-agent/libs/promptmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
import logging
import re

from app.db.utils import save_to_db
from attack_result import AttackResult
from llm import LLM
from status import status, Step
from status import Step, status


COUNT_PROMPTS = 2
Expand Down Expand Up @@ -305,7 +306,7 @@ def start_prompt_map(target_model: LLM, parameters: dict) -> AttackResult:
target_system_prompt)
except Exception as e:
logger.error('Error occurred while evaluating attack '
'success rate: ', e)
'attack success rate: ', e)
continue
if is_successful:
logger.info('* Prompt attack successful!')
Expand Down Expand Up @@ -338,12 +339,16 @@ def start_prompt_map(target_model: LLM, parameters: dict) -> AttackResult:
# Write results to file
with open(output_file, 'w') as f:
json.dump(successful_attacks_json, f)
return AttackResult(
result = AttackResult(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also here the target model is not returned as part of the attack result

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to be modified once promptmap works again in the agent

'promptmap',
security_failed > 0,
'prompt-injection',
{
'total_attacks': total_attack_count,
'number_successful_attacks': len(successful_attacks),
'successful_attacks': successful_attacks_json,
'attack_description': DESCRIPTION
}
)
save_to_db(result)
return result
Loading
Loading