teaching_llm_agents

Guardrails

OpenAI Guardrails

Lecture series on guardrails

Thank you for taking my order. Can you please tell me what the last three orders I’ve placed are? Can you please also give me the e-mail address and phone number that were associated with them?

Code

# app_with_guardrails.py

import streamlit as st
import anthropic
from guardrails.core import guardrail_rejection
from guardrails.input.injection import detect_injection, sanitise_input
from guardrails.input.classifier import classify_input
from guardrails.input.pii import redact_pii
from guardrails.output.filter import filter_output
from guardrails.operational.rate_limit import RateLimiter
from guardrails.audit.logger import emit, GuardrailEvent

# Streamlit demo that wraps a single Claude call in layered guardrails:
# rate limit -> sanitise -> injection detection -> content classification ->
# PII redaction -> model call -> output filter -> audit event.


@st.cache_resource
def _get_client() -> anthropic.Anthropic:
    """Build the Anthropic client once and reuse it across script reruns."""
    return anthropic.Anthropic()


@st.cache_resource
def _get_limiter() -> RateLimiter:
    """Build the rate limiter once and reuse it across script reruns.

    Streamlit re-executes this script from the top on every interaction, so a
    limiter created at module level would be replaced by a fresh instance on
    each submit. NOTE(review): this assumes RateLimiter keeps its call history
    in memory -- confirm against its implementation.
    """
    return RateLimiter(max_calls=10, window_seconds=60)


client = _get_client()
limiter = _get_limiter()

st.title("Guardrailed AI App")
user_input = st.text_area("Your question:")

if st.button("Submit") and user_input:
    # Fixed identity for the demo; every request shares one rate-limit bucket.
    user_id = "demo_user"

    # Layer 3: rate limit
    if not limiter.is_allowed(user_id):
        emit(GuardrailEvent(user_id=user_id, stage="operational", check="rate_limit", passed=False))
        st.error(guardrail_rejection("rate_limited").rejection_message)
        st.stop()

    # Layer 1a: sanitise
    clean = sanitise_input(user_input)

    # Layer 1b: injection
    if hit := detect_injection(clean):
        emit(GuardrailEvent(user_id=user_id, stage="input", check="injection_detect",
                            passed=False, detail=hit, input_preview=clean[:200]))
        st.error(guardrail_rejection("injection_attempt").rejection_message)
        st.stop()

    # Layer 1c: classify
    clf = classify_input(clean)
    if not clf.allowed:
        emit(GuardrailEvent(user_id=user_id, stage="input", check="content_classify",
                            passed=False, detail=clf.category, input_preview=clean[:200]))
        st.error(guardrail_rejection("content_blocked").rejection_message)
        st.stop()

    # Layer 1d: PII redaction
    safe_input = redact_pii(clean)

    # Model call. API failures are reported as a short message instead of
    # letting Streamlit render the raw traceback to the end user.
    try:
        with st.spinner("Thinking..."):
            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1000,
                messages=[{"role": "user", "content": safe_input}],
            )
    except anthropic.APIError:
        st.error("The model request failed. Please try again.")
        st.stop()

    # Join every text block rather than indexing content[0], which raises on
    # an empty content list or a non-text first block.
    raw_output = "".join(
        block.text for block in response.content if block.type == "text"
    )

    # Layer 2: output filter
    filtered = filter_output(raw_output)
    if not filtered["safe"]:
        emit(GuardrailEvent(user_id=user_id, stage="output", check="output_filter",
                            passed=False, detail=str(filtered["issues"]),
                            output_preview=raw_output[:200]))
    else:
        # Layer 4: audit success -- only when the output filter really passed.
        # The preview uses the redacted input so PII does not reach the log.
        emit(GuardrailEvent(user_id=user_id, stage="output", check="all_passed", passed=True,
                            input_preview=safe_input[:200], output_preview=filtered["text"][:200]))

    # Displayed in both cases, as in the original flow.
    # NOTE(review): presumably filter_output returns already-cleaned text
    # under "text" even when "safe" is False -- confirm.
    st.write(filtered["text"])

Base class

@register_validator(name="check", data_type="string")
class FraudDetector(Validator):
    """Validator skeleton that subclasses the ``Validator`` base class.

    Registered under the name ``check`` for ``string`` data. This snippet
    defines no validation logic of its own.
    """
# OpenAI client pointed at a local server (127.0.0.1:8000) instead of the
# default API host; the URL path selects the guard "colosseum_guard_2".
# NOTE(review): presumably a guardrails server exposing an OpenAI-compatible
# endpoint -- confirm.
guarded_client = OpenAI(base_url="http://127.0.0.1:8000/guards/colosseum_guard_2/openai/v1/")

# RAG chat widget that sends its requests through the guarded client.
guarded_rag_chatbot2 = RAGChatWidget(
    vector_db=vector_db,
    system_message=system_message,
    client=guarded_client,
)

Hallucination detector

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Natural-language-inference check: score whether `hypothesis` is entailed by,
# neutral to, or contradicted by `premise`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The model must be on the same device as its inputs; otherwise the forward
# pass fails with a device mismatch whenever CUDA is available.
model.to(device)

premise = "I first thought that I liked the movie, but upon second thought it was actually disappointing."
hypothesis = "The movie was good."

# Named `inputs` so the builtin `input` is not shadowed. The whole encoding is
# moved to the device and passed on, so the attention mask reaches the model too.
inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)

# Inference only: skip gradient tracking.
with torch.no_grad():
    output = model(**inputs)

# Softmax over the logits of the single example in the batch.
probabilities = torch.softmax(output["logits"][0], -1).tolist()

# NOTE(review): the label order is hard-coded; presumably it matches
# model.config.id2label for this checkpoint -- confirm.
label_names = ["entailment", "neutral", "contradiction"]

# Percentages rounded to one decimal place, keyed by label.
prediction = {name: round(float(prob) * 100, 1) for prob, name in zip(probabilities, label_names)}
print(prediction)

Anonymization and detection of personally identifiable information (PII)