import math
import re
from collections import Counter

import numpy as np

# Print floats with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
# Fixed seed so the random embeddings and projection weights are reproducible.
rng = np.random.default_rng(seed=42)

# Toy corpus of four short inspection/maintenance sentences; it is tokenized
# below and its tokens form the vocabulary.
corpus = [
    "The weld inspection found porosity near the edge.",
    "The inspection procedure requires visual review before final acceptance.",
    "Porosity and cracks require additional nondestructive testing.",
    "The maintenance log changed after the latest inspection policy update.",
]

def tokenize(text):
    """Split ``text`` into lowercase word and punctuation tokens.

    The text is lowercased first. Each run of ASCII letters becomes one
    token, and each of the characters ``. , ! ? ; :`` becomes its own token.
    Everything else (whitespace, digits, other symbols) is skipped.
    """
    lowered = text.lower()
    return [m.group(0) for m in re.finditer(r"[A-Za-z]+|[.,!?;:]", lowered)]

# Tokenize every corpus document once; show the first result as a sanity check.
tokenized_corpus = list(map(tokenize, corpus))
print(tokenized_corpus[0])

# Reserved entries take the lowest IDs; corpus tokens follow in sorted order.
special_tokens = ["<pad>", "<unk>"]
vocab_tokens = sorted(set().union(*tokenized_corpus))
vocab = dict(
    (tok, idx) for idx, tok in enumerate([*special_tokens, *vocab_tokens])
)
# Reverse mapping from ID back to token string.
id_to_token = dict(zip(vocab.values(), vocab.keys()))

sentence = "The weld requires inspection."
tokens = tokenize(sentence)
# Tokens missing from the vocabulary map to the <unk> ID.
token_ids = [vocab.get(word, vocab["<unk>"]) for word in tokens]

print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
print(f"Vocabulary size: {len(vocab)}")

embedding_dim = 6
# One randomly initialised row per vocabulary entry (mean 0, std 0.2).
embedding_table = rng.normal(loc=0, scale=0.2, size=(len(vocab), embedding_dim))
# Select the row for each token ID of the example sentence.
sequence_embeddings = embedding_table[token_ids, :]

print(f"Embedding table shape: {embedding_table.shape}")
print(f"Sequence embedding shape: {sequence_embeddings.shape}")
print(sequence_embeddings)

def sinusoidal_positions(seq_len, dim):
    """Build a ``(seq_len, dim)`` table of sinusoidal positional encodings.

    Columns are grouped in pairs that share one divisor,
    ``10000 ** (2 * (column // 2) / dim)``. Even columns hold the sine of
    ``position / divisor`` and odd columns hold the cosine.
    """
    row = np.arange(seq_len)[:, np.newaxis]
    column = np.arange(dim)[np.newaxis, :]
    divisor = np.power(10000, (2 * (column // 2)) / dim)
    theta = row / divisor

    table = np.zeros((seq_len, dim))
    even, odd = slice(0, None, 2), slice(1, None, 2)
    table[:, even] = np.sin(theta[:, even])
    table[:, odd] = np.cos(theta[:, odd])
    return table

# Add position information to the token embeddings element-wise.
positional = sinusoidal_positions(seq_len=len(token_ids), dim=embedding_dim)
transformer_input = np.add(sequence_embeddings, positional)

print(f"Positional encoding shape: {positional.shape}")
print(f"Input to first Transformer block shape: {transformer_input.shape}")

$$\text{scores} = \frac{Q K^{\top}}{\sqrt{d_k}}$$
$$\text{weights} = \operatorname{softmax}(\text{scores})$$
$$\text{output} = \text{weights} \cdot V$$

def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large inputs do not overflow; this does not change the result.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    normaliser = np.sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normaliser

d_model = embedding_dim
# Draw the three square projection matrices in order: query, key, value.
Wq, Wk, Wv = (rng.normal(0, 0.3, size=(d_model, d_model)) for _ in range(3))

# Project the same input three ways.
Q, K, V = (transformer_input @ W for W in (Wq, Wk, Wv))

# Scaled dot-product attention: similarity scores, row-wise softmax, then a
# weighted sum of the value vectors.
scores = np.matmul(Q, K.T) / math.sqrt(d_model)
attention_weights = softmax(scores, axis=-1)
attention_output = np.matmul(attention_weights, V)

print("Attention weight rows sum to:", attention_weights.sum(axis=-1))
print("Attention weights for each token:\n", attention_weights)
print(f"Attention output shape: {attention_output.shape}")

# Side-by-side summary of the three Transformer layouts: what each is for,
# example models, and typical tasks.
architecture_summary = {
    "encoder-only": {
        "purpose": "understand an input sequence",
        "examples": "BERT-style classifiers and embedding models",
        "tasks": "classification, search embeddings, document similarity",
    },
    "decoder-only": {
        "purpose": "generate the next token repeatedly",
        "examples": "GPT-style chat and code models",
        "tasks": "chat, writing, code completion, open-ended generation",
    },
    "encoder-decoder": {
        "purpose": "read one sequence and generate another",
        "examples": "T5, BART, original translation Transformer",
        "tasks": "translation, summarization, text-to-text transformation",
    },
}

# Print each architecture name in upper case, followed by its indented fields.
for name, info in architecture_summary.items():
    print(f"\n{name.upper()}")
    for key, value in info.items():
        print(f"  {key}: {value}")

# Ordered, pre-numbered steps of a typical LLM development pipeline.
llm_development = [
    "1. Pretrain a large Transformer on broad text/code with next-token prediction.",
    "2. Instruction tune on prompt-response examples.",
    "3. Align or preference tune for helpfulness and safety.",
    "4. Evaluate on benchmarks and realistic user tasks.",
    "5. Deploy with prompting, tools, retrieval, monitoring, and guardrails.",
]

# Print a header, then one step per line.
print("How many modern LLMs are built:\n")
for step in llm_development:
    print(step)

# Toy knowledge base for the RAG demo: each entry has a "source" label and the
# "text" that gets indexed and retrieved.
documents = [
    {"source": "Procedure A", "text": "Final weld acceptance requires visual inspection and NDT review."},
    {"source": "Policy B", "text": "Inspection records must include operator, date, and acceptance decision."},
    {"source": "Dataset Guide", "text": "Image datasets should be checked for class balance and missing labels."},
    {"source": "Maintenance Log", "text": "The porosity threshold was updated in last week's maintenance notice."},
]

def bow_vector(text, vocab_terms):
    """Return the bag-of-words count vector of ``text`` over ``vocab_terms``.

    The text is tokenized with ``tokenize``. Entry ``i`` of the returned
    float array is how often ``vocab_terms[i]`` occurs among the tokens;
    terms that never occur count as 0.
    """
    term_counts = Counter(tokenize(text))
    return np.fromiter((term_counts[term] for term in vocab_terms), dtype=float)

def cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    If either vector has zero norm the similarity is defined as 0.0, which
    avoids a division by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float((a @ b) / norm_product)

def build_rag_index(docs):
    """Build the retrieval index for ``docs`` (the offline/indexing step).

    Each document is treated as a single chunk. The index vocabulary is the
    sorted set of tokens across all chunk texts, and every chunk gets one
    bag-of-words row over that vocabulary.

    Returns a dict with keys ``"chunks"`` (the documents as given),
    ``"vocab_terms"`` (sorted token list) and ``"vectors"`` (one row per
    chunk).
    """
    # Tiny example: no splitting, so chunks are the documents themselves.
    unique_terms = set()
    for doc in docs:
        unique_terms.update(tokenize(doc["text"]))
    vocab_terms = sorted(unique_terms)

    rows = [bow_vector(doc["text"], vocab_terms) for doc in docs]
    return dict(chunks=docs, vocab_terms=vocab_terms, vectors=np.vstack(rows))

# Build the index once and report its size.
rag_index = build_rag_index(documents)
print(f"Indexed chunks: {len(rag_index['chunks'])}")
print(f"Vocabulary terms: {len(rag_index['vocab_terms'])}")

def retrieve(question, index, top_k=2):
    """Return the ``top_k`` chunks most similar to ``question``.

    The question is turned into a bag-of-words vector over the index
    vocabulary and compared to every chunk vector by cosine similarity.

    Returns a list of ``(score, chunk)`` pairs, highest score first.
    """
    query_vec = bow_vector(question, index["vocab_terms"])
    similarities = [cosine(query_vec, doc_vec) for doc_vec in index["vectors"]]
    # argsort is ascending, so reverse it to rank best-first before truncating.
    ranked = np.argsort(similarities)[::-1]
    return [(similarities[i], index["chunks"][i]) for i in ranked[:top_k]]

def build_prompt(question, retrieved):
    """Assemble the grounded prompt for ``question``.

    ``retrieved`` is an iterable of ``(score, chunk)`` pairs; only the
    chunk's ``"source"`` and ``"text"`` are used, one ``[source] text`` line
    per chunk. The prompt is an instruction line, the context lines, and the
    question.
    """
    context_lines = []
    for _score, chunk in retrieved:
        context_lines.append(f"[{chunk['source']}] {chunk['text']}")
    context = "\n".join(context_lines)
    return (
        "Use only the context below. If missing, say you do not know.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}"
    )

# Inference-time demo: retrieve the two best-matching chunks for one question,
# then wrap them into the prompt.
question = "What should final weld acceptance include?"
retrieved = retrieve(question, rag_index, top_k=2)
prompt = build_prompt(question, retrieved)

# Show each retrieved chunk with its similarity score and source label.
print("Retrieved chunks:")
for score, chunk in retrieved:
    print(f"  score={score:.3f} source={chunk['source']}: {chunk['text']}")

print("\nPrompt sent to the LLM:\n")
print(prompt)

# Transformers, RAG, and LLMs - code examples

## Setup

## Tokenization and token IDs

## Embeddings

## Positional encoding

## Self-attention mechanism

## Encoder, decoder, and encoder-decoder architectures

## How modern LLMs are built using Transformers

## The gap RAG fills

## RAG inference