graph TD
%% Shared node styles: light fill, dark 2px outline.
classDef yellow fill:#FFFDE7,stroke:#1A1A1A,stroke-width:2px;
classDef blue fill:#E3F2FD,stroke:#1A1A1A,stroke-width:2px;
classDef green fill:#E8F5E9,stroke:#1A1A1A,stroke-width:2px;
classDef purple fill:#F3E5F5,stroke:#1A1A1A,stroke-width:2px;
classDef orange fill:#FFF3E0,stroke:#1A1A1A,stroke-width:2px;
%% `teal` is applied to the Softmax node; without this definition that node
%% silently falls back to the default theme style.
classDef teal fill:#E0F2F1,stroke:#1A1A1A,stroke-width:2px;
%% Annotation boxes: amber dashed outline to set them apart from pipeline nodes.
classDef note fill:#FFF9C4,stroke:#FBC02D,stroke-dasharray:4;
subgraph "Input Layer"
%% Nodes for the input side: raw tokens, their embedding, and the positional encoding.
In["Input Tokens"]:::yellow
InEmb["Embedding"]:::orange
InPos["Positional Enc."]:::purple
%% Annotation box. Line breaks are written as explicit <br/> tags so the label
%% wraps the same way on renderers that collapse raw newlines in quoted labels.
Note1["Note: Positional Encoding adds<br/>sine/cosine waves to tokens<br/>so the model knows word order."]:::note
end
subgraph "Encoder Stack (Nx)"
%% Nodes of one encoder layer: attention and feed-forward, each followed by Add and Norm.
MHA1["Multi-Head Attention"]:::blue
AddNorm1["Add and Norm"]:::green
FFN1["Feed Forward"]:::orange
AddNorm2["Add and Norm"]:::green
%% Annotation box. Explicit <br/> keeps the formula on its own line on renderers
%% that collapse raw newlines in quoted labels.
Note2["Note: Attention Formula:<br/>Softmax(QK^T / sqrt(dk))V"]:::note
end
subgraph "Decoder Stack (Nx)"
%% Nodes of one decoder layer: masked attention, cross-attention, and feed-forward,
%% each followed by Add and Norm.
MMHA["Masked MHA"]:::blue
AddNorm3["Add and Norm"]:::green
EncDecAttn["Cross-Attention"]:::blue
AddNorm4["Add and Norm"]:::green
FFN2["Feed Forward"]:::orange
AddNorm5["Add and Norm"]:::green
%% Annotation box. Line breaks are explicit <br/> tags so the label wraps the same
%% way on renderers that collapse raw newlines in quoted labels.
Note3["Note: Cross-Attention lets the<br/>decoder look at the final<br/>encoder state."]:::note
end
%% Input path through the encoder layer.
In --> InEmb --> InPos --> MHA1 --> AddNorm1 --> FFN1 --> AddNorm2

%% Decoder-side input nodes (declared outside any subgraph).
Out["Outputs"]:::yellow
OutEmb["Embedding"]:::orange
OutPos["Positional Enc."]:::purple
Out --> OutEmb
OutEmb --> OutPos
OutPos --> MMHA
MMHA --> AddNorm3
AddNorm3 --> EncDecAttn

%% The last encoder node also feeds the decoder's cross-attention.
AddNorm2 --> EncDecAttn

%% Rest of the decoder layer, then the output head.
Lin["Linear"]:::purple
Soft["Softmax"]:::teal
Final["Probabilities"]:::yellow
EncDecAttn --> AddNorm4 --> FFN2 --> AddNorm5
AddNorm5 --> Lin
Lin --> Soft
Soft --> Final

%% Residual (skip) connections, drawn dotted: each one bypasses a sub-layer
%% and lands on the following Add and Norm node.
InPos -.-> AddNorm1 -.-> AddNorm2
OutPos -.-> AddNorm3 -.-> AddNorm4 -.-> AddNorm5