diff --git a/Genel-1/cross-attn_llm.ipynb b/Genel-1/cross-attn_llm.ipynb index 74bf4a9..cd5560d 100644 --- a/Genel-1/cross-attn_llm.ipynb +++ b/Genel-1/cross-attn_llm.ipynb @@ -519,7 +519,7 @@ " total_loss += loss.item() * accumulation_steps\n", " \n", " avg_loss = total_loss / len(train_loader)\n", - " print(f\"Epoch [{epoch+1}/{CONFIG['num_epochs']}] Ortalama Loss: {avg_loss:.4f}\")\n", + " print(f\"Epoch [{epoch+1}/{CONFIG['num_epochs']}] Average Loss: {avg_loss:.4f}\")\n", "\n", "# Run the model\n", "if __name__ == \"__main__\":\n", @@ -548,4 +548,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/Genel-1/moe.ipynb b/Genel-1/moe.ipynb index 3ebcaa7..0de701f 100644 --- a/Genel-1/moe.ipynb +++ b/Genel-1/moe.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,94 +11,70 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#RMSNorm, girdiyi normalize eder ve stabilite sağlar\n", - "# Burada, girdinin elemanlarının karelerinin ortalamasını alıp karekökü ile bölüyoruz.\n", + "# RMSNorm normalizes the input and ensures stability\n", + "# Here we divide the input by the square root of the sum of its squared elements.\n", "#\n", "\n", "def normalize(x):\n", - " return x/ np.sqrt(np.sum(x**2) + 1e-6) # 1e-8, sıfıra bölmeyi önlemek için eklenir." + " return x / np.sqrt(np.sum(x**2) + 1e-6) # 1e-6 is added to prevent division by zero." ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "metadata": {}, - "outputs": [ { "data": { "text/plain": [ "array([0.13483997, 0.26967994, 0.40451991, 0.53935989, 0.67419986])" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], + "outputs": [], "source": [ "normalize(np.array([1,2,3,4,5]))" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "metadata": {}, - "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Girdi:\n", " [[ 1.10517535 -1.1437946 -0.6105434 0.61707477 -1.25953267 -1.60542219\n", " -1.12414634 -0.09881562]]\n", "MoE Çıktısı:\n", " [[ 0.3041326 0.3533763 0.05777129 0.5359059 -0.3078998 -0.07377402\n", " -0.33160865 0.53093004]]\n" ] } ], + "outputs": [], "source": [ "# Mixture of Experts\n", "class MixtureOfExperts:\n", " def __init__(self, num_experts, hidden_dim):\n", - " # Uzmanların (experts) sayısını ve her birinin boyutunu belirtiyoruz.\n", + " # Define the number of experts and the dimensionality of each one.\n", " self.num_experts = num_experts\n", - " # Her uzman için rastgele ağırlık matrisi oluşturuyoruz.\n", + " # Create a random weight matrix for each expert.\n", " self.experts = [np.random.randn(hidden_dim, hidden_dim) for _ in range(num_experts)]\n", "\n", " def route(self, x):\n", - " # Router (yönlendirici), hangi uzmanların kullanılacağını seçer.\n", - " # Basit bir örnek uygulaması: Uzmanlara rastgele skor veriyoruz.\n", - " scores = np.random.rand(self.num_experts) # Her uzman için rastgele skor oluştur.\n", - " top_k_indices = np.argsort(scores)[-2:] # En yüksek skora sahip 2 uzmanı seç.\n", + " # The router decides which experts will be used.\n", + " # Simple example implementation: assign random scores to the experts.\n", + " scores = np.random.rand(self.num_experts) # Generate a random score for each expert.\n", + " top_k_indices = 
np.argsort(scores)[-2:] # Select the two experts with the highest scores.\n", " return top_k_indices\n", "\n", " def forward(self, x):\n", - " # Router tarafından seçilen uzmanlardan geçiş yapıyoruz.\n", - " selected_experts = self.route(x) # Seçilen uzmanların indekslerini alıyoruz.\n", - " output = 0 # Başlangıç değeri\n", + " # Iterate through the experts selected by the router.\n", + " selected_experts = self.route(x) # Retrieve the indices of the selected experts.\n", + " output = 0 # Initial value\n", " for idx in selected_experts:\n", - " # Seçilen uzmanlardan sırasıyla geçiş yapıyoruz.\n", - " output += np.dot(x, self.experts[idx]) # Uzmanın ağırlık matrisi ile çarp.\n", - " return normalize(output) # Sonucu normalize ederek döndür.\n", + " # Iterate through the chosen experts in order.\n", + " output += np.dot(x, self.experts[idx]) # Multiply by the expert's weight matrix.\n", + " return normalize(output) # Return the normalized result.\n", "\n", - "# Örnek kullanım\n", + "# Example usage\n", "hidden_dim = 8\n", - "x = np.random.randn(1, hidden_dim) # Tek bir giriş örneği (1 satır, 8 boyut).\n", - "moe = MixtureOfExperts(num_experts=4, hidden_dim=hidden_dim) # 4 uzmanlı sistem oluşturuyoruz.\n", - "output = moe.forward(x) # Girdiyi MoE'den geçiriyoruz.\n", + "x = np.random.randn(1, hidden_dim) # Single input example (1 row, 8 dimensions).\n", + "moe = MixtureOfExperts(num_experts=4, hidden_dim=hidden_dim) # Create a system with 4 experts.\n", + "output = moe.forward(x) # Pass the input through the MoE.\n", "\n", - "print(\"Girdi:\\n\", x)\n", - "print(\"MoE Çıktısı:\\n\", output)" + "print(\"Input:\\n\", x)\n", + "print(\"MoE Output:\\n\", output)" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -108,20 +84,20 @@ " self.num_tokens = num_tokens\n", " self.hidden_size = hidden_size\n", " \n", - " self.weights = np.random.rand(num_experts, num_tokens) # Expertlerin ağırlıkları\n", - " self.bias = np.random.rand(num_experts) # Expertlerin biasları\n", + " self.weights = np.random.rand(num_experts, num_tokens) # Expert weights\n", + " self.bias = np.random.rand(num_experts) # Expert biases\n", " \n", - " def route(x):\n", + " def route(self, x):\n", - " scores = np.dot(self.weights, x) + self.bias # Expertlerin skorları\n", - " scores = normalize(scores) # Skorları normalize ediyoruz.\n", + " scores = np.dot(self.weights, x) + self.bias # Expert scores\n", + " scores = normalize(scores) # Normalize the scores.\n", - " topk_incdice = np.argsort(scores)[-self.hidden_size:]\n", + " topk_indices = np.argsort(scores)[-self.hidden_size:]\n", - " return topk_incdice # En yüksek skorları döndürüyoruz.\n", + " return topk_indices # Return the indices of the highest scores.\n", " " ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -134,11 +110,11 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Basit bir veri kümesi (örnek metinler ve etiketler)\n", + "# Simple dataset (example texts and labels)\n", "data = [\n", " (\"I love this product\", 1),\n", " (\"This is amazing\", 1),\n", @@ -148,11 +124,11 @@ " (\"Absolutely awful\", 0),\n", "]\n", "\n", - "# Metinleri vektörlere dönüştürmek için bir kelime dağarcığı\n", + "# Vocabulary to convert texts into vectors\n", "vocab = {word: idx for idx, word in enumerate(set(\" \".join([d[0] for d in data]).split()))}\n", "vocab_size = len(vocab)\n", "\n", - "# Veriyi sayısal hale getiren bir fonksiyon\n", + "# Function that converts 
data to numeric form\n", "def text_to_vector(text, vocab):\n", " vector = [vocab[word] for word in text.split()]\n", " return vector\n", @@ -170,44 +146,33 @@ " def __getitem__(self, idx):\n", " return self.data[idx]\n", "\n", - "# Collate fonksiyonu: Dataları aynı uzunluğa getirir.\n", + "# Collate function: brings all data to the same length.\n", "def collate_fn(batch):\n", " texts, labels = zip(*batch)\n", - " padded_texts = pad_sequence(texts, batch_first=True, padding_value=0) # Pad ile aynı uzunluğa getiriliyor.\n", + " padded_texts = pad_sequence(texts, batch_first=True, padding_value=0) # Pad to achieve equal length.\n", " labels = torch.tensor(labels)\n", " return padded_texts, labels\n", "\n", - "# DataLoader'da collate_fn kullanımı\n", + "# Using collate_fn in the DataLoader\n", "dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)\n", "\n", - "# Veri kümesi ve DataLoader\n", + "# Dataset and DataLoader\n", "dataset = TextDataset(data, vocab)\n", "dataloader = DataLoader(dataset, batch_size=2, shuffle=True)" ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "<__main__.TextDataset at 0x2313b0f3e10>" - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dataset" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -219,20 +184,20 @@ " self.gate = nn.Linear(embed_dim, num_experts)\n", "\n", " def forward(self, x):\n", - " # Uzman skorlarını hesapla\n", + " # Compute expert scores\n", " gate_scores = torch.softmax(self.gate(x), dim=-1) # [batch_size, num_experts]\n", " \n", - " # Uzmanların çıktıları\n", + " # Expert outputs\n", " expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1) # [batch_size, embed_dim, num_experts]\n", " \n", - " # Kapılı uzman karışımı\n", + " # Gated mixture of experts\n", " output = torch.sum(expert_outputs * gate_scores.unsqueeze(1), dim=-1) # [batch_size, embed_dim]\n", " return output" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -261,18 +226,18 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_padding_mask(input_tensor, padding_value=0):\n", - " # Girdi tensöründe doldurma yapılan yerleri (0 değerini) bulur.\n", + " # Find where padding (value 0) occurs in the input tensor.\n", " return (input_tensor == padding_value)" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -282,81 +247,64 @@ " self.embedding = nn.Embedding(vocab_size, embed_dim)\n", " self.transformer = TransformerBlock(embed_dim, num_heads, ff_hidden_dim)\n", " self.moe = MixtureOfExperts(num_experts, embed_dim)\n", - " self.classifier = nn.Linear(embed_dim, 2) # İki sınıf (pozitif ve negatif)\n", + " self.classifier = nn.Linear(embed_dim, 2) # Two classes (positive and negative)\n", "\n", " def forward(self, x):\n", " # Embedding\n", " x = self.embedding(x) # [batch_size, seq_len, embed_dim]\n", - " x = x.permute(1, 0, 2) # Transformer için [seq_len, batch_size, embed_dim]\n", + " x = x.permute(1, 0, 2) # For the transformer: [seq_len, batch_size, embed_dim]\n", " \n", - " # Padding maskesi\n", + " # Padding mask\n", " padding_mask = 
create_padding_mask(x.permute(1, 0, 2)[:, :, 0]) # [batch_size, seq_len]\n", " \n", - " # Transformer Bloğu\n", + " # Transformer block\n", " x = self.transformer(x, padding_mask=padding_mask)\n", " x = x.permute(1, 0, 2) # [batch_size, seq_len, embed_dim]\n", " \n", " # MoE\n", - " x = self.moe(x.mean(dim=1)) # Sekansın ortalaması alınır (basitlik için)\n", + " x = self.moe(x.mean(dim=1)) # Take the mean of the sequence (for simplicity)\n", " \n", - " # Sınıflandırma\n", + " # Classification\n", " logits = self.classifier(x)\n", " return logits" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class TransformerWithMoE(nn.Module):\n", " def __init__(self, vocab_size, embed_dim, num_heads, ff_hidden_dim, num_experts):\n", " super().__init__()\n", - " self.embedding = nn.Embedding(vocab_size, embed_dim) # vocab_size, sözlük boyutuyla eşleşmeli\n", + " self.embedding = nn.Embedding(vocab_size, embed_dim) # vocab_size must match the vocabulary size\n", " self.transformer = TransformerBlock(embed_dim, num_heads, ff_hidden_dim)\n", " self.moe = MixtureOfExperts(num_experts, embed_dim)\n", - " self.classifier = nn.Linear(embed_dim, 2) # İki sınıf (pozitif ve negatif)\n", + " self.classifier = nn.Linear(embed_dim, 2) # Two classes (positive and negative)\n", "\n", " def forward(self, x):\n", " # Embedding\n", " x = self.embedding(x) # [batch_size, seq_len, embed_dim]\n", - " x = x.permute(1, 0, 2) # Transformer için [seq_len, batch_size, embed_dim]\n", + " x = x.permute(1, 0, 2) # For the transformer: [seq_len, batch_size, embed_dim]\n", " \n", - " # Transformer Bloğu\n", + " # Transformer block\n", " x = self.transformer(x)\n", " x = x.permute(1, 0, 2) # [batch_size, seq_len, embed_dim]\n", " \n", " # Mixture of Experts (MoE)\n", - " x = self.moe(x.mean(dim=1)) # Sekansın ortalaması alınır (basitlik için)\n", + " x = self.moe(x.mean(dim=1)) # Take the mean of the sequence (for simplicity)\n", " \n", - " # Sınıflandırma\n", + " # Classification\n", " logits = self.classifier(x)\n", " return logits" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1, Loss: 2.1026\n", - "Epoch 2, Loss: 2.0711\n", - "Epoch 3, Loss: 2.0545\n", - "Epoch 4, Loss: 2.0393\n", - "Epoch 5, Loss: 2.0245\n", - "Epoch 6, Loss: 2.0096\n", - "Epoch 7, Loss: 1.9945\n", - "Epoch 8, Loss: 1.9789\n", - "Epoch 9, Loss: 1.9626\n", - "Epoch 10, Loss: 1.9452\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from torch.nn.utils.rnn import pad_sequence\n", @@ -384,7 +332,7 @@ "# Update your dataloader to use the custom collate function\n", "dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)\n", "\n", - "# Eğitim döngüsü\n", + "# Training loop\n", "for epoch in range(num_epochs):\n", " model.train()\n", " total_loss = 0\n", @@ -400,46 +348,31 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'absolutely'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[93], line 20\u001b[0m\n\u001b[0;32m 17\u001b[0m vocab[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\u001b[39m\u001b[38;5;124m'\u001b[39m] 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(vocab) \u001b[38;5;66;03m# anahtarını sözlüğe ekliyoruz\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# Metinleri vektörlere dönüştür ve padding işlemi yap\u001b[39;00m\n\u001b[1;32m---> 20\u001b[0m test_vectors \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext_to_vector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtest_data\u001b[49m\u001b[43m]\u001b[49m\n\u001b[0;32m 21\u001b[0m padded_test_vectors \u001b[38;5;241m=\u001b[39m pad_sequence(test_vectors, batch_first\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, padding_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m) \u001b[38;5;66;03m# Pad ile aynı uzunlukta yap\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(padded_test_vectors)\n", - "Cell \u001b[1;32mIn[93], line 20\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 17\u001b[0m vocab[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(vocab) \u001b[38;5;66;03m# anahtarını sözlüğe ekliyoruz\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# Metinleri vektörlere dönüştür ve padding işlemi yap\u001b[39;00m\n\u001b[1;32m---> 20\u001b[0m test_vectors \u001b[38;5;241m=\u001b[39m [torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[43mtext_to_vector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m test_data]\n\u001b[0;32m 21\u001b[0m padded_test_vectors \u001b[38;5;241m=\u001b[39m pad_sequence(test_vectors, batch_first\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, padding_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m) \u001b[38;5;66;03m# Pad ile aynı uzunlukta yap\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(padded_test_vectors)\n", - "Cell \u001b[1;32mIn[85], line 17\u001b[0m, in \u001b[0;36mtext_to_vector\u001b[1;34m(text, vocab)\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mtext_to_vector\u001b[39m(text, vocab):\n\u001b[1;32m---> 17\u001b[0m vector \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mword\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m vector\n", - "Cell \u001b[1;32mIn[85], line 17\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mtext_to_vector\u001b[39m(text, vocab):\n\u001b[1;32m---> 17\u001b[0m vector 
\u001b[38;5;241m=\u001b[39m [\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit()]\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m vector\n", - "\u001b[1;31mKeyError\u001b[0m: 'absolutely'" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from torch.nn.utils.rnn import pad_sequence\n", "\n", - "# Test verileri\n", + "# Test data\n", "test_data = [\n", - " \"I absolutely love this\", # Pozitif\n", - " \"This is the worst\", # Negatif\n", - " \"Fantastic product\", # Pozitif\n", - " \"Not good at all\", # Negatif\n", - " \"I am very happy\", # Pozitif\n", - " \"Completely disappointed\" # Negatif\n", + " \"I absolutely love this\", # Positive\n", + " \"This is the worst\", # Negative\n", + " \"Fantastic product\", # Positive\n", + " \"Not good at all\", # Negative\n", + " \"I am very happy\", # Positive\n", + " \"Completely disappointed\" # Negative\n", "]\n", "\n", - "# Metinleri vektörlere dönüştürmek için bir fonksiyon\n", - "# Kelime dağarcığı (vocab) oluşturuluyor\n", + "# Function to convert texts into vectors\n", + "# Building the vocabulary\n", "vocab = {word: idx for idx, word in enumerate(set(\" \".join([d[0] for d in data]).split()))}\n", - "vocab[''] = len(vocab) # anahtarını sözlüğe ekliyoruz\n", + "vocab[''] = len(vocab) # Add the token to the vocabulary\n", "\n", - "# Metinleri vektörlere dönüştür ve padding işlemi yap\n", + "# Convert texts into vectors and apply padding\n", "test_vectors = [torch.tensor(text_to_vector(text, vocab)) for text in test_data]\n", - "padded_test_vectors = pad_sequence(test_vectors, batch_first=True, padding_value=0) # Pad ile aynı uzunlukta yap\n", + "padded_test_vectors = pad_sequence(test_vectors, batch_first=True, padding_value=0) # Pad to make them the same length\n", "\n", "print(padded_test_vectors)" ] @@ -448,57 +381,12 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Kelime 'absolutely' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'the' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'worst' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'Fantastic' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'Not' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'good' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'at' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'all' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'am' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'very' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'happy' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'Completely' sözlükte yok, '' olarak işlenecek.\n", - "Kelime 'disappointed' sözlükte yok, '' olarak işlenecek.\n", - "Padded Test Vektörleri:\n", - "tensor([[ 1, 12, 11, 4],\n", - " [ 5, 3, 12, 12],\n", - " [12, 2, 0, 0],\n", - " [12, 12, 12, 12],\n", - " [ 1, 12, 12, 12],\n", - " [12, 12, 0, 0]])\n" - ] - }, - { - "ename": "IndexError", - "evalue": "index out of range in self", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[78], line 65\u001b[0m\n\u001b[0;32m 62\u001b[0m predictions \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 64\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mno_grad(): \u001b[38;5;66;03m# Modelin parametrelerini güncelleme\u001b[39;00m\n\u001b[1;32m---> 65\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpadded_test_vectors\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Tüm test girdilerini modele ver\u001b[39;00m\n\u001b[0;32m 66\u001b[0m predicted_classes \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39margmax(outputs, dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m# Sınıf tahminlerini al\u001b[39;00m\n\u001b[0;32m 67\u001b[0m predictions \u001b[38;5;241m=\u001b[39m predicted_classes\u001b[38;5;241m.\u001b[39mtolist() \u001b[38;5;66;03m# Liste formatına dönüştür\u001b[39;00m\n", - "File \u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - 
"Cell \u001b[1;32mIn[64], line 54\u001b[0m, in \u001b[0;36mTransformerWithMoE.forward\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 53\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[1;32m---> 54\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 55\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfc(x\u001b[38;5;241m.\u001b[39mmean(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n", - "File \u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", - "File 
\u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\sparse.py:190\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\functional.py:2551\u001b[0m, in \u001b[0;36membedding\u001b[1;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[0;32m 2545\u001b[0m \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[0;32m 2546\u001b[0m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[0;32m 2547\u001b[0m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[0;32m 2548\u001b[0m \u001b[38;5;66;03m# torch.embedding_renorm_\u001b[39;00m\n\u001b[0;32m 2549\u001b[0m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[0;32m 2550\u001b[0m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[1;32m-> 2551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[1;31mIndexError\u001b[0m: index out of range in self" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from torch.nn.utils.rnn import pad_sequence\n", "\n", - "# Örnek eğitim verileri (sözlük oluşturmak için)\n", + "# Sample training data (used to build the vocabulary)\n", "data = [\n", " (\"I love this product\", 1),\n", " (\"This is amazing\", 1),\n", @@ -508,11 +396,11 @@ 
" (\"Absolutely awful\", 0),\n", "]\n", "\n", - "# Kelime dağarcığı (vocab) oluşturuluyor\n", + "# Building the vocabulary\n", "vocab = {word: idx for idx, word in enumerate(set(\" \".join([d[0] for d in data]).split()))}\n", - "vocab[''] = len(vocab) # anahtarını ekliyoruz\n", + "vocab[''] = len(vocab) # Add the token\n", "\n", - "# Test verileri\n", + "# Test data\n", "test_data = [\n", " \"I absolutely love this\",\n", " \"This is the worst\",\n", @@ -522,48 +410,48 @@ " \"Completely disappointed\"\n", "]\n", "\n", - "# Test kelimelerini kontrol et\n", + "# Check the test words\n", "for text in test_data:\n", " for word in text.split():\n", " if word not in vocab:\n", - " print(f\"Kelime '{word}' sözlükte yok, '' olarak işlenecek.\")\n", + " print(f\"Word '{word}' is not in the vocabulary and will be processed as ''.\")\n", "\n", - "# Metinleri vektörlere dönüştürmek için bir fonksiyon\n", + "# Function to convert texts into vectors\n", "def text_to_vector(text, vocab, unk_token=''):\n", " return [vocab.get(word, vocab[unk_token]) for word in text.split()]\n", "\n", - "# Metinleri vektörlere dönüştür ve padding işlemi yap\n", + "# Convert texts into vectors and apply padding\n", "test_vectors = [torch.tensor(text_to_vector(text, vocab)) for text in test_data]\n", "\n", - "# Sözlük boyutunu kontrol et\n", + "# Check the vocabulary size\n", "vocab_size = len(vocab)\n", "\n", - "# Tüm indekslerin sınır içinde olup olmadığını kontrol et\n", + "# Ensure all indices stay within bounds\n", "for vector in test_vectors:\n", " if any(idx >= vocab_size for idx in vector):\n", - " print(f\"Hata! Bu vektör {vector} sözlük boyutunun dışına çıkıyor.\")\n", + " print(f\"Error! This vector {vector} exceeds the vocabulary size.\")\n", "\n", - "# Padding işlemi yap\n", + "# Apply padding\n", "padded_test_vectors = pad_sequence(test_vectors, batch_first=True, padding_value=0)\n", "\n", - "# Padding sonrası kontrol et\n", + "# Check after padding\n", "for vector in padded_test_vectors:\n", " if any(idx >= vocab_size for idx in vector):\n", - " print(f\"Hata! Bu vektör {vector} sözlük boyutunun dışına çıkıyor.\")\n", + " print(f\"Error! 
This vector {vector} exceeds the vocabulary size.\")\n", "\n", - "print(\"Padded Test Vektörleri:\")\n", + "print(\"Padded Test Vectors:\")\n", "print(padded_test_vectors)\n", "\n", - "# Modeli değerlendirme\n", - "model.eval() # Modeli değerlendirme moduna al\n", + "# Model evaluation\n", + "model.eval() # Put the model into evaluation mode\n", "predictions = []\n", "\n", - "with torch.no_grad(): # Modelin parametrelerini güncelleme\n", - " outputs = model(padded_test_vectors) # Tüm test girdilerini modele ver\n", - " predicted_classes = torch.argmax(outputs, dim=-1) # Sınıf tahminlerini al\n", - " predictions = predicted_classes.tolist() # Liste formatına dönüştür\n", + "with torch.no_grad(): # Do not update model parameters\n", + " outputs = model(padded_test_vectors) # Feed all test inputs to the model\n", + " predicted_classes = torch.argmax(outputs, dim=-1) # Get the class predictions\n", + " predictions = predicted_classes.tolist() # Convert to list format\n", "\n", - "# Sonuçları yazdır\n", + "# Print the results\n", "for text, prediction in zip(test_data, predictions):\n", " result = \"Positive\" if prediction == 1 else \"Negative\"\n", " print(f\"Text: \\\"{text}\\\" -> Prediction: {result}\")" @@ -591,4 +479,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/Genel-3/DAPO.ipynb b/Genel-3/DAPO.ipynb index bc232c6..b2418fd 100644 --- a/Genel-3/DAPO.ipynb +++ b/Genel-3/DAPO.ipynb @@ -6,48 +6,39 @@ "metadata": {}, "source": [ "1. Clip-Higher\n", - "Clip-Higher, düşük olasılıklı token'ların olasılığını artırarak entropi çökmesini önlemek için kullanılır. Bu, modelin daha çeşitli cevaplar üretmesini sağlar." + "Clip-Higher increases the probability of low-probability tokens to prevent entropy collapse, enabling the model to produce more diverse answers." 
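+ ,
+ "\n",
+ "\n",
+ "For reference, a minimal sketch of the decoupled clipping idea behind the name (a hypothetical helper, with illustrative epsilon values): the token importance ratio is clipped to `[1 - eps_low, 1 + eps_high]` with `eps_high > eps_low`, leaving more headroom for upward updates on low-probability tokens than a symmetric clip would:\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "def clip_higher(ratio, eps_low=0.2, eps_high=0.28):\n",
+ "    # Asymmetric clip: more room upward than downward, so rare\n",
+ "    # tokens can still gain probability mass during updates.\n",
+ "    return np.clip(ratio, 1.0 - eps_low, 1.0 + eps_high)\n",
+ "```\n"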
] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "e70a1bf6-a628-4768-86f1-2ad4e84ac7dc", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Orijinal Olasılıklar: [0.9 0.05 0.06]\n", - "Soft Clip sonrası Olasılıklar: [0.30000369 0.19288612 0.19197462]\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "\n", "def soft_clip(probabilities, clip_low=0.2, clip_high=0.3, steepness=20):\n", " \"\"\"\n", - " Yumuşak clip fonksiyonu:\n", - " - clip_low altında kalan değerler neredeyse clip_low'ya çekilir.\n", - " - clip_high üzerinde kalan değerler yavaşça bastırılır.\n", + " Soft clip function:\n", + " - Values below clip_low are softly pulled toward clip_low.\n", + " - Values above clip_high are gradually suppressed.\n", " \"\"\"\n", - " # Yumuşak minimum\n", + " # Soft minimum\n", " low_adjusted = clip_low + (probabilities - clip_low) / (1 + np.exp(-steepness*(probabilities - clip_low)))\n", - " # Yumuşak maksimum\n", + " # Soft maximum\n", " high_adjusted = clip_high + (probabilities - clip_high) / (1 + np.exp(steepness*(probabilities - clip_high)))\n", - " # İki aşamada da uygulayarak dengeli sonuç elde ediyoruz\n", + " # Applying it in two stages yields balanced results\n", " clipped = np.where(probabilities < clip_low, low_adjusted, probabilities)\n", " clipped = np.where(probabilities > clip_high, high_adjusted, clipped)\n", " return clipped\n", "\n", - "# Örnek olasılık dağılımı\n", + "# Example probability distribution\n", "probabilities = np.array([0.9, 0.05, 0.06])\n", "clipped_probabilities = soft_clip(probabilities)\n", "\n", - "print(\"Orijinal Olasılıklar:\", probabilities)\n", - "print(\"Soft Clip sonrası Olasılıklar:\", clipped_probabilities)" + "print(\"Original probabilities:\", probabilities)\n", + "print(\"Probabilities after soft clip:\", clipped_probabilities)" ] }, { @@ -55,39 +46,31 @@ "id": "444b9ec8-0a19-4132-80c1-0c77b12362ed", "metadata": {}, "source": [ - "2. Dinamik Örnekleme\n", - "Dinamik Örnekleme, sıfır gradyanlı örnekleri filtreleyerek eğitim verimliliğini artırır." + "2. Dynamic sampling\n", + "Dynamic sampling filters out zero-gradient samples to improve training efficiency." 
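+ ,
+ "\n",
+ "\n",
+ "Concretely (a sketch of the idea, with hypothetical helper names): under group-relative advantages, a prompt whose sampled rollouts all receive the same reward (all correct or all wrong) has zero advantage everywhere and therefore contributes no gradient, so it can be dropped and resampled:\n",
+ "\n",
+ "```python\n",
+ "def has_gradient_signal(rewards):\n",
+ "    # All-equal rewards give zero group-relative advantage,\n",
+ "    # hence a zero policy gradient for the whole group.\n",
+ "    return max(rewards) != min(rewards)\n",
+ "\n",
+ "groups = {\"prompt_a\": [1, 1, 1, 1], \"prompt_b\": [1, 0, 0, 1]}\n",
+ "kept = {p: r for p, r in groups.items() if has_gradient_signal(r)}  # keeps prompt_b\n",
+ "```\n"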
] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "32c04381-58ec-4cde-a4fb-63678bb007a8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Filtrelenmiş Örnekler: [(0.4, 0), (0.55, None)]\n" - ] - } - ], + "outputs": [], "source": [ - "# Örnek örnek seti: (olasılık, etiket) şeklinde\n", + "# Example sample set: in the form (probability, label)\n", "samples = [(0.9, 1), (0.1, 0), (0.8, 1), (0.4, 0), (0.55, None)]\n", "\n", "def dynamic_sample_filter(samples, lower_bound=0.2, upper_bound=0.8):\n", " \"\"\"\n", - " Dinamik örnekleme: \n", - " Sadece modelin kararsız kaldığı (olasılık değeri belirsiz aralıkta olan) örnekleri seçer.\n", + " Dynamic sampling: \n", + " Selects only the samples where the model is uncertain (probabilities fall in an ambiguous range).\n", " \"\"\"\n", - " # Eğer etiket None ise sadece modelin tahminine göre filtre uyguluyoruz\n", + " # If the label is None, filter solely based on the model prediction\n", " filtered = [sample for sample in samples if lower_bound < sample[0] < upper_bound]\n", " return filtered\n", "\n", "filtered_samples = dynamic_sample_filter(samples)\n", - "print(\"Filtrelenmiş Örnekler:\", filtered_samples)" + "print(\"Filtered samples:\", filtered_samples)" ] }, { @@ -96,33 +79,25 @@ "metadata": {}, "source": [ "3. Token-Level Policy Gradient Loss\n", - "Bu teknik, her bir token'ın kaybını ayrı ayrı hesaplayarak uzun cümlelerdeki öğrenmeyi daha etkili hale getirir." + "This technique computes the loss for each token individually, making learning more effective for long sentences." ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "13841a52-33aa-40b2-bc33-0c15e59ad312", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Token Düzeyinde Ağırlıklı Kayıp: 0.25333333333333335\n" - ] - } - ], + "outputs": [], "source": [ - "# Örnek token dizisi ve token kayıpları (her token için hata)\n", + "# Example token sequence and token losses (error per token)\n", "tokens = [\"token1\", \"token2\", \"token3\"]\n", "losses = [0.1, 0.4, 0.2]\n", "\n", "def token_policy_gradient_loss(tokens, losses, weights=None):\n", " \"\"\"\n", - " Token-level kayıp hesaplama:\n", - " - Eğer weights belirtilmemişse tüm tokenlar eşit ağırlıkta kabul edilir.\n", - " - Ağırlıklı ortalama hesaplanır.\n", + " Token-level loss calculation:\n", + " - If weights are not specified, all tokens are treated equally.\n", + " - The weighted average is computed.\n", " \"\"\"\n", " if weights is None:\n", " weights = [1.0] * len(tokens)\n", @@ -130,10 +105,10 @@ " total_weight = sum(weights)\n", " return weighted_loss / total_weight if total_weight != 0 else 0\n", "\n", - "# Örneğin, daha kritik tokenlara yüksek ağırlık verilebilir:\n", + "# For example, more critical tokens can receive higher weights:\n", "weights = [0.8, 1.2, 1.0]\n", "token_level_loss = token_policy_gradient_loss(tokens, losses, weights)\n", - "print(\"Token Düzeyinde Ağırlıklı Kayıp:\", token_level_loss)" + "print(\"Token-level weighted loss:\", token_level_loss)" ] }, { @@ -142,31 +117,23 @@ "metadata": {}, "source": [ "4. Overlong Reward Shaping\n", - "Bu teknik, çok uzun örnekler için ödül gürültüsünü azaltarak eğitim sürecini stabilize eder." + "This technique stabilizes training for very long examples by reducing reward noise." 
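+ ,
+ "\n",
+ "\n",
+ "One caveat on the demo below: with `math.exp(-steepness * (length - max_length))` the penalty term is positive, so the shaped reward actually grows past `max_length` (a length of 250 yields roughly 1.99 from a base reward of 1). If the intent is a reward that decays toward `base_reward - 1` for overlong outputs, the sign inside the exponential should be flipped; a minimal sketch:\n",
+ "\n",
+ "```python\n",
+ "import math\n",
+ "\n",
+ "def shaped_reward_decay(length, base_reward=1, max_length=200, steepness=0.1):\n",
+ "    # Sigmoidal penalty running from 0 toward -1 once length exceeds max_length.\n",
+ "    if length <= max_length:\n",
+ "        return base_reward\n",
+ "    penalty = -1 + 2 / (1 + math.exp(steepness * (length - max_length)))\n",
+ "    return base_reward + penalty\n",
+ "\n",
+ "print(shaped_reward_decay(250))  # ~0.013, approaching 0 for very long outputs\n",
+ "```\n"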
] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "975b304b-d078-4188-8559-a6abc37b97e0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Şekillendirilmiş Ödül: 1.9866142981514305\n" - ] - } - ], + "outputs": [], "source": [ "import math\n", "\n", "def shaped_reward(length, base_reward=1, max_length=200, steepness=0.1):\n", " \"\"\"\n", " Overlong Reward Shaping:\n", - " - Uzunluk eşik değerini aşarsa, sigmoidal bir ceza uygulanır.\n", - " - Bu, uzunluk arttıkça cezanın yavaşça derinleşmesini sağlar.\n", + " - If the length exceeds the threshold, apply a sigmoidal penalty.\n", + " - This slowly increases the penalty as the length grows.\n", " \"\"\"\n", " if length <= max_length:\n", " return base_reward\n", @@ -174,64 +141,18 @@ " penalty = -1 + 2 / (1 + math.exp(-steepness * (length - max_length)))\n", " return base_reward + penalty\n", "\n", - "# Örnek uzunluk ve ödül\n", + "# Example length and reward\n", "length = 250\n", "reward = shaped_reward(length)\n", - "print(\"Şekillendirilmiş Ödül:\", reward)" + "print(\"Shaped reward:\", reward)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "214a95e9-3a07-4ba4-a5ba-f6c1b55db122", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting datasets\n", - " Using cached datasets-3.4.1-py3-none-any.whl.metadata (19 kB)\n", - "Requirement already satisfied: filelock in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (3.13.1)\n", - "Requirement already satisfied: numpy>=1.17 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (1.26.4)\n", - "Requirement already satisfied: pyarrow>=15.0.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (16.1.0)\n", - "Requirement already satisfied: dill<0.3.9,>=0.3.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (0.3.8)\n", - "Requirement already satisfied: pandas in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (2.2.2)\n", - "Requirement already satisfied: requests>=2.32.2 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.66.3 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (4.66.5)\n", - "Collecting xxhash (from datasets)\n", - " Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)\n", - "Collecting multiprocess<0.70.17 (from datasets)\n", - " Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)\n", - "Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.6.1)\n", - "Requirement already satisfied: aiohttp in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (3.10.5)\n", - "Requirement already satisfied: huggingface-hub>=0.24.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (0.29.3)\n", - "Requirement already satisfied: packaging in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (24.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from datasets) (6.0.1)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from aiohttp->datasets) (2.4.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from 
aiohttp->datasets) (1.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from aiohttp->datasets) (23.1.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from aiohttp->datasets) (1.4.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from aiohttp->datasets) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from aiohttp->datasets) (1.11.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from huggingface-hub>=0.24.0->datasets) (4.11.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from requests>=2.32.2->datasets) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from requests>=2.32.2->datasets) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from requests>=2.32.2->datasets) (2.2.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from requests>=2.32.2->datasets) (2024.12.14)\n", - "Requirement already satisfied: colorama in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from tqdm>=4.66.3->datasets) (0.4.6)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from pandas->datasets) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from pandas->datasets) (2024.1)\n", - "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from pandas->datasets) (2023.3)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\emreq\\anaconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", - "Using cached datasets-3.4.1-py3-none-any.whl (487 kB)\n", - "Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)\n", - "Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)\n", - "Installing collected packages: xxhash, multiprocess, datasets\n", - "Successfully installed datasets-3.4.1 multiprocess-0.70.16 xxhash-3.5.0\n" - ] - } - ], + "outputs": [], "source": [ "!pip install datasets" ] @@ -241,43 +162,7 @@ "execution_count": null, "id": "92d23e63-d9e5-4a23-9a63-3f6bb5fbe891", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5a779653fa9c405c89d346e046d679e0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Map: 0%| | 0/2000 [00:00\"):\n", " \"\"\"\n", - " Yama tokenlarını özel bir ayırıcıyla birleştirir.\n", + " Combines patch tokens with a special separator.\n", " \n", " Args:\n", - " patch_tokens (list): Her yamanın token listesi.\n", - " separator_token (str): Yamaları ayırmak için kullanılacak token.\n", + " patch_tokens (list): Token list for each patch.\n", + " separator_token (str): Token used to separate patches.\n", " \n", " Returns:\n", - " list: Birleştirilmiş token dizisi.\n", + " list: Combined token sequence.\n", " \"\"\"\n", " combined = []\n", " for i, tokens in enumerate(patch_tokens):\n", " combined.extend(tokens)\n", - " if i < len(patch_tokens) - 1: # Son yamada ayırıcı ekleme\n", + " if i < len(patch_tokens) - 1: # Add a separator after 
each patch except the last\n", " combined.append(separator_token)\n", " return combined\n", "\n", - "# Tokenları birleştir\n", + "# Combine the tokens\n", "combined_tokens = combine_tokens_with_separator(patch_tokens)\n", - "print(f\"Birleştirilmiş token sayısı: {len(combined_tokens)}\")" + "print(f\"Number of combined tokens: {len(combined_tokens)}\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "f985056f-ecbf-40c8-b338-17c89b6d0863", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Üretilen Açıklama:\n", - " Ortalama renk RGB(107, 126, 130), Ortalama renk RGB(109, 127, 131), Ortalama renk RGB(111, 129, 133), Ortalama renk RGB(113, 131, 135), Ortalama renk RGB(113, 130, 134), Ortalama renk RGB(115, 133, 137), Ortalama renk RGB(119, 137, 140), Ortalama renk RGB(122, 137, 140), Ortalama renk RGB(125, 135, 138), Ortalama renk RGB(133, 136, 138), Ortalama renk RGB(144, 138, 138), Ortalama renk RGB(150, 137, 136), Ortalama renk RGB(155, 135, 133), Ortalama renk RGB(149, 124, 122), Ortalama renk RGB(149, 124, 122), Ortalama renk RGB(149, 130, 128), Ortalama renk RGB(150, 137, 134), Ortalama renk RGB(153, 140, 138), Ortalama renk RGB(154, 136, 134), Ortalama renk RGB(154, 132, 129), Ortalama renk RGB(153, 129, 127), Ortalama renk RGB(152, 128, 126), Ortalama renk RGB(152, 130, 128), Ortalama renk RGB(147, 130, 128), Ortalama renk RGB(138, 121, 120), Ortalama renk RGB(134, 117, 116), Ortalama renk RGB(133, 117, 116), Ortalama renk RGB(132, 119, 117), Ortalama renk RGB(122, 102, 102), Ortalama renk RGB(122, 97, 98), Ortalama renk RGB(131, 110, 112), Ortalama renk RGB(143, 132, 133), Ortalama renk RGB(149, 149, 149), Ortalama renk RGB(153, 159, 159), Ortalama renk RGB(169, 175, 174), Ortalama renk RGB(188, 194, 192), Ortalama renk RGB(186, 192, 190), Ortalama renk RGB(185, 191, 190), Ortalama renk RGB(168, 175, 175), Ortalama renk RGB(167, 175, 175), Ortalama renk RGB(176, 183, 183), Ortalama renk RGB(175, 182, 182), Ortalama renk RGB(172, 178, 177), Ortalama renk RGB(171, 175, 174), Ortalama renk RGB(163, 167, 165), Ortalama renk RGB(166, 169, 166), Ortalama renk RGB(165, 168, 165), Ortalama renk RGB(165, 168, 165), Ortalama renk RGB(160, 164, 162), Ortalama renk RGB(162, 165, 163), Ortalama renk RGB(160, 165, 164), Ortalama renk RGB(157, 162, 161), Ortalama renk RGB(156, 161, 160), Ortalama renk RGB(155, 160, 159), Ortalama renk RGB(152, 158, 157), Ortalama renk RGB(148, 154, 153), Ortalama renk RGB(141, 147, 147), Ortalama renk RGB(134, 139, 140), Ortalama renk RGB(125, 131, 132), Ortalama renk RGB(116, 121, 123), Ortalama renk RGB(110, 115, 116), Ortalama renk RGB(99, 102, 102), Ortalama renk RGB(100, 100, 97), Ortalama renk RGB(100, 96, 91)\n", - "Yama 1: Ortalama renk RGB(163, 179, 181), Ortalama renk RGB(159, 176, 179), Ortalama renk RGB(153, 172, 177), Ortalama renk RGB(150, 170, 175), Ortalama renk RGB(150, 171, 176), Ortalama renk RGB(156, 175, 179), Ortalama renk RGB(168, 185, 187), Ortalama renk RGB(180, 194, 194), Ortalama renk RGB(186, 199, 198), Ortalama renk RGB(189, 201, 200), Ortalama renk RGB(190, 201, 200), Ortalama renk RGB(193, 201, 199), Ortalama renk RGB(192, 199, 196), Ortalama renk RGB(189, 194, 191), Ortalama renk RGB(184, 188, 185), Ortalama renk RGB(174, 175, 173), Ortalama renk RGB(159, 159, 159), Ortalama renk RGB(148, 146, 147), Ortalama renk RGB(140, 137, 138), Ortalama renk RGB(132, 127, 130), Ortalama renk RGB(114, 106, 108), Ortalama renk RGB(99, 89, 92), Ortalama renk 
RGB(97, 89, 92), Ortalama renk RGB(94, 88, 91), Ortalama renk RGB(91, 84, 87), Ortalama renk RGB(92, 84, 87), Ortalama renk RGB(95, 85, 87), Ortalama renk RGB(102, 91, 91), Ortalama renk RGB(108, 95, 94), Ortalama renk RGB(111, 98, 96), Ortalama renk RGB(116, 102, 99), Ortalama renk RGB(115, 99, 97), Ortalama renk RGB(114, 96, 94), Ortalama renk RGB(115, 97, 95), Ortalama renk RGB(120, 106, 103), Ortalama renk RGB(124, 117, 114), Ortalama renk RGB(125, 124, 121), Ortalama renk RGB(132, 128, 123), Ortalama renk RGB(124, 123, 120), Ortalama renk RGB(120, 121, 119), Ortalama renk RGB(119, 121, 120), Ortalama renk RGB(122, 124, 123), Ortalama renk RGB(128, 129, 128), Ortalama renk RGB(137, 138, 137), Ortalama renk RGB(145, 145, 143), Ortalama renk RGB(143, 144, 143), Ortalama renk RGB(135, 138, 136), Ortalama renk RGB(132, 134, 132), Ortalama renk RGB(128, 129, 127), Ortalama renk RGB(129, 129, 127), Ortalama renk RGB(130, 130, 127), Ortalama renk RGB(127, 127, 123), Ortalama renk RGB(128, 128, 123), Ortalama renk RGB(123, 124, 119), Ortalama renk RGB(118, 119, 113), Ortalama renk RGB(114, 115, 109), Ortalama renk RGB(111, 112, 106), Ortalama renk RGB(106, 107, 101), Ortalama renk RGB(106, 107, 102), Ortalama renk RGB(117, 118, 112), Ortalama renk RGB(124, 123, 115), Ortalama renk RGB(115, 113, 105), Ortalama renk RGB(79, 70, 65), Ortalama renk RGB(55, 41, 39)\n", - "Yama 2: Ortalama renk RGB(98, 92, 85), Ortalama renk RGB(97, 91, 83), Ortalama renk RGB(101, 93, 83), Ortalama renk RGB(100, 91, 81), Ortalama renk RGB(98, 88, 78), Ortalama renk RGB(95, 84, 75), Ortalama renk RGB(91, 81, 73), Ortalama renk RGB(88, 80, 73), Ortalama renk RGB(77, 70, 64), Ortalama renk RGB(67, 62, 57), Ortalama renk RGB(56, 52, 49), Ortalama renk RGB(63, 58, 54), Ortalama renk RGB(63, 54, 50), Ortalama renk RGB(63, 51, 46), Ortalama renk RGB(55, 49, 45), Ortalama renk RGB(63, 54, 50), Ortalama renk RGB(71, 60, 53), Ortalama renk RGB(72, 61, 53), Ortalama renk RGB(70, 60, 55), Ortalama renk RGB(71, 60, 54), Ortalama renk RGB(66, 56, 49), Ortalama renk RGB(52, 44, 41), Ortalama renk RGB(55, 46, 40), Ortalama renk RGB(53, 45, 39), Ortalama renk RGB(53, 46, 41), Ortalama renk RGB(81, 72, 64), Ortalama renk RGB(96, 89, 81), Ortalama renk RGB(97, 92, 84), Ortalama renk RGB(109, 102, 92), Ortalama renk RGB(126, 117, 104), Ortalama renk RGB(150, 137, 120), Ortalama renk RGB(153, 141, 125), Ortalama renk RGB(160, 150, 135), Ortalama renk RGB(160, 150, 135), Ortalama renk RGB(160, 149, 133), Ortalama renk RGB(159, 148, 133), Ortalama renk RGB(156, 143, 126), Ortalama renk RGB(152, 140, 125), Ortalama renk RGB(94, 86, 79), Ortalama renk RGB(41, 41, 41), Ortalama renk RGB(31, 31, 31), Ortalama renk RGB(30, 29, 30), Ortalama renk RGB(30, 31, 31), Ortalama renk RGB(32, 32, 33), Ortalama renk RGB(37, 36, 36), Ortalama renk RGB(56, 52, 50), Ortalama renk RGB(61, 58, 57), Ortalama renk RGB(56, 54, 54), Ortalama renk RGB(50, 50, 50), Ortalama renk RGB(44, 44, 46), Ortalama renk RGB(38, 39, 42), Ortalama renk RGB(39, 40, 41), Ortalama renk RGB(41, 42, 43), Ortalama renk RGB(54, 54, 54), Ortalama renk RGB(69, 66, 63), Ortalama renk RGB(81, 76, 72), Ortalama renk RGB(94, 88, 81), Ortalama renk RGB(103, 97, 90), Ortalama renk RGB(109, 103, 96), Ortalama renk RGB(112, 107, 99), Ortalama renk RGB(122, 114, 103), Ortalama renk RGB(126, 119, 108), Ortalama renk RGB(127, 121, 112), Ortalama renk RGB(131, 126, 117)\n", - "Yama 3: Ortalama renk RGB(55, 39, 35), Ortalama renk RGB(58, 42, 35), Ortalama renk RGB(50, 42, 37), Ortalama renk RGB(50, 
43, 41), Ortalama renk RGB(52, 45, 43), Ortalama renk RGB(48, 47, 45), Ortalama renk RGB(49, 49, 48), Ortalama renk RGB(60, 61, 58), Ortalama renk RGB(64, 65, 62), Ortalama renk RGB(62, 62, 59), Ortalama renk RGB(55, 56, 53), Ortalama renk RGB(56, 56, 54), Ortalama renk RGB(54, 53, 51), Ortalama renk RGB(52, 53, 49), Ortalama renk RGB(61, 61, 58), Ortalama renk RGB(70, 69, 65), Ortalama renk RGB(77, 76, 72), Ortalama renk RGB(80, 79, 74), Ortalama renk RGB(80, 79, 74), Ortalama renk RGB(79, 78, 73), Ortalama renk RGB(75, 75, 69), Ortalama renk RGB(70, 70, 65), Ortalama renk RGB(67, 66, 61), Ortalama renk RGB(62, 62, 57), Ortalama renk RGB(57, 57, 52), Ortalama renk RGB(53, 54, 48), Ortalama renk RGB(53, 52, 46), Ortalama renk RGB(53, 52, 45), Ortalama renk RGB(56, 54, 47), Ortalama renk RGB(58, 56, 48), Ortalama renk RGB(63, 59, 51), Ortalama renk RGB(73, 69, 59), Ortalama renk RGB(80, 76, 66), Ortalama renk RGB(82, 79, 69), Ortalama renk RGB(86, 82, 72), Ortalama renk RGB(86, 83, 72), Ortalama renk RGB(85, 81, 71), Ortalama renk RGB(84, 80, 70), Ortalama renk RGB(85, 81, 71), Ortalama renk RGB(82, 78, 68), Ortalama renk RGB(72, 67, 58), Ortalama renk RGB(70, 63, 53), Ortalama renk RGB(74, 64, 52), Ortalama renk RGB(74, 63, 51), Ortalama renk RGB(74, 60, 48), Ortalama renk RGB(77, 63, 51), Ortalama renk RGB(81, 66, 54), Ortalama renk RGB(98, 80, 66), Ortalama renk RGB(114, 96, 80), Ortalama renk RGB(116, 98, 84), Ortalama renk RGB(118, 101, 87), Ortalama renk RGB(123, 107, 93), Ortalama renk RGB(125, 113, 100), Ortalama renk RGB(124, 114, 102), Ortalama renk RGB(118, 109, 97), Ortalama renk RGB(116, 106, 93), Ortalama renk RGB(119, 110, 99), Ortalama renk RGB(120, 112, 100), Ortalama renk RGB(120, 112, 100), Ortalama renk RGB(123, 114, 102), Ortalama renk RGB(122, 114, 101), Ortalama renk RGB(122, 114, 101), Ortalama renk RGB(122, 113, 101), Ortalama renk RGB(122, 115, 102)\n" ] } ], + "outputs": [], "source": [ "def generate_description_from_tokens(combined_tokens):\n", " \"\"\"\n", - " Tokenlardan basit bir metin açıklaması üretir.\n", + " Generates a simple text description from tokens.\n", " \n", " Args:\n", - " combined_tokens (list): Birleştirilmiş token dizisi.\n", + " combined_tokens (list): Combined token sequence.\n", " \n", " Returns:\n", - " str: Üretilen açıklama.\n", + " str: Generated description.\n", " \"\"\"\n", " description = []\n", " patch_idx = 0\n", " for token in combined_tokens:\n", " if token == \"\":\n", " patch_idx += 1\n", - " description.append(f\"\\nYama {patch_idx}:\")\n", + " description.append(f\"\\nPatch {patch_idx}:\")\n", " elif isinstance(token, list): # RGB token\n", " r, g, b = token\n", - " color_desc = f\"Ortalama renk RGB({int(r)}, {int(g)}, {int(b)})\"\n", + " color_desc = f\"Average color RGB({int(r)}, {int(g)}, {int(b)})\"\n", " if not description or description[-1].startswith(\"\\n\"):\n", " description.append(f\" {color_desc}\")\n", " else:\n", " description[-1] += f\", {color_desc}\"\n", " return \"\".join(description)\n", "\n", - "# Açıklama üret\n", + "# Generate description\n", "response = generate_description_from_tokens(combined_tokens)\n", - "print(\"Üretilen Açıklama:\")\n", + "print(\"Generated description:\")\n", "print(response)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "bf87d11e-4be6-404a-b3f6-696aeb229eb1", "metadata": {}, - "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: huggingface_hub in 
c:\\users\\emreq\\anaconda3\\lib\\site-packages (0.29.3) ... [removed output truncated: the remaining Requirement already satisfied lines for the huggingface_hub dependencies are elided] ...\n" - ] - } - ], + "outputs": [], "source": [ "!pip install huggingface_hub" ] }, @@ -733,36 +205,7 @@ "execution_count": null, "id": "093e03d9-0889-4666-bcd3-7e53771c3c70", "metadata": {}, - "outputs": [ ... [removed output truncated: config.json download progress-widget JSON elided] ...\n", + " # Add special tokens (e.g., )\n", " self.tokenizer.add_special_tokens({\"additional_special_tokens\": [\"\"]})\n", " self.language_model.resize_token_embeddings(len(self.tokenizer))\n", "\n", " def forward(self, patches, text_input=None):\n", - " # SigLIP ile vizyon tokenlarını çıkar\n", + " # Extract vision tokens with SigLIP\n", " batch_size, num_patches, c, h, w = patches.shape\n", " patches = patches.view(batch_size * num_patches, c, h, w)\n", " \n", - " # SigLIP girişi için tensörleri PIL görüntüsüne çevir (processor için)\n", + " # Convert tensors to PIL images for SigLIP input (for the processor)\n", " patch_images = [transforms.ToPILImage()(patch) for patch in patches]\n", " inputs = self.processor(images=patch_images, return_tensors=\"pt\")\n", " \n", @@ -825,7 +268,7 @@ " # Projeksiyon ile dil modeline uyarla\n", " vision_tokens = self.proj(vision_tokens) # [num_patches, seq_len, llama_hidden_size]\n", " \n", - " # Yamaları birleştir ve ekle\n", + " # Combine the patches and add \n", " sep_token_id = self.tokenizer.convert_tokens_to_ids(\"\")\n", " combined_tokens = []\n", " for i in range(num_patches):\n", " combined_tokens.append(vision_tokens[i].unsqueeze(0))\n", " combined_tokens.append(torch.full((1, 1, vision_tokens.size(-1)), sep_token_id, dtype=torch.float32))\n", " vision_tokens = torch.cat(combined_tokens, dim=0) # [total_seq_len, llama_hidden_size]\n", "\n", - " # Dil modeli 
ile metin üret\n", + " # Generate text with the language model\n", " if text_input is not None:\n", " text_inputs = self.tokenizer(text_input, return_tensors=\"pt\", padding=True)\n", " input_ids = text_inputs[\"input_ids\"]\n", @@ -846,9 +289,9 @@ "\n", " def generate(self, patches, max_len=50):\n", " vision_tokens = self.forward(patches) # [total_seq_len, hidden_size]\n", - " batch_size = 1 # Tek görüntü için\n", + " batch_size = 1 # For a single image\n", " \n", - " # Başlangıç tokenı ile metin üretimi\n", + " # Generate text with the initial token\n", " input_ids = torch.tensor([[self.tokenizer.bos_token_id]], dtype=torch.long)\n", " for _ in range(max_len):\n", " embeddings = self.language_model.model.embed_tokens(input_ids)\n", @@ -862,10 +305,10 @@ " \n", " return input_ids\n", "\n", - "# 3. İş akışı\n", + "# 3. Workflow\n", "def process_image_to_text(image_path, model, max_len=20):\n", " patches = divide_image_into_patches(image_path)\n", - " print(f\"Toplam {patches.shape[0]} yama oluşturuldu.\")\n", + " print(f\"Created {patches.shape[0]} patches in total.\")\n", " \n", " patches = patches.unsqueeze(0) # [1, num_patches, 3, 512, 512]\n", " output_ids = model.generate(patches, max_len=max_len)\n", @@ -874,13 +317,13 @@ "\n", "# Test\n", "if __name__ == \"__main__\":\n", - " # Modeli oluştur\n", + " # Build the model\n", " model = SmolVLM()\n", " \n", - " # Test görüntüsü\n", - " image_path = \"app.jpeg\" # Görüntü yolunu güncelleyin\n", + " # Test image\n", + " image_path = \"app.jpeg\" # Update with your image path\n", " generated_text = process_image_to_text(image_path, model)\n", - " print(\"Üretilen Metin:\")\n", + " print(\"Generated Text:\")\n", " print(generated_text)" ] }, @@ -914,4 +357,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/Genel-3/compare_attention_Vs_mla.ipynb b/Genel-3/compare_attention_Vs_mla.ipynb index f64b475..0a70413 100644 --- a/Genel-3/compare_attention_Vs_mla.ipynb +++ b/Genel-3/compare_attention_Vs_mla.ipynb @@ -51,7 +51,7 @@ "import matplotlib.pyplot as plt\n", "from tqdm import tqdm\n", "\n", - "# Multi-head Attention (MHA) sınıfı\n", + "# Multi-head Attention (MHA) class\n", "class MultiHeadAttention(nn.Module):\n", " def __init__(self, d_model, num_heads):\n", " super(MultiHeadAttention, self).__init__()\n", @@ -77,7 +77,7 @@ " output = self.W_o(context)\n", " return output\n", "\n", - "# Multi-head Latent Attention (MLA) sınıfı\n", + "# Multi-head Latent Attention (MLA) class\n", "class MultiHeadLatentAttention(nn.Module):\n", " def __init__(self, d_model, num_heads, latent_dim):\n", " super(MultiHeadLatentAttention, self).__init__()\n", @@ -106,13 +106,13 @@ " output = self.W_o(context)\n", " return output\n", "\n", - "# Veri setini yükleme ve ön işlem (Düzeltildi)\n", + "# Loading and preprocessing the dataset (fixed)\n", "def prepare_data(batch_size=32, seq_len=128, d_model=512):\n", - " # IMDB veri setini yükle\n", - " dataset = load_dataset(\"imdb\", split=\"train[:1000]\") # İlk 1000 örnek\n", + " # Load the IMDB dataset\n", + " dataset = load_dataset(\"imdb\", split=\"train[:1000]\") # First 1000 samples\n", " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", " \n", - " # Metinleri token'ize et ve tensöre çevir\n", + " # Tokenize the texts and convert them to tensors\n", " inputs = tokenizer(\n", " dataset[\"text\"], \n", " max_length=seq_len, \n", @@ -121,15 +121,15 @@ " return_tensors=\"pt\"\n", " )\n", " \n", - " # Embedding katmanı ile token'ları d_model boyutuna çevir\n", + 
" # Use the embedding layer to map tokens to d_model size\n", " embedding = nn.Embedding(tokenizer.vocab_size, d_model)\n", " input_ids = inputs[\"input_ids\"]\n", " data = embedding(input_ids)\n", " \n", - " # Batch'lere ayırırken tam bölünme sağla\n", + " # Ensure an even split when creating batches\n", " num_samples = data.size(0)\n", " num_batches = num_samples // batch_size\n", - " data = data[:num_batches * batch_size] # Kalan örnekleri düşür\n", + " data = data[:num_batches * batch_size] # Drop the remaining samples\n", " data = data.view(num_batches, batch_size, seq_len, d_model)\n", " \n", " print(f\"Veri boyutu: {data.shape} (num_batches, batch_size, seq_len, d_model)\")\n", @@ -144,8 +144,8 @@ " for dim in latent_dims:\n", " results[f\"MLA_{dim}\"] = {\"time\": [], \"memory\": [], \"output\": None}\n", " \n", - " for batch in tqdm(data, desc=\"Batch'ler üzerinde test\"):\n", - " # MHA için\n", + " for batch in tqdm(data, desc=\"Testing over batches\"):\n", + " # For MHA\n", " tracemalloc.start()\n", " start_time = time.time()\n", " mha_output = mha(batch)\n", @@ -156,7 +156,7 @@ " results[\"MHA\"][\"memory\"].append(peak_memory / 1024 / 1024)\n", " results[\"MHA\"][\"output\"] = mha_output\n", " \n", - " # MLA için farklı latent_dim'ler\n", + " # Different latent_dim values for MLA\n", " for dim, mla in mla_models.items():\n", " tracemalloc.start()\n", " start_time = time.time()\n", @@ -170,7 +170,7 @@ " \n", " return results\n", "\n", - "# Sonuçları görselleştirme\n", + "# Visualize the resultsme\n", "def visualize_results(results):\n", " labels = list(results.keys())\n", " avg_times = [sum(results[label][\"time\"]) / len(results[label][\"time\"]) for label in labels]\n", @@ -179,12 +179,12 @@ " plt.figure(figsize=(12, 5))\n", " plt.subplot(1, 2, 1)\n", " plt.bar(labels, avg_times, color=[\"blue\", \"orange\", \"green\"])\n", - " plt.title(\"Ortalama Çalışma Süresi (saniye)\")\n", - " plt.ylabel(\"Süre (s)\")\n", + " plt.title(\"Average Runtime (seconds)\")\n", + " plt.ylabel(\"Time (s)\")\n", " \n", " plt.subplot(1, 2, 2)\n", " plt.bar(labels, avg_memories, color=[\"blue\", \"orange\", \"green\"])\n", - " plt.title(\"Ortalama Bellek Kullanımı (MB)\")\n", + " plt.title(\"Average Memory Usage (MB)\")\n", " plt.ylabel(\"Bellek (MB)\")\n", " \n", " plt.tight_layout()\n", @@ -194,25 +194,25 @@ " for label in labels[1:]:\n", " mla_out = results[label][\"output\"]\n", " diff = torch.mean(torch.abs(mha_out - mla_out)).item()\n", - " print(f\"{label} ile MHA arasındaki ortalama çıktı farkı: {diff:.6f}\")\n", + " print(f\"Average output difference between {label} and MHA: {diff:.6f}\")\n", "\n", "# Ana fonksiyon\n", "def main():\n", - " print(\"Veri seti hazırlanıyor...\")\n", + " print(\"Preparing the dataset...\")\n", " data = prepare_data(batch_size=32, seq_len=128, d_model=512)\n", " \n", - " print(\"Performans testi yapılıyor...\")\n", + " print(\"Running performance tests...\")\n", " results = run_performance_test(data, d_model=512, num_heads=8, latent_dims=[128, 64])\n", " \n", - " print(\"\\n=== Performans Sonuçları ===\")\n", + " print(\"\\n=== Performance Results ===\")\n", " for label in results:\n", " avg_time = sum(results[label][\"time\"]) / len(results[label][\"time\"])\n", " avg_memory = sum(results[label][\"memory\"]) / len(results[label][\"memory\"])\n", " print(f\"{label}:\")\n", - " print(f\" - Ortalama Çalışma Süresi: {avg_time:.4f} saniye\")\n", - " print(f\" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB\")\n", + " print(f\" - Average Runtime: {avg_time:.4f} 
seconds\")\n", + " print(f\" - Average Memory Usage: {avg_memory:.2f} MB\")\n", " \n", - " print(\"\\nGörselleştirme yapılıyor...\")\n", + " print(\"\\nVisualization in progress...\")\n", " visualize_results(results)\n", "\n", "if __name__ == \"__main__\":\n", @@ -287,7 +287,7 @@ " context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)\n", " return self.W_o(context)\n", "\n", - "# Sınıflandırma modeli\n", + "# Classification model\n", "class SentimentClassifier(nn.Module):\n", " def __init__(self, d_model, num_heads, attention_type=\"MHA\", latent_dim=None):\n", " super(SentimentClassifier, self).__init__()\n", @@ -306,13 +306,13 @@ " x = self.pool(x).squeeze(-1)\n", " return self.fc(x)\n", "\n", - "# Parametre sayısını hesaplama\n", + "# Parameter counting\n", "def count_parameters(model):\n", " total_params = sum(p.numel() for p in model.parameters())\n", " trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", " return total_params, trainable_params\n", "\n", - "# Veri setini hazırlama\n", + "# Preparing the dataset\n", "def prepare_data(batch_size=32, seq_len=128):\n", " dataset = load_dataset(\"imdb\", split=\"train[:1000]\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", @@ -342,11 +342,11 @@ " test_data = test_data[:num_test_batches * batch_size].view(num_test_batches, batch_size, seq_len)\n", " test_labels = test_labels[:num_test_batches * batch_size].view(num_test_batches, batch_size)\n", " \n", - " print(f\"Eğitim veri boyutu: {train_data.shape}\")\n", + " print(f\"Training veri boyutu: {train_data.shape}\")\n", " print(f\"Test veri boyutu: {test_data.shape}\")\n", " return (train_data, train_labels), (test_data, test_labels)\n", "\n", - "# Modeli eğitme (Detaylı çıktı)\n", + "# Train the model (detailed output)\n", "def train_model(model, train_data, train_labels, epochs=5, lr=0.001):\n", " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", " criterion = nn.CrossEntropyLoss()\n", @@ -373,7 +373,7 @@ " _, peak_memory = tracemalloc.get_traced_memory()\n", " tracemalloc.stop()\n", " \n", - " # Batch bazında doğruluk\n", + " # Batch-level accuracy\n", " _, predicted = torch.max(outputs, 1)\n", " correct = (predicted == batch_labels).sum().item()\n", " batch_size = batch_labels.size(0)\n", @@ -395,14 +395,14 @@ " results[\"memory\"].append(avg_memory)\n", " \n", " print(f\"\\nEpoch {epoch+1}/{epochs}:\")\n", - " print(f\" - Ortalama Kayıp: {avg_loss:.4f}\")\n", - " print(f\" - Doğruluk: {accuracy:.4f}\")\n", - " print(f\" - Ortalama Batch Süresi: {avg_time:.4f} saniye\")\n", - " print(f\" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB\")\n", + " print(f\" - Average Loss: {avg_loss:.4f}\")\n", + " print(f\" - Accuracy: {accuracy:.4f}\")\n", + " print(f\" - Average Batch Time: {avg_time:.4f} seconds\")\n", + " print(f\" - Average Memory Usage: {avg_memory:.2f} MB\")\n", " \n", " return results\n", "\n", - "# Test etme (Detaylı çıktı)\n", + "# Testing (detailed output)\n", "def test_model(model, test_data, test_labels):\n", " correct = 0\n", " total = 0\n", @@ -420,10 +420,10 @@ " \n", " accuracy = correct / total\n", " avg_loss = test_loss / len(test_data)\n", - " print(f\"\\nTest Sonuçları:\")\n", - " print(f\" - Ortalama Kayıp: {avg_loss:.4f}\")\n", - " print(f\" - Doğruluk: {accuracy:.4f}\")\n", - " print(f\" - Toplam Örnek Sayısı: {total}\")\n", + " print(f\"\\nTest Results:\")\n", + " print(f\" - Average Loss: {avg_loss:.4f}\")\n", + " print(f\" - Accuracy: {accuracy:.4f}\")\n", + 
" print(f\" - Total Number of Samples: {total}\")\n", " return accuracy, avg_loss\n", "\n", "# Ana fonksiyon\n", @@ -431,38 +431,38 @@ " batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8\n", " latent_dim = 128\n", " \n", - " print(\"Veri seti hazırlanıyor...\")\n", + " print(\"Preparing the dataset...\")\n", " (train_data, train_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)\n", " \n", - " # Modelleri oluştur\n", + " # Build the models\n", " print(\"\\n=== MHA Modeli ===\")\n", " mha_model = SentimentClassifier(d_model, num_heads, \"MHA\")\n", " mha_total_params, mha_trainable_params = count_parameters(mha_model)\n", - " print(f\"Toplam Parametre Sayısı: {mha_total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {mha_trainable_params:,}\")\n", + " print(f\"Total number of parameters: {mha_total_params:,}\")\n", + " print(f\"Number of trainable parameters: {mha_trainable_params:,}\")\n", " \n", " print(\"\\n=== MLA Modeli ===\")\n", " mla_model = SentimentClassifier(d_model, num_heads, \"MLA\", latent_dim)\n", " mla_total_params, mla_trainable_params = count_parameters(mla_model)\n", - " print(f\"Toplam Parametre Sayısı: {mla_total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {mla_trainable_params:,}\")\n", + " print(f\"Total Number of Parameters: {mla_total_params:,}\")\n", + " print(f\"Number of trainable parameters: {mla_trainable_params:,}\")\n", " \n", - " print(\"\\nMHA modeli eğitiliyor...\")\n", + " print(\"\\nTraining the MHA model...\")\n", " mha_results = train_model(mha_model, train_data, train_labels)\n", " mha_accuracy, mha_test_loss = test_model(mha_model, test_data, test_labels)\n", " \n", - " print(\"\\nMLA modeli eğitiliyor...\")\n", + " print(\"\\nTraining the MLA model...\")\n", " mla_results = train_model(mla_model, train_data, train_labels)\n", " mla_accuracy, mla_test_loss = test_model(mla_model, test_data, test_labels)\n", " \n", - " # Sonuçları görselleştir\n", + " # Visualize the results\n", " epochs = range(1, 6)\n", " plt.figure(figsize=(15, 10))\n", " \n", " plt.subplot(2, 2, 1)\n", " plt.plot(epochs, mha_results[\"loss\"], label=\"MHA\")\n", " plt.plot(epochs, mla_results[\"loss\"], label=\"MLA\")\n", - " plt.title(\"Eğitim Kaybı\")\n", + " plt.title(\"Training Loss\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Loss\")\n", " plt.legend()\n", @@ -470,7 +470,7 @@ " plt.subplot(2, 2, 2)\n", " plt.plot(epochs, mha_results[\"accuracy\"], label=\"MHA\")\n", " plt.plot(epochs, mla_results[\"accuracy\"], label=\"MLA\")\n", - " plt.title(\"Eğitim Doğruluğu\")\n", + " plt.title(\"Training Accuracy\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Accuracy\")\n", " plt.legend()\n", @@ -478,15 +478,15 @@ " plt.subplot(2, 2, 3)\n", " plt.plot(epochs, mha_results[\"time\"], label=\"MHA\")\n", " plt.plot(epochs, mla_results[\"time\"], label=\"MLA\")\n", - " plt.title(\"Ortalama Batch Süresi (s)\")\n", + " plt.title(\"Average Batch Time (s)\")\n", " plt.xlabel(\"Epoch\")\n", - " plt.ylabel(\"Süre (s)\")\n", + " plt.ylabel(\"Time (s)\")\n", " plt.legend()\n", " \n", " plt.subplot(2, 2, 4)\n", " plt.plot(epochs, mha_results[\"memory\"], label=\"MHA\")\n", " plt.plot(epochs, mla_results[\"memory\"], label=\"MLA\")\n", - " plt.title(\"Ortalama Bellek Kullanımı (MB)\")\n", + " plt.title(\"Average Memory Usage (MB)\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Bellek (MB)\")\n", " plt.legend()\n", @@ -566,7 +566,7 @@ " context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)\n", " 
return self.W_o(context)\n", "\n", - "# Sınıflandırma modeli\n", + "# Classification model\n", "class SentimentClassifier(nn.Module):\n", " def __init__(self, d_model, num_heads, attention_type=\"MHA\", latent_dim=None):\n", " super(SentimentClassifier, self).__init__()\n", @@ -585,13 +585,13 @@ " x = self.pool(x).squeeze(-1)\n", " return self.fc(x)\n", "\n", - "# Parametre sayısını hesaplama\n", + "# Parameter counting\n", "def count_parameters(model):\n", " total_params = sum(p.numel() for p in model.parameters())\n", " trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", " return total_params, trainable_params\n", "\n", - "# Veri setini hazırlama (Validasyon seti eklendi)\n", + "# Preparing the dataset (validation set added)\n", "def prepare_data(batch_size=32, seq_len=128):\n", " dataset = load_dataset(\"imdb\", split=\"train[:1000]\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", @@ -606,7 +606,7 @@ " input_ids = inputs[\"input_ids\"]\n", " labels = torch.tensor(dataset[\"label\"])\n", " \n", - " # Eğitim, validasyon ve test setine ayır\n", + " # Split into training, validation, and test sets\n", " train_size = 700\n", " val_size = 100\n", " test_size = 200\n", @@ -618,7 +618,7 @@ " test_data = input_ids[train_size+val_size:]\n", " test_labels = labels[train_size+val_size:]\n", " \n", - " # Batch'lere ayır\n", + " # Split into batches\n", " num_train_batches = train_size // batch_size\n", " train_data = train_data[:num_train_batches * batch_size].view(num_train_batches, batch_size, seq_len)\n", " train_labels = train_labels[:num_train_batches * batch_size].view(num_train_batches, batch_size)\n", @@ -631,12 +631,12 @@ " test_data = test_data[:num_test_batches * batch_size].view(num_test_batches, batch_size, seq_len)\n", " test_labels = test_labels[:num_test_batches * batch_size].view(num_test_batches, batch_size)\n", " \n", - " print(f\"Eğitim veri boyutu: {train_data.shape}\")\n", - " print(f\"Validasyon veri boyutu: {val_data.shape}\")\n", + " print(f\"Training data shape: {train_data.shape}\")\n", + " print(f\"Validation data shape: {val_data.shape}\")\n", " print(f\"Test veri boyutu: {test_data.shape}\")\n", " return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels)\n", "\n", - "# Modeli eğitme ve validasyon (Batch bazında detaylı)\n", + "# Train and validate the model (detailed per batch)\n", "def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.001):\n", " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", " criterion = nn.CrossEntropyLoss()\n", @@ -654,7 +654,7 @@ " epoch_memory = 0\n", " total_train_samples = 0\n", " \n", - " print(f\"\\n=== Epoch {epoch+1}/{epochs} - Eğitim ===\")\n", + " print(f\"\\n=== Epoch {epoch+1}/{epochs} - Training ===\")\n", " for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):\n", " tracemalloc.start()\n", " start_time = time.time()\n", @@ -679,9 +679,9 @@ " epoch_memory += peak_memory / 1024 / 1024\n", " total_train_samples += batch_size\n", " \n", - " print(f\"Batch {i+1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct/batch_size:.4f}\")\n", + " print(f\"Batch {i+1}/{len(train_data)} - Loss: {loss.item():.4f} - Accuracy: {correct/batch_size:.4f}\")\n", " \n", - " # Validasyon\n", + " # Validation\n", " model.eval()\n", " epoch_val_loss = 0\n", " epoch_val_correct = 0\n", @@ -699,7 +699,7 @@ " epoch_val_correct += correct\n", " total_val_samples += batch_size\n", " \n", - 
" # Ortalamaları hesapla\n", + " # Compute the averages\n", " train_loss = epoch_train_loss / len(train_data)\n", " train_acc = epoch_train_correct / total_train_samples\n", " val_loss = epoch_val_loss / len(val_data)\n", @@ -714,11 +714,11 @@ " results[\"time\"].append(avg_time)\n", " results[\"memory\"].append(avg_memory)\n", " \n", - " print(f\"\\nEpoch {epoch+1}/{epochs} Özeti:\")\n", - " print(f\" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}\")\n", - " print(f\" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}\")\n", - " print(f\" - Ortalama Batch Süresi: {avg_time:.4f} saniye\")\n", - " print(f\" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB\")\n", + " print(f\"\\nEpoch {epoch+1}/{epochs} Summary:\")\n", + " print(f\" - Training Loss: {train_loss:.4f} - Training Accuracy: {train_acc:.4f}\")\n", + " print(f\" - Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_acc:.4f}\")\n", + " print(f\" - Average Batch Time: {avg_time:.4f} seconds\")\n", + " print(f\" - Average Memory Usage: {avg_memory:.2f} MB\")\n", " \n", " return results\n", "\n", @@ -738,14 +738,14 @@ " total += batch_labels.size(0)\n", " correct += (predicted == batch_labels).sum().item()\n", " test_loss += loss.item()\n", - " print(f\"Test Batch {i+1}/{len(test_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}\")\n", + " print(f\"Test Batch {i+1}/{len(test_data)} - Loss: {loss.item():.4f} - Accuracy: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}\")\n", " \n", " accuracy = correct / total\n", " avg_loss = test_loss / len(test_data)\n", - " print(f\"\\nTest Özeti:\")\n", - " print(f\" - Ortalama Kayıp: {avg_loss:.4f}\")\n", - " print(f\" - Doğruluk: {accuracy:.4f}\")\n", - " print(f\" - Toplam Örnek Sayısı: {total}\")\n", + " print(f\"\\nTest Summary:\")\n", + " print(f\" - Average Loss: {avg_loss:.4f}\")\n", + " print(f\" - Accuracy: {accuracy:.4f}\")\n", + " print(f\" - Total Number of Samples: {total}\")\n", " return accuracy, avg_loss\n", "\n", "# Ana fonksiyon\n", @@ -753,13 +753,13 @@ " # Hiperparametreler\n", " batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8\n", " epochs = 1\n", - " lr = 0.0001 # Daha yavaş öğrenme için düşürüldü\n", - " latent_dims = [64, 128, 256] # Farklı latent_dim değerleri\n", + " lr = 0.0001 # Reduced for slower learning\n", + " latent_dims = [64, 128, 256] # Different latent_dim values\n", " \n", - " print(\"Veri seti hazırlanıyor...\")\n", + " print(\"Preparing the dataset...\")\n", " (train_data, train_labels), (val_data, val_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)\n", " \n", - " # Hiperparametreleri yazdır\n", + " # Print hyperparameters\n", " print(\"\\n=== Hiperparametreler ===\")\n", " print(f\"Batch Size: {batch_size}\")\n", " print(f\"Sequence Length: {seq_len}\")\n", @@ -772,88 +772,88 @@ " print(\"\\n=== MHA Modeli ===\")\n", " mha_model = SentimentClassifier(d_model, num_heads, \"MHA\")\n", " mha_total_params, mha_trainable_params = count_parameters(mha_model)\n", - " print(f\"Toplam Parametre Sayısı: {mha_total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {mha_trainable_params:,}\")\n", + " print(f\"Total number of parameters: {mha_total_params:,}\")\n", + " print(f\"Number of trainable parameters: {mha_trainable_params:,}\")\n", " \n", - " print(\"\\nMHA modeli eğitiliyor...\")\n", + " print(\"\\nTraining the MHA model...\")\n", " mha_results = train_model(mha_model, train_data, 
train_labels, val_data, val_labels, epochs, lr)\n", " mha_accuracy, mha_test_loss = test_model(mha_model, test_data, test_labels)\n", " \n", - " # MLA Modelleri (farklı latent_dim'ler)\n", + " # MLA models (different latent_dim values)\n", " mla_results = {}\n", " for latent_dim in latent_dims:\n", " print(f\"\\n=== MLA Modeli (latent_dim={latent_dim}) ===\")\n", " mla_model = SentimentClassifier(d_model, num_heads, \"MLA\", latent_dim)\n", " mla_total_params, mla_trainable_params = count_parameters(mla_model)\n", - " print(f\"Toplam Parametre Sayısı: {mla_total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {mla_trainable_params:,}\")\n", + " print(f\"Total Number of Parameters: {mla_total_params:,}\")\n", + " print(f\"Number of trainable parameters: {mla_trainable_params:,}\")\n", " \n", - " print(f\"\\nMLA modeli (latent_dim={latent_dim}) eğitiliyor...\")\n", + " print(f\"\\nTraining the MLA model (latent_dim={latent_dim})...\")\n", " mla_results[latent_dim] = train_model(mla_model, train_data, train_labels, val_data, val_labels, epochs, lr)\n", " mla_accuracy, mla_test_loss = test_model(mla_model, test_data, test_labels)\n", " mla_results[latent_dim][\"test_acc\"] = mla_accuracy\n", " mla_results[latent_dim][\"test_loss\"] = mla_test_loss\n", " \n", - " # Sonuçları görselleştir\n", + " # Visualize the results\n", " epochs_range = range(1, epochs + 1)\n", " plt.figure(figsize=(15, 15))\n", " \n", - " # Eğitim Kaybı\n", + " # Training loss\n", " plt.subplot(3, 2, 1)\n", " plt.plot(epochs_range, mha_results[\"train_loss\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"train_loss\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Eğitim Kaybı\")\n", + " plt.title(\"Training Loss\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Loss\")\n", " plt.legend()\n", " \n", - " # Eğitim Doğruluğu\n", + " # Training Accuracy\n", " plt.subplot(3, 2, 2)\n", " plt.plot(epochs_range, mha_results[\"train_acc\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"train_acc\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Eğitim Doğruluğu\")\n", + " plt.title(\"Training Accuracy\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Accuracy\")\n", " plt.legend()\n", " \n", - " # Validasyon Kaybı\n", + " # Validation loss\n", " plt.subplot(3, 2, 3)\n", " plt.plot(epochs_range, mha_results[\"val_loss\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"val_loss\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Validasyon Kaybı\")\n", + " plt.title(\"Validation Loss\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Loss\")\n", " plt.legend()\n", " \n", - " # Validasyon Doğruluğu\n", + " # Validation accuracy\n", " plt.subplot(3, 2, 4)\n", " plt.plot(epochs_range, mha_results[\"val_acc\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"val_acc\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Validasyon Doğruluğu\")\n", + " plt.title(\"Validation Accuracy\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Accuracy\")\n", " plt.legend()\n", " \n", - " # Ortalama Batch Süresi\n", + " # Average batch time\n", " plt.subplot(3, 2, 5)\n", " plt.plot(epochs_range, mha_results[\"time\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"time\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Ortalama Batch 
Süresi (s)\")\n", + " plt.title(\"Average Batch Time (s)\")\n", " plt.xlabel(\"Epoch\")\n", - " plt.ylabel(\"Süre (s)\")\n", + " plt.ylabel(\"Time (s)\")\n", " plt.legend()\n", " \n", - " # Ortalama Bellek Kullanımı\n", + " # Average memory usage\n", " plt.subplot(3, 2, 6)\n", " plt.plot(epochs_range, mha_results[\"memory\"], label=\"MHA\")\n", " for latent_dim in latent_dims:\n", " plt.plot(epochs_range, mla_results[latent_dim][\"memory\"], label=f\"MLA_{latent_dim}\")\n", - " plt.title(\"Ortalama Bellek Kullanımı (MB)\")\n", + " plt.title(\"Average Memory Usage (MB)\")\n", " plt.xlabel(\"Epoch\")\n", " plt.ylabel(\"Bellek (MB)\")\n", " plt.legend()\n", @@ -880,7 +880,7 @@ "import matplotlib.pyplot as plt\n", "from tqdm import tqdm\n", "\n", - "# RoPE için düzeltilmiş yardımcı fonksiyon\n", + "# Corrected helper function for RoPE\n", "def apply_rotary_pos_emb(q, k, seq_len, dim):\n", " theta = torch.arange(0, dim, 2, dtype=torch.float, device=q.device) / dim\n", " theta = 10000 ** (-theta)\n", @@ -889,7 +889,7 @@ " sin_angles = torch.sin(angles).unsqueeze(0).unsqueeze(0) # [1, 1, seq_len, dim/2]\n", " cos_angles = torch.cos(angles).unsqueeze(0).unsqueeze(0)\n", " \n", - " # q ve k'nın yarısını döndürmek için ayır\n", + " # Split q and k to return half\n", " q_reshape = q.reshape(*q.shape[:-1], -1, 2) # [batch, heads, seq_len, d_k/2, 2]\n", " k_reshape = k.reshape(*k.shape[:-1], -1, 2)\n", " \n", @@ -969,7 +969,7 @@ " context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)\n", " return self.W_o(context)\n", "\n", - "# Longformer Attention (Global token desteğiyle geliştirilmiş)\n", + "# Longformer Attention (enhanced with global token support)\n", "class LongformerAttention(nn.Module):\n", " def __init__(self, d_model, num_heads, window_size=4, dropout=0.1):\n", " super().__init__()\n", @@ -993,9 +993,9 @@ " # CLS token (ilk token) global olsun\n", " scores = torch.zeros(batch_size, self.num_heads, seq_len, seq_len, device=x.device)\n", " for i in range(seq_len):\n", - " if i == 0: # CLS token global dikkat alır\n", + " if i == 0: # The CLS token receives global attention\n", " scores[:, :, 0, :] = torch.matmul(Q[:, :, 0:1], K.transpose(-2, -1)) / (self.d_k ** 0.5)\n", - " else: # Pencere bazlı dikkat\n", + " else: # Window-based attention\n", " start = max(0, i - self.window_size)\n", " end = min(seq_len, i + self.window_size + 1)\n", " scores[:, :, i, start:end] = torch.matmul(Q[:, :, i:i+1], K[:, :, start:end].transpose(-2, -1)) / (self.d_k ** 0.5)\n", @@ -1029,7 +1029,7 @@ " \n", " return torch.stack(outputs, dim=1)\n", "\n", - "# Sınıflandırma modeli\n", + "# Classification model\n", "class SentimentClassifier(nn.Module):\n", " def __init__(self, d_model, num_heads, layers_config, latent_dim=None):\n", " super().__init__()\n", @@ -1056,15 +1056,15 @@ " cls_output = x[:, 0] # CLS token\n", " return self.fc(cls_output)\n", "\n", - "# Parametre sayısını hesaplama\n", + "# Parameter counting\n", "def count_parameters(model):\n", " total_params = sum(p.numel() for p in model.parameters())\n", " trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", " return total_params, trainable_params\n", "\n", - "# Veri setini hazırlama (Daha büyük veri seti)\n", + "# Preparing the dataset (larger dataset)\n", "def prepare_data(batch_size=32, seq_len=128):\n", - " dataset = load_dataset(\"imdb\", split=\"train[:5000]\") # Daha büyük veri seti\n", + " dataset = load_dataset(\"imdb\", split=\"train[:5000]\") # Larger dataset\n", " tokenizer = 
AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", " \n", " inputs = tokenizer(\n", @@ -1100,12 +1100,12 @@ " test_data = test_data[:num_test_batches * batch_size].view(num_test_batches, batch_size, seq_len)\n", " test_labels = test_labels[:num_test_batches * batch_size].view(num_test_batches, batch_size)\n", " \n", - " print(f\"Eğitim veri boyutu: {train_data.shape}\")\n", - " print(f\"Validasyon veri boyutu: {val_data.shape}\")\n", + " print(f\"Training veri boyutu: {train_data.shape}\")\n", + " print(f\"Validation veri boyutu: {val_data.shape}\")\n", " print(f\"Test veri boyutu: {test_data.shape}\")\n", " return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels)\n", "\n", - "# Modeli eğitme\n", + "# Train the model\n", "def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.0001):\n", " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", " criterion = nn.CrossEntropyLoss()\n", @@ -1120,7 +1120,7 @@ " epoch_memory = 0\n", " total_train_samples = 0\n", " \n", - " print(f\"\\n=== Epoch {epoch+1}/{epochs} - Eğitim ===\")\n", + " print(f\"\\n=== Epoch {epoch+1}/{epochs} - Training ===\")\n", " for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):\n", " tracemalloc.start()\n", " start_time = time.time()\n", @@ -1145,7 +1145,7 @@ " epoch_memory += peak_memory / 1024 / 1024\n", " total_train_samples += batch_size\n", " \n", - " print(f\"Batch {i+1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct/batch_size:.4f}\")\n", + " print(f\"Batch {i+1}/{len(train_data)} - Loss: {loss.item():.4f} - Accuracy: {correct/batch_size:.4f}\")\n", " \n", " model.eval()\n", " epoch_val_loss = 0\n", @@ -1178,11 +1178,11 @@ " results[\"time\"].append(avg_time)\n", " results[\"memory\"].append(avg_memory)\n", " \n", - " print(f\"\\nEpoch {epoch+1}/{epochs} Özeti:\")\n", - " print(f\" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}\")\n", - " print(f\" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}\")\n", - " print(f\" - Ortalama Batch Süresi: {avg_time:.4f} saniye\")\n", - " print(f\" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB\")\n", + " print(f\"\\nEpoch {epoch+1}/{epochs} Summary:\")\n", + " print(f\" - Training Loss: {train_loss:.4f} - Training Accuracy: {train_acc:.4f}\")\n", + " print(f\" - Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_acc:.4f}\")\n", + " print(f\" - Average Batch Time: {avg_time:.4f} seconds\")\n", + " print(f\" - Average Memory Usage: {avg_memory:.2f} MB\")\n", " \n", " return results\n", "\n", @@ -1202,14 +1202,14 @@ " total += batch_labels.size(0)\n", " correct += (predicted == batch_labels).sum().item()\n", " test_loss += loss.item()\n", - " print(f\"Test Batch {i+1}/{len(test_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}\")\n", + " print(f\"Test Batch {i+1}/{len(test_data)} - Loss: {loss.item():.4f} - Accuracy: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}\")\n", " \n", " accuracy = correct / total\n", " avg_loss = test_loss / len(test_data)\n", - " print(f\"\\nTest Özeti:\")\n", - " print(f\" - Ortalama Kayıp: {avg_loss:.4f}\")\n", - " print(f\" - Doğruluk: {accuracy:.4f}\")\n", - " print(f\" - Toplam Örnek Sayısı: {total}\")\n", + " print(f\"\\nTest Summary:\")\n", + " print(f\" - Average Loss: {avg_loss:.4f}\")\n", + " print(f\" - Accuracy: {accuracy:.4f}\")\n", + " print(f\" - Total Number of Samples: {total}\")\n", " 
return accuracy, avg_loss\n", "\n", "# Ana fonksiyon\n", @@ -1219,7 +1219,7 @@ " lr = 0.0001\n", " latent_dims = [64, 128]\n", " \n", - " print(\"Veri seti hazırlanıyor...\")\n", + " print(\"Preparing the dataset...\")\n", " (train_data, train_labels), (val_data, val_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)\n", " \n", " print(\"\\n=== Hiperparametreler ===\")\n", @@ -1242,23 +1242,23 @@ " for name, model in models.items():\n", " print(f\"\\n=== {name} Modeli ===\")\n", " total_params, trainable_params = count_parameters(model)\n", - " print(f\"Toplam Parametre Sayısı: {total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {trainable_params:,}\")\n", + " print(f\"Total Number of Parameters: {total_params:,}\")\n", + " print(f\"Number of trainable parameters: {trainable_params:,}\")\n", " \n", - " print(f\"\\n{name} modeli eğitiliyor...\")\n", + " print(f\"\\nTraining the {name} model...\")\n", " results[name] = train_model(model, train_data, train_labels, val_data, val_labels, epochs, lr)\n", " accuracy, test_loss = test_model(model, test_data, test_labels)\n", " results[name][\"test_acc\"] = accuracy\n", " results[name][\"test_loss\"] = test_loss\n", " \n", - " # Görselleştirme\n", + " # Visualization\n", " epochs_range = range(1, epochs + 1)\n", " plt.figure(figsize=(15, 15))\n", " \n", " for i, (metric, title) in enumerate([\n", - " (\"train_loss\", \"Eğitim Kaybı\"), (\"train_acc\", \"Eğitim Doğruluğu\"),\n", - " (\"val_loss\", \"Validasyon Kaybı\"), (\"val_acc\", \"Validasyon Doğruluğu\"),\n", - " (\"time\", \"Ortalama Batch Süresi (s)\"), (\"memory\", \"Ortalama Bellek Kullanımı (MB)\")\n", + " (\"train_loss\", \"Training Loss\"), (\"train_acc\", \"Training Accuracy\"),\n", + " (\"val_loss\", \"Validation Loss\"), (\"val_acc\", \"Validation Accuracy\"),\n", + " (\"time\", \"Average Batch Time (s)\"), (\"memory\", \"Average Memory Usage (MB)\")\n", " ], 1):\n", " plt.subplot(3, 2, i)\n", " for name in models.keys():\n", @@ -1271,13 +1271,13 @@ " plt.tight_layout()\n", " plt.show()\n", " \n", - " # Yorumlar ve Öneriler\n", - " print(\"\\n=== Yorumlar ve Öneriler ===\")\n", - " print(\"1. Performans: SSM ve Longformer, uzun dizilerde avantaj sağlayabilir. Şu an 5000 örnek kullanıldı, farklar daha belirgin hale geldi.\")\n", - " print(\" Daha büyük bir veri seti (örneğin tüm IMDB) ile daha iyi sonuçlar alınabilir.\")\n", - " print(\"2. Karmaşıklık: Katman yığma ve SSM eklenmesi parametre sayısını artırdı, dropout (0.1) ile overfitting önlenmeye çalışıldı.\")\n", - " print(\" Daha fazla regularization (örneğin weight decay) düşünülebilir.\")\n", - " print(\"3. İyileştirme: LongformerAttention, CLS token’a global dikkat ile geliştirildi. Daha gerçekçi bir Longformer için dilated attention eklenebilir.\")\n", + " # Comments and suggestions\n", + " print(\"\\n=== Comments and Suggestions ===\")\n", + " print(\"1. Performance: SSM and Longformer can help with long sequences. With 5000 samples the differences became clearer.\")\n", + " print(\" Better results can be obtained with a larger dataset (e.g., the full IMDB).\")\n", + " print(\"2. Complexity: Stacking layers and adding SSM increased parameters; dropout (0.1) was used to prevent overfitting.\")\n", + " print(\" Consider additional regularization (e.g., weight decay).\")\n", + " print(\"3. Improvement: LongformerAttention was enhanced with global attention on the CLS token. 
Add dilated attention for a more realistic Longformer.\")\n", "\n", "if __name__ == \"__main__\":\n", " main()" @@ -1300,7 +1300,7 @@ "import numpy as np\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "\n", - "# RoPE için yardımcı fonksiyon\n", + "# Helper function for RoPE\n", "def apply_rotary_pos_emb(q, k, seq_len, dim):\n", " theta = torch.arange(0, dim, 2, dtype=torch.float, device=q.device) / dim\n", " theta = 10000 ** (-theta)\n", @@ -1390,7 +1390,7 @@ " self.d_model = d_model\n", " self.embedding = nn.Embedding(vocab_size, d_model)\n", " \n", - " # Düzgün bir liste oluştur\n", + " # Create a well-structured list\n", " layers = []\n", " for i in range(num_layers):\n", " if i % 2 == 0:\n", @@ -1401,7 +1401,7 @@ " self.layers = nn.ModuleList(layers)\n", " \n", " self.norm = nn.LayerNorm(d_model)\n", - " self.fc = nn.Linear(d_model, 2) # Duygu analizi için 2 sınıf\n", + " self.fc = nn.Linear(d_model, 2) # Two classes for sentiment analysis\n", " \n", " def forward(self, x):\n", " x = self.embedding(x)\n", @@ -1411,13 +1411,13 @@ " cls_output = x[:, 0] # CLS token\n", " return self.fc(cls_output)\n", "\n", - "# Parametre sayısını hesaplama\n", + "# Parameter counting\n", "def count_parameters(model):\n", " total_params = sum(p.numel() for p in model.parameters())\n", " trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", " return total_params, trainable_params\n", "\n", - "# Veri setini hazırlama\n", + "# Preparing the dataset\n", "def prepare_data(batch_size=32, seq_len=128):\n", " dataset = load_dataset(\"imdb\", split=\"train[:5000]\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", @@ -1456,13 +1456,13 @@ " test_labels = test_labels[:num_test_batches * batch_size].view(num_test_batches, batch_size)\n", " test_texts = dataset[\"text\"][train_size + val_size:train_size + val_size + num_test_batches * batch_size]\n", " \n", - " # Veri seti dağılımını kontrol et\n", - " print(f\"Eğitim veri boyutu: {train_data.shape}, Etiket Dağılımı: {torch.bincount(train_labels.flatten())}\")\n", - " print(f\"Validasyon veri boyutu: {val_data.shape}, Etiket Dağılımı: {torch.bincount(val_labels.flatten())}\")\n", - " print(f\"Test veri boyutu: {test_data.shape}, Etiket Dağılımı: {torch.bincount(test_labels.flatten())}\")\n", + " # Check dataset distribution\n", + " print(f\"Training data shape: {train_data.shape}, Label distribution: {torch.bincount(train_labels.flatten())}\")\n", + " print(f\"Validation data shape: {val_data.shape}, Label distribution: {torch.bincount(val_labels.flatten())}\")\n", + " print(f\"Test data shape: {test_data.shape}, Label distribution: {torch.bincount(test_labels.flatten())}\")\n", " return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels, test_texts), tokenizer.vocab_size, tokenizer\n", "\n", - "# Modeli eğitme\n", + "# Train the model\n", "def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.0001):\n", " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", " criterion = nn.CrossEntropyLoss()\n", @@ -1477,7 +1477,7 @@ " epoch_memory = 0\n", " total_train_samples = 0\n", " \n", - " print(f\"\\n=== Epoch {epoch + 1}/{epochs} - Eğitim ===\")\n", + " print(f\"\\n=== Epoch {epoch + 1}/{epochs} - Training ===\")\n", " for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):\n", " tracemalloc.start()\n", " start_time = time.time()\n", @@ -1503,7 +1503,7 @@ " total_train_samples += batch_size\n", " \n", 
" if (i + 1) % 10 == 0:\n", - " print(f\"Batch {i + 1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct / batch_size:.4f}\")\n", + " print(f\"Batch {i + 1}/{len(train_data)} - Loss: {loss.item():.4f} - Accuracy: {correct / batch_size:.4f}\")\n", " \n", " model.eval()\n", " epoch_val_loss = 0\n", @@ -1536,11 +1536,11 @@ " results[\"time\"].append(avg_time)\n", " results[\"memory\"].append(avg_memory)\n", " \n", - " print(f\"\\nEpoch {epoch + 1}/{epochs} Özeti:\")\n", - " print(f\" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}\")\n", - " print(f\" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}\")\n", - " print(f\" - Ortalama Batch Süresi: {avg_time:.4f} saniye\")\n", - " print(f\" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB\")\n", + " print(f\"\\nEpoch {epoch + 1}/{epochs} Summary:\")\n", + " print(f\" - Training Loss: {train_loss:.4f} - Training Accuracy: {train_acc:.4f}\")\n", + " print(f\" - Validation Loss: {val_loss:.4f} - Validation Accuracy: {val_acc:.4f}\")\n", + " print(f\" - Average Batch Time: {avg_time:.4f} seconds\")\n", + " print(f\" - Average Memory Usage: {avg_memory:.2f} MB\")\n", " \n", " return results\n", "\n", @@ -1569,22 +1569,22 @@ " all_labels.extend(batch_labels.cpu().numpy())\n", " all_outputs.extend(softmax_outputs.cpu().numpy())\n", " \n", - " print(f\"Test Batch {i + 1}/{len(test_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {(predicted == batch_labels).sum().item() / batch_labels.size(0):.4f}\")\n", + " print(f\"Test Batch {i + 1}/{len(test_data)} - Loss: {loss.item():.4f} - Accuracy: {(predicted == batch_labels).sum().item() / batch_labels.size(0):.4f}\")\n", " \n", " accuracy = correct / total\n", " avg_loss = test_loss / len(test_data)\n", " \n", - " print(f\"\\nTest Özeti:\")\n", - " print(f\" - Ortalama Kayıp: {avg_loss:.4f}\")\n", - " print(f\" - Doğruluk: {accuracy:.4f}\")\n", - " print(f\" - Toplam Örnek Sayısı: {total}\")\n", - " print(f\" - Tahmin Dağılımı: {np.bincount(all_preds)}\")\n", - " print(f\" - Gerçek Etiket Dağılımı: {np.bincount(all_labels)}\")\n", + " print(f\"\\nTest Summary:\")\n", + " print(f\" - Average Loss: {avg_loss:.4f}\")\n", + " print(f\" - Accuracy: {accuracy:.4f}\")\n", + " print(f\" - Total Number of Samples: {total}\")\n", + " print(f\" - Prediction Distribution: {np.bincount(all_preds)}\")\n", + " print(f\" - True Label Distribution: {np.bincount(all_labels)}\")\n", " \n", " # Confusion Matrix\n", " cm = confusion_matrix(all_labels, all_preds)\n", " if len(np.unique(all_labels)) < 2 or len(np.unique(all_preds)) < 2:\n", - " print(\"Uyarı: Test verisinde tek bir sınıf tahmin edildi veya mevcut, Confusion Matrix tam anlamıyla çizilemez.\")\n", + " print(\"Warning: Only a single class was predicted or present in the test data, so the confusion matrix cannot be fully drawn.\")\n", " print(f\"Confusion Matrix: {cm}\")\n", " else:\n", " disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[\"Negatif\", \"Pozitif\"])\n", @@ -1597,16 +1597,16 @@ " plt.title(\"Confusion Matrix\")\n", " plt.show()\n", " \n", - " # Örnek Tahminler\n", - " print(f\"\\n=== Örnek Tahminler (İlk {num_samples} Örnek) ===\")\n", + " # Example predictions\n", + " print(f\"\\n=== Example Predictions (First {num_samples} Samples) ===\")\n", " for i in range(min(num_samples, len(test_texts))):\n", " text = test_texts[i][:100] + \"...\" if len(test_texts[i]) > 100 else test_texts[i]\n", " true_label = \"Pozitif\" if all_labels[i] == 1 else \"Negatif\"\n", " pred_label = \"Pozitif\" 
if all_preds[i] == 1 else \"Negatif\"\n", " softmax_scores = all_outputs[i]\n", " print(f\"Metin: {text}\")\n", - " print(f\"Gerçek Etiket: {true_label} - Tahmin Edilen Etiket: {pred_label}\")\n", - " print(f\"Softmax Skorları: Negatif: {softmax_scores[0]:.4f}, Pozitif: {softmax_scores[1]:.4f}\")\n", + " print(f\"True Label: {true_label} - Predicted Label: {pred_label}\")\n", + " print(f\"Softmax Scores: Negative: {softmax_scores[0]:.4f}, Positive: {softmax_scores[1]:.4f}\")\n", " print(\"-\" * 50)\n", " \n", " return accuracy, avg_loss\n", @@ -1614,12 +1614,12 @@ "# Ana fonksiyon\n", "def main():\n", " batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8\n", - " epochs = 2 # Daha fazla epoch için artırıldı\n", + " epochs = 2 # Increased to allow more epochs\n", " lr = 0.0001\n", " latent_dim = 128\n", " num_layers = 4\n", " \n", - " print(\"Veri seti hazırlanıyor...\")\n", + " print(\"Preparing the dataset...\")\n", " (train_data, train_labels), (val_data, val_labels), (test_data, test_labels, test_texts), vocab_size, tokenizer = prepare_data(batch_size, seq_len)\n", " \n", " print(\"\\n=== Hiperparametreler ===\")\n", @@ -1633,27 +1633,27 @@ " print(f\"Learning Rate: {lr}\")\n", " print(f\"Vocabulary Size: {vocab_size}\")\n", " \n", - " # Modeli oluştur\n", + " # Build the model\n", " model = CustomLLM(vocab_size, d_model, num_heads, latent_dim, num_layers)\n", " total_params, trainable_params = count_parameters(model)\n", " print(f\"\\n=== CustomLLM Modeli ===\")\n", - " print(f\"Toplam Parametre Sayısı: {total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre Sayısı: {trainable_params:,}\")\n", + " print(f\"Total Number of Parameters: {total_params:,}\")\n", + " print(f\"Number of trainable parameters: {trainable_params:,}\")\n", " \n", - " print(\"\\nModel eğitiliyor...\")\n", + " print(\"\\nTraining the model...\")\n", " results = train_model(model, train_data, train_labels, val_data, val_labels, epochs, lr)\n", " accuracy, test_loss = test_model(model, test_data, test_labels, test_texts, tokenizer)\n", " results[\"test_acc\"] = accuracy\n", " results[\"test_loss\"] = test_loss\n", " \n", - " # Görselleştirme\n", + " # Visualization\n", " epochs_range = range(1, epochs + 1)\n", " plt.figure(figsize=(15, 15))\n", " \n", " for i, (metric, title) in enumerate([\n", - " (\"train_loss\", \"Eğitim Kaybı\"), (\"train_acc\", \"Eğitim Doğruluğu\"),\n", - " (\"val_loss\", \"Validasyon Kaybı\"), (\"val_acc\", \"Validasyon Doğruluğu\"),\n", - " (\"time\", \"Ortalama Batch Süresi (s)\"), (\"memory\", \"Ortalama Bellek Kullanımı (MB)\")\n", + " (\"train_loss\", \"Training Loss\"), (\"train_acc\", \"Training Accuracy\"),\n", + " (\"val_loss\", \"Validation Loss\"), (\"val_acc\", \"Validation Accuracy\"),\n", + " (\"time\", \"Average Batch Time (s)\"), (\"memory\", \"Average Memory Usage (MB)\")\n", " ], 1):\n", " plt.subplot(3, 2, i)\n", " plt.plot(epochs_range, results[metric], label=\"CustomLLM\")\n", @@ -1665,11 +1665,11 @@ " plt.tight_layout()\n", " plt.show()\n", " \n", - " # Yorumlar ve Öneriler\n", - " print(\"\\n=== Yorumlar ve Öneriler ===\")\n", - " print(\"1. Performans: Model, test setinde yalnızca Negatif tahmin etti. Veri seti dağılımı kontrol edilmeli veya daha fazla epoch ile genelleme artırılmalı.\")\n", - " print(\"2. Karmaşıklık: Eğitim çok hızlı tamamlandı, öğrenme oranı (lr) artırılabilir (örneğin 0.001) veya model kapasitesi genişletilebilir.\")\n", - " print(\"3. İyileştirme: Softmax skorları incelenerek modelin sınıf偏見 (bias) durumu analiz edilmeli. 
Daha dengeli bir veri seti kullanılabilir.\")\n", + " # Comments and suggestions\n", + " print(\"\\n=== Comments and Suggestions ===\")\n", + " print(\"1. Performance: The model predicted only Negative on the test set. Check class balance or increase epochs for better generalization.\")\n", + " print(\"2. Complexity: Training finished quickly; increase learning rate (e.g., 0.001) or expand model capacity.\")\n", + " print(\"3. Improvement: Inspect softmax scores to analyze class bias. Use a more balanced dataset if needed.\")\n", "\n", "if __name__ == \"__main__\":\n", " main()" @@ -1697,4 +1697,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git "a/Genel-4/Projeksiyon_Katmanlar\304\261.ipynb" "b/Genel-4/Projeksiyon_Katmanlar\304\261.ipynb" index f8610b8..90d75e0 100644 --- "a/Genel-4/Projeksiyon_Katmanlar\304\261.ipynb" +++ "b/Genel-4/Projeksiyon_Katmanlar\304\261.ipynb" @@ -1,133 +1,124 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyMR0Xdi2l6f+Bd4d9tZW0gS", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMR0Xdi2l6f+Bd4d9tZW0gS", + "include_colab_link": true }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "# Model parametreleri\n", - "d_model = 64 # Modelin gizli boyutu\n", - "d_ff = 256 # Besleme ileri (feed-forward) boyutu\n", - "seq_len = 10 # Girdi sırasının uzunluğu\n", - "batch_size = 8 # Batch boyutu" - ], - "metadata": { - "id": "wpPyAbV0zfeE" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Girdi tensorü (örnek veriler)\n", - "input_tensor = torch.rand(batch_size, seq_len, d_model) # Rastgele veri" - ], - "metadata": { - "id": "kDYRBKL6zgmz" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Projeksiyon Katmanları\n", - "class ProjectionLayers(nn.Module):\n", - " def __init__(self, d_model, d_ff):\n", - " super().__init__()\n", - " # Query, Key, Value Projections\n", - " self.q_proj = nn.Linear(d_model, d_model)\n", - " self.k_proj = nn.Linear(d_model, d_model)\n", - " self.v_proj = nn.Linear(d_model, d_model)\n", - " self.o_proj = nn.Linear(d_model, d_model)\n", - "\n", - " # Feed-forward Projections\n", - " self.gate_proj = nn.Linear(d_model, d_model)\n", - " self.up_proj = nn.Linear(d_model, d_ff)\n", - " self.down_proj = nn.Linear(d_ff, d_model)\n", - "\n", - " def forward(self, x):\n", - " # Attention Projections\n", - " q = self.q_proj(x) # Sorgu\n", - " k = self.k_proj(x) # Anahtar\n", - " v = self.v_proj(x) # Değer\n", - "\n", - " # Dot-product attention için hazırlık\n", - " attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (d_model ** 0.5)\n", - " attention_weights = torch.softmax(attention_scores, dim=-1)\n", - " attention_output = torch.matmul(attention_weights, v)\n", - "\n", - " # Output Projeksiyon\n", - " output = self.o_proj(attention_output)\n", - "\n", - " # Feed-forward katmanı\n", - " gated = torch.sigmoid(self.gate_proj(output)) * output # Gated mekanizma\n", - " upsampled = self.up_proj(gated) # Daha yüksek boyut\n", - " downsampled = 
self.down_proj(upsampled) # Boyut küçültme\n", - " return downsampled" - ], - "metadata": { - "id": "0wAkujy5zjME" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fBZsytFczPvM", - "outputId": "7f0ddf0f-90e7-4f2e-f63b-a3f6d2007797" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Girdi Boyutu: torch.Size([8, 10, 64])\n", - "Çıktı Boyutu: torch.Size([8, 10, 64])\n" - ] - } - ], - "source": [ - "# Modeli tanımla ve çalıştır\n", - "model = ProjectionLayers(d_model=d_model, d_ff=d_ff)\n", - "output_tensor = model(input_tensor)\n", - "\n", - "print(\"Girdi Boyutu: \", input_tensor.shape)\n", - "print(\"Çıktı Boyutu: \", output_tensor.shape)\n" - ] - } - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMR0Xdi2l6f+Bd4d9tZW0gS", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# Model parameters\n", + "d_model = 64 # Hidden size of the model\n", + "d_ff = 256 # Feed-forward dimension\n", + "seq_len = 10 # Length of the input sequence\n", + "batch_size = 8 # Batch size" + ], + "metadata": { + "id": "wpPyAbV0zfeE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Input tensor (example data)\n", + "input_tensor = torch.rand(batch_size, seq_len, d_model) # Random data" + ], + "metadata": { + "id": "kDYRBKL6zgmz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Projection layers\n", + "class ProjectionLayers(nn.Module):\n", + " def __init__(self, d_model, d_ff):\n", + " super().__init__()\n", + " # Query, Key, Value Projections\n", + " self.q_proj = nn.Linear(d_model, d_model)\n", + " self.k_proj = nn.Linear(d_model, d_model)\n", + " self.v_proj = nn.Linear(d_model, d_model)\n", + " self.o_proj = nn.Linear(d_model, d_model)\n", + "\n", + " # Feed-forward Projections\n", + " self.gate_proj = nn.Linear(d_model, d_model)\n", + " self.up_proj = nn.Linear(d_model, d_ff)\n", + " self.down_proj = nn.Linear(d_ff, d_model)\n", + "\n", + " def forward(self, x):\n", + " # Attention Projections\n", + " q = self.q_proj(x) # Query\n", + " k = self.k_proj(x) # Key\n", + " v = self.v_proj(x) # Value\n", + "\n", + " # Prepare for dot-product attention\n", + " attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (d_model ** 0.5)\n", + " attention_weights = torch.softmax(attention_scores, dim=-1)\n", + " attention_output = torch.matmul(attention_weights, v)\n", + "\n", + " # Output projection\n", + " output = self.o_proj(attention_output)\n", + "\n", + " # Feed-forward layer\n", + " gated = torch.sigmoid(self.gate_proj(output)) * output # Gating mechanism\n", + " upsampled = self.up_proj(gated) # Higher dimensional projection\n", + " downsampled = self.down_proj(upsampled) # Reduce the dimensionality\n", + " return downsampled" + ], + "metadata": { + "id": "0wAkujy5zjME" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fBZsytFczPvM", + "outputId": "7f0ddf0f-90e7-4f2e-f63b-a3f6d2007797" + }, + "outputs": [], + "source": [ + "# Define and run the model\n", + "model = ProjectionLayers(d_model=d_model, d_ff=d_ff)\n", + "output_tensor = model(input_tensor)\n", + "\n", + "print(\"Input shape: \", input_tensor.shape)\n", + "print(\"Output shape: \", output_tensor.shape)\n" + ] + } + ] } \ No newline at end of file diff --git a/Genel-4/SLM_+_COT_FINETUNE.ipynb b/Genel-4/SLM_+_COT_FINETUNE.ipynb index 24f1c68..f95ec86 100644 --- a/Genel-4/SLM_+_COT_FINETUNE.ipynb +++ b/Genel-4/SLM_+_COT_FINETUNE.ipynb @@ -1,272 +1,272 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyPiG+IsBhY9lUPrholCiKS5", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "from datasets import load_dataset\n", - "from transformers import (\n", - " AutoModelForCausalLM,\n", - " AutoTokenizer,\n", - " TrainingArguments,\n", - " Trainer,\n", - " DataCollatorForLanguageModeling\n", - ")\n", - "import torch\n", - "from torch.utils.data import Dataset\n", - "from tqdm import tqdm" - ], - "metadata": { - "id": "Di-Z5MYNDD6o" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "class ConversationDataset(Dataset):\n", - " def __init__(self, dataset, tokenizer, max_length=512):\n", - " self.examples = []\n", - "\n", - " print(\"Processing conversations...\")\n", - " for item in tqdm(dataset):\n", - " # Format conversation\n", - " conversation = \"\"\n", - " for turn in item['chosen']:\n", - " role = turn['role']\n", - " content = turn['content']\n", - " if role == 'user':\n", - " conversation += f\"Human: {content}\\n\"\n", - " else:\n", - " conversation += f\"Assistant: {content}\\n\"\n", - "\n", - " # Tokenize\n", - " encodings = tokenizer(\n", - " conversation,\n", - " truncation=True,\n", - " max_length=max_length,\n", - " padding=\"max_length\",\n", - " return_tensors=\"pt\"\n", - " )\n", - "\n", - " self.examples.append({\n", - " \"input_ids\": encodings[\"input_ids\"][0],\n", - " \"attention_mask\": encodings[\"attention_mask\"][0],\n", - " \"labels\": encodings[\"input_ids\"][0].clone()\n", - " })\n", - "\n", - " def __len__(self):\n", - " return len(self.examples)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.examples[idx]" - ], - "metadata": { - "id": "uLZ6RwUwcs_2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "def main():\n", - " # Load dataset\n", - " print(\"Loading dataset...\")\n", - " dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", - "\n", - " # Kullan küçük bir subset (test için)\n", - " dataset['train'] = dataset['train'].select(range(1000))\n", - "\n", - " # Load model and tokenizer\n", - " print(\"Loading model and tokenizer...\")\n", - " model_name = \"HuggingFaceTB/SmolLM-135M\"\n", - " model = AutoModelForCausalLM.from_pretrained(model_name)\n", - " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "\n", - " if tokenizer.pad_token is None:\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - " model.config.pad_token_id = tokenizer.pad_token_id\n", - "\n", - " # Split dataset\n", - " 
print(\"Splitting dataset...\")\n", - " train_size = int(0.9 * len(dataset['train']))\n", - " val_size = len(dataset['train']) - train_size\n", - " train_dataset, val_dataset = torch.utils.data.random_split(\n", - " dataset['train'],\n", - " [train_size, val_size]\n", - " )\n", - "\n", - " # Prepare datasets\n", - " print(\"Preparing training dataset...\")\n", - " train_dataset = ConversationDataset(train_dataset, tokenizer)\n", - " print(\"Preparing validation dataset...\")\n", - " eval_dataset = ConversationDataset(val_dataset, tokenizer)\n", - "\n", - " # Training arguments\n", - " # Training arguments\n", - " training_args = TrainingArguments(\n", - " output_dir=\"./results\",\n", - " num_train_epochs=3,\n", - " per_device_train_batch_size=4,\n", - " per_device_eval_batch_size=4,\n", - " warmup_steps=500,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=100,\n", - " evaluation_strategy=\"steps\",\n", - " eval_steps=500,\n", - " save_strategy=\"steps\",\n", - " save_steps=500,\n", - " load_best_model_at_end=True,\n", - " gradient_accumulation_steps=4,\n", - " fp16=True,\n", - " report_to=\"none\"\n", - " )\n", - "\n", - " # Initialize trainer\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", - " )\n", - "\n", - " # Start training\n", - " trainer.train()\n", - "\n", - " # Save model\n", - " model_save_path = \"./fine_tuned_smolLM\"\n", - " trainer.save_model(model_save_path)\n", - " tokenizer.save_pretrained(model_save_path)" - ], - "metadata": { - "id": "4qdPDUcXdboZ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "if __name__ == \"__main__\":\n", - " main()" - ], - "metadata": { - "id": "Y9a_q7mxdfLA" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Model değerlendirme için gerekli fonksiyonlar ve test kodu\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer\n", - "import torch\n", - "\n", - "def model_yukle():\n", - " model_path = \"./fine_tuned_smolLM\"\n", - " model = AutoModelForCausalLM.from_pretrained(model_path)\n", - " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", - " return model, tokenizer\n", - "\n", - "def yanit_uret(prompt, model, tokenizer, max_length=250):\n", - " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " model = model.to(device)\n", - "\n", - " # CoT formatında prompt hazırla\n", - " formatted_prompt = f\"Question: {prompt}\\nLet's solve this step by step:\\n\"\n", - "\n", - " inputs = tokenizer(formatted_prompt, return_tensors=\"pt\").to(device)\n", - "\n", - " with torch.no_grad():\n", - " outputs = model.generate(\n", - " inputs[\"input_ids\"],\n", - " max_length=max_length,\n", - " num_return_sequences=1,\n", - " temperature=0.7,\n", - " top_p=0.9,\n", - " do_sample=True,\n", - " pad_token_id=tokenizer.pad_token_id\n", - " )\n", - "\n", - " return tokenizer.decode(outputs[0], skip_special_tokens=True)\n", - "\n", - "# Chain-of-Thought tarzında test soruları\n", - "test_ornekleri = [\n", - " \"What is the sum of the first 5 prime numbers?\",\n", - " \"What is 1 + 1?\"\n", - "]\n", - "\n", - "# Modeli yükle ve test et\n", - "print(\"Model Evaluation Results:\")\n", - "print(\"-\" * 70)\n", - "\n", - "model, tokenizer = model_yukle()\n", - "\n", - "for ornek in test_ornekleri:\n", - " 
print(f\"\\nQuestion: {ornek}\")\n", - " yanit = yanit_uret(ornek, model, tokenizer)\n", - " print(f\"Response:\\n{yanit}\")\n", - " print(\"-\" * 70)" - ], - "metadata": { - "id": "3lrV75_AlX14" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", - "dataset[\"train\"][0]" - ], - "metadata": { - "id": "67UiY-lbqrBV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "dataset[\"train\"].shape" - ], - "metadata": { - "id": "hDrgPIDwqtYF" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from datasets import load_dataset\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " TrainingArguments,\n", + " Trainer,\n", + " DataCollatorForLanguageModeling\n", + ")\n", + "import torch\n", + "from torch.utils.data import Dataset\n", + "from tqdm import tqdm" + ], + "metadata": { + "id": "Di-Z5MYNDD6o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class ConversationDataset(Dataset):\n", + " def __init__(self, dataset, tokenizer, max_length=512):\n", + " self.examples = []\n", + "\n", + " print(\"Processing conversations...\")\n", + " for item in tqdm(dataset):\n", + " # Format conversation\n", + " conversation = \"\"\n", + " for turn in item['chosen']:\n", + " role = turn['role']\n", + " content = turn['content']\n", + " if role == 'user':\n", + " conversation += f\"Human: {content}\\n\"\n", + " else:\n", + " conversation += f\"Assistant: {content}\\n\"\n", + "\n", + " # Tokenize\n", + " encodings = tokenizer(\n", + " conversation,\n", + " truncation=True,\n", + " max_length=max_length,\n", + " padding=\"max_length\",\n", + " return_tensors=\"pt\"\n", + " )\n", + "\n", + " self.examples.append({\n", + " \"input_ids\": encodings[\"input_ids\"][0],\n", + " \"attention_mask\": encodings[\"attention_mask\"][0],\n", + " \"labels\": encodings[\"input_ids\"][0].clone()\n", + " })\n", + "\n", + " def __len__(self):\n", + " return len(self.examples)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.examples[idx]" + ], + "metadata": { + "id": "uLZ6RwUwcs_2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def main():\n", + " # Load dataset\n", + " print(\"Loading dataset...\")\n", + " dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", + "\n", + " # Use a small subset (for testing)\n", + " dataset['train'] = dataset['train'].select(range(1000))\n", + "\n", + " # Load model and tokenizer\n", + " print(\"Loading model and tokenizer...\")\n", + " model_name = \"HuggingFaceTB/SmolLM-135M\"\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "\n", + " if tokenizer.pad_token is None:\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " model.config.pad_token_id = tokenizer.pad_token_id\n", + "\n", + " # Split dataset\n", + " print(\"Splitting dataset...\")\n", + " train_size = int(0.9 * len(dataset['train']))\n", + " val_size = len(dataset['train']) - train_size\n", + " train_dataset, val_dataset 
= torch.utils.data.random_split(\n", + " dataset['train'],\n", + " [train_size, val_size]\n", + " )\n", + "\n", + " # Prepare datasets\n", + " print(\"Preparing training dataset...\")\n", + " train_dataset = ConversationDataset(train_dataset, tokenizer)\n", + " print(\"Preparing validation dataset...\")\n", + " eval_dataset = ConversationDataset(val_dataset, tokenizer)\n", + "\n", + " # Training arguments\n", + " training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " warmup_steps=500,\n", + " weight_decay=0.01,\n", + " logging_dir=\"./logs\",\n", + " logging_steps=100,\n", + " evaluation_strategy=\"steps\",\n", + " eval_steps=500,\n", + " save_strategy=\"steps\",\n", + " save_steps=500,\n", + " load_best_model_at_end=True,\n", + " gradient_accumulation_steps=4,\n", + " fp16=True,\n", + " report_to=\"none\"\n", + " )\n", + "\n", + " # Initialize trainer\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", + " )\n", + "\n", + " # Start training\n", + " trainer.train()\n", + "\n", + " # Save model\n", + " model_save_path = \"./fine_tuned_smolLM\"\n", + " trainer.save_model(model_save_path)\n", + " tokenizer.save_pretrained(model_save_path)" + ], + "metadata": { + "id": "4qdPDUcXdboZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "if __name__ == \"__main__\":\n", + " main()" + ], + "metadata": { + "id": "Y9a_q7mxdfLA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Functions and test code needed for model evaluation\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "import torch\n", + "\n", + "def model_yukle():\n", + " model_path = \"./fine_tuned_smolLM\"\n", + " model = AutoModelForCausalLM.from_pretrained(model_path)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + " return model, tokenizer\n", + "\n", + "def yanit_uret(prompt, model, tokenizer, max_length=250):\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " model = model.to(device)\n", + "\n", + " # Prepare a CoT-style prompt\n", + " formatted_prompt = f\"Question: {prompt}\\nLet's solve this step by step:\\n\"\n", + "\n", + " inputs = tokenizer(formatted_prompt, return_tensors=\"pt\").to(device)\n", + "\n", + " with torch.no_grad():\n", + " outputs = model.generate(\n", + " inputs[\"input_ids\"],\n", + " max_length=max_length,\n", + " num_return_sequences=1,\n", + " temperature=0.7,\n", + " top_p=0.9,\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.pad_token_id\n", + " )\n", + "\n", + " return tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "# Chain-of-Thought style test questions\n", + "test_ornekleri = [\n", + " \"What is the sum of the first 5 prime numbers?\",\n", + " \"What is 1 + 1?\"\n", + "]\n", + "\n", + "# Load and test the model\n", + "print(\"Model Evaluation Results:\")\n", + "print(\"-\" * 70)\n", + "\n", + "model, tokenizer = model_yukle()\n", + "\n", + "for ornek in test_ornekleri:\n", + " print(f\"\\nQuestion: {ornek}\")\n", + " yanit = yanit_uret(ornek, model, tokenizer)\n", + " print(f\"Response:\\n{yanit}\")\n", + " print(\"-\" * 70)" + ], + "metadata": { + "id": "3lrV75_AlX14" + }, + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", + "dataset[\"train\"][0]" + ], + "metadata": { + "id": "67UiY-lbqrBV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset[\"train\"].shape" + ], + "metadata": { + "id": "hDrgPIDwqtYF" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file diff --git a/Genel-4/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb b/Genel-4/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb index 7b5c5df..45d3d85 100644 --- a/Genel-4/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb +++ b/Genel-4/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb @@ -1,1673 +1,1673 @@ { - "metadata": { - "kernelspec": { - "language": "python", - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.12", - "mimetype": "text/x-python", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "pygments_lexer": "ipython3", - "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kaggle": { - "accelerator": "none", - "dataSources": [], - "dockerImageVersionId": 30887, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook", - "isGpuEnabled": false - }, - "colab": { - "provenance": [], - "include_colab_link": true - } + "metadata": { + "kernelspec": { + "language": "python", + "display_name": "Python 3", + "name": "python3" }, - "nbformat_minor": 0, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).\n", - "# Source for \"Build a Large Language Model From Scratch\"\n", - "# - https://www.manning.com/books/build-a-large-language-model-from-scratch\n", - "# Code: https://github.com/rasbt/LLMs-from-scratch\n", - "\n", - "# This file collects all the relevant code that we covered thus far\n", - "# throughout Chapters 2-4.\n", - "# This file can be run as a standalone script.\n", - "\n", - "import tiktoken\n", - "import torch\n", - "import torch.nn as nn\n", - "from torch.utils.data import Dataset, DataLoader\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 2\n", - "#####################################\n", - "\n", - "class GPTDatasetV1(Dataset):\n", - " def __init__(self, txt, tokenizer, max_length, stride):\n", - " self.input_ids = []\n", - " self.target_ids = []\n", - "\n", - " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", - "\n", - " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", - " for i in range(0, len(token_ids) - max_length, stride):\n", - " input_chunk = token_ids[i:i + max_length]\n", - " target_chunk = token_ids[i + 1: i + max_length + 1]\n", - " self.input_ids.append(torch.tensor(input_chunk))\n", - " self.target_ids.append(torch.tensor(target_chunk))\n", - "\n", - " def __len__(self):\n", - " return len(self.input_ids)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.input_ids[idx], self.target_ids[idx]\n", - "\n", - "\n", - "def create_dataloader_v1(txt, batch_size=4, max_length=256,\n", - " stride=128, shuffle=True, drop_last=True, num_workers=0):\n", - " 
# Initialize the tokenizer\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - "\n", - " # Create dataset\n", - " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", - "\n", - " # Create dataloader\n", - " dataloader = DataLoader(\n", - " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", - "\n", - " return dataloader\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 3\n", - "#####################################\n", - "\n", - "class MultiHeadAttention(nn.Module):\n", - " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", - " super().__init__()\n", - " assert d_out % num_heads == 0, \"d_out must be divisible by n_heads\"\n", - "\n", - " self.d_out = d_out\n", - " self.num_heads = num_heads\n", - " self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim\n", - "\n", - " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", - "\n", - " def forward(self, x):\n", - " b, num_tokens, d_in = x.shape\n", - "\n", - " keys = self.W_key(x) # Shape: (b, num_tokens, d_out)\n", - " queries = self.W_query(x)\n", - " values = self.W_value(x)\n", - "\n", - " # We implicitly split the matrix by adding a `num_heads` dimension\n", - " # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)\n", - " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", - " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", - " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", - "\n", - " # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)\n", - " keys = keys.transpose(1, 2)\n", - " queries = queries.transpose(1, 2)\n", - " values = values.transpose(1, 2)\n", - "\n", - " # Compute scaled dot-product attention (aka self-attention) with a causal mask\n", - " attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head\n", - "\n", - " # Original mask truncated to the number of tokens and converted to boolean\n", - " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", - "\n", - " # Use the mask to fill attention scores\n", - " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", - "\n", - " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", - " attn_weights = self.dropout(attn_weights)\n", - "\n", - " # Shape: (b, num_tokens, num_heads, head_dim)\n", - " context_vec = (attn_weights @ values).transpose(1, 2)\n", - "\n", - " # Combine heads, where self.d_out = self.num_heads * self.head_dim\n", - " context_vec = context_vec.reshape(b, num_tokens, self.d_out)\n", - " context_vec = self.out_proj(context_vec) # optional projection\n", - "\n", - " return context_vec\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 4\n", - "#####################################\n", - "\n", - "class LayerNorm(nn.Module):\n", - " def __init__(self, emb_dim):\n", - " super().__init__()\n", - " self.eps = 1e-5\n", - " self.scale = nn.Parameter(torch.ones(emb_dim))\n", - " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", - "\n", - " def forward(self, 
x):\n", - " mean = x.mean(dim=-1, keepdim=True)\n", - " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", - " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", - " return self.scale * norm_x + self.shift\n", - "\n", - "\n", - "class GELU(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - "\n", - " def forward(self, x):\n", - " return 0.5 * x * (1 + torch.tanh(\n", - " torch.sqrt(torch.tensor(2.0 / torch.pi)) *\n", - " (x + 0.044715 * torch.pow(x, 3))\n", - " ))\n", - "\n", - "\n", - "class FeedForward(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.layers = nn.Sequential(\n", - " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", - " GELU(),\n", - " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.layers(x)\n", - "\n", - "\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.att = MultiHeadAttention(\n", - " d_in=cfg[\"emb_dim\"],\n", - " d_out=cfg[\"emb_dim\"],\n", - " context_length=cfg[\"context_length\"],\n", - " num_heads=cfg[\"n_heads\"],\n", - " dropout=cfg[\"drop_rate\"],\n", - " qkv_bias=cfg[\"qkv_bias\"])\n", - " self.ff = FeedForward(cfg)\n", - " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " def forward(self, x):\n", - " # Shortcut connection for attention block\n", - " shortcut = x\n", - " x = self.norm1(x)\n", - " x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut # Add the original input back\n", - "\n", - " # Shortcut connection for feed-forward block\n", - " shortcut = x\n", - " x = self.norm2(x)\n", - " x = self.ff(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut # Add the original input back\n", - "\n", - " return x\n", - "\n", - "\n", - "class GPTModel(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", - " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", - " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " self.trf_blocks = nn.Sequential(\n", - " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", - "\n", - " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", - " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", - "\n", - " def forward(self, in_idx):\n", - " batch_size, seq_len = in_idx.shape\n", - " tok_embeds = self.tok_emb(in_idx)\n", - " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", - " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n", - " x = self.drop_emb(x)\n", - " x = self.trf_blocks(x)\n", - " x = self.final_norm(x)\n", - " logits = self.out_head(x)\n", - " return logits\n", - "\n", - "\n", - "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", - " # idx is (B, T) array of indices in the current context\n", - " for _ in range(max_new_tokens):\n", - "\n", - " # Crop current context if it exceeds the supported context size\n", - " # E.g., if LLM supports only 5 tokens, and the context size is 10\n", - " # then only the last 5 tokens are used as context\n", - " idx_cond = idx[:, -context_size:]\n", - "\n", - " # Get the predictions\n", - " with torch.no_grad():\n", - " logits = model(idx_cond)\n", - "\n", - " # Focus 
only on the last time step\n", - " # (batch, n_token, vocab_size) becomes (batch, vocab_size)\n", - " logits = logits[:, -1, :]\n", - "\n", - " # Get the idx of the vocab entry with the highest logits value\n", - " idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1)\n", - "\n", - " # Append sampled index to the running sequence\n", - " idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)\n", - "\n", - " return idx\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 5\n", - "####################################\n", - "\n", - "\n", - "def calc_loss_batch(input_batch, target_batch, model, device):\n", - " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", - " logits = model(input_batch)\n", - " loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", - " return loss\n", - "\n", - "\n", - "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", - " total_loss = 0.\n", - " if len(data_loader) == 0:\n", - " return float(\"nan\")\n", - " elif num_batches is None:\n", - " num_batches = len(data_loader)\n", - " else:\n", - " num_batches = min(num_batches, len(data_loader))\n", - " for i, (input_batch, target_batch) in enumerate(data_loader):\n", - " if i < num_batches:\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " total_loss += loss.item()\n", - " else:\n", - " break\n", - " return total_loss / num_batches\n", - "\n", - "\n", - "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", - " model.eval()\n", - " with torch.no_grad():\n", - " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", - " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", - " model.train()\n", - " return train_loss, val_loss\n", - "\n", - "\n", - "def generate_and_print_sample(model, tokenizer, device, start_context):\n", - " model.eval()\n", - " context_size = model.pos_emb.weight.shape[0]\n", - " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", - " with torch.no_grad():\n", - " token_ids = generate_text_simple(\n", - " model=model, idx=encoded,\n", - " max_new_tokens=50, context_size=context_size)\n", - " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", - " print(decoded_text.replace(\"\\n\", \" \")) # Compact print format\n", - " model.train()\n", - "\n", - "\n", - "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", - " fig, ax1 = plt.subplots(figsize=(5, 3))\n", - "\n", - " # Plot training and validation loss against epochs\n", - " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", - " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", - " ax1.set_xlabel(\"Epochs\")\n", - " ax1.set_ylabel(\"Loss\")\n", - " ax1.legend(loc=\"upper right\")\n", - "\n", - " # Create a second x-axis for tokens seen\n", - " ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis\n", - " ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks\n", - " ax2.set_xlabel(\"Tokens seen\")\n", - "\n", - " fig.tight_layout() # Adjust layout to make room\n", - " # plt.show()\n", - "\n", - "\n", - "def text_to_token_ids(text, tokenizer):\n", - " encoded = tokenizer.encode(text)\n", - " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", - " return encoded_tensor\n", - "\n", - "\n", - "def token_ids_to_text(token_ids, tokenizer):\n", - " flat = 
token_ids.squeeze(0) # remove batch dimension\n", - " return tokenizer.decode(flat.tolist())\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:03:47.011719Z", - "iopub.execute_input": "2025-02-12T22:03:47.01204Z", - "iopub.status.idle": "2025-02-12T22:03:47.049181Z", - "shell.execute_reply.started": "2025-02-12T22:03:47.012016Z", - "shell.execute_reply": "2025-02-12T22:03:47.0478Z" - }, - "id": "ML-KCXoIw_I0" - }, - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "source": [ - "import time\n", - "import torch\n", - "import torch.optim as optim\n", - "\n", - "#####################################\n", - "# Ayarlar ve Veri Hazırlığı\n", - "#####################################\n", - "\n", - "def load_data():\n", - " # Demo amaçlı küçük bir metin. Gerçek uygulamada daha büyük bir corpus kullanılmalı.\n", - " text = (\"Once upon a time, in a land far, far away, there was a kingdom where magic was common \"\n", - " \"and adventure awaited around every corner. \") * 100 # metni tekrarlayarak uzunluyoruz\n", - " return text\n", - "\n", - "def prepare_dataloaders(text, batch_size=4, max_length=128, stride=64):\n", - " # Eğitim ve doğrulama için veriyi bölelim (örneğin, %90 eğitim, %10 doğrulama)\n", - " split_idx = int(0.9 * len(text))\n", - " train_text = text[:split_idx]\n", - " val_text = text[split_idx:]\n", - " train_loader = create_dataloader_v1(train_text, batch_size=batch_size,\n", - " max_length=max_length, stride=stride)\n", - " val_loader = create_dataloader_v1(val_text, batch_size=batch_size,\n", - " max_length=max_length, stride=stride)\n", - " return train_loader, val_loader\n", - "\n", - "#####################################\n", - "# Model Eğitimi\n", - "#####################################\n", - "\n", - "def train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10):\n", - " optimizer = optim.Adam(model.parameters(), lr=3e-4)\n", - " model.to(device)\n", - "\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " epoch_loss = 0.0\n", - " start_time = time.time()\n", - "\n", - " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", - " optimizer.zero_grad()\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " loss.backward()\n", - " optimizer.step()\n", - " epoch_loss += loss.item()\n", - "\n", - " if (batch_idx + 1) % 10 == 0:\n", - " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", - "\n", - " avg_loss = epoch_loss / len(train_loader)\n", - " elapsed = time.time() - start_time\n", - " print(f\"Epoch {epoch+1} tamamlandı (süre: {elapsed:.2f}s), ort. 
loss: {avg_loss:.4f}\")\n", - "\n", - " # Kısa bir değerlendirme: eğitim ve doğrulama loss değerlerini hesapla\n", - " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", - " print(f\"Epoch {epoch+1} değerlendirme: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", - "\n", - "#####################################\n", - "# Metin Üretimi\n", - "#####################################\n", - "\n", - "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", - " print(\"Üretilen metin örneği:\\n\")\n", - " generate_and_print_sample(model, tokenizer, device, prompt)\n", - "\n", - "#####################################\n", - "# Ana Fonksiyon\n", - "#####################################\n", - "\n", - "def main():\n", - " # Cihaz seçimi (GPU varsa kullanılır)\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " print(f\"Kullanılan cihaz: {device}\")\n", - "\n", - " # Veri hazırlığı\n", - " text = load_data()\n", - " train_loader, val_loader = prepare_dataloaders(text, batch_size=4, max_length=128, stride=64)\n", - "\n", - " # Tokenizer ve model konfigürasyonu\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " cfg = {\n", - " \"vocab_size\": tokenizer.n_vocab, # Tokenizer'ın sözlüğündeki kelime sayısı\n", - " \"emb_dim\": 128, # Küçük bir embedding boyutu (demo amaçlı)\n", - " \"context_length\": 128, # Maksimum dizi uzunluğu\n", - " \"drop_rate\": 0.1,\n", - " \"n_layers\": 8, # Katman sayısı\n", - " \"n_heads\": 4, # Çoklu başlık sayısı (emb_dim'in tam böleni olmalı)\n", - " \"qkv_bias\": True,\n", - " }\n", - "\n", - " # Model oluşturulması\n", - " model = GPTModel(cfg)\n", - "\n", - " # Modelin eğitimi\n", - " train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10)\n", - "\n", - " # Eğitim bittikten sonra, bir başlangıç prompt'u ile metin üretimi yapalım\n", - " prompt = \"Once upon a time \"\n", - " generate_sample(model, tokenizer, device, prompt, max_new_tokens=50)\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T18:01:02.043298Z", - "iopub.execute_input": "2025-02-12T18:01:02.043586Z", - "iopub.status.idle": "2025-02-12T18:01:04.598147Z", - "shell.execute_reply.started": "2025-02-12T18:01:02.043567Z", - "shell.execute_reply": "2025-02-12T18:01:04.597344Z" - }, - "id": "DEJyddCuw_I3" - }, - "outputs": [], - "execution_count": null + "language_info": { + "name": "python", + "version": "3.10.12", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kaggle": { + "accelerator": "none", + "dataSources": [], + "dockerImageVersionId": 30887, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook", + "isGpuEnabled": false + }, + "colab": { + "provenance": [], + "include_colab_link": true + } + }, + "nbformat_minor": 0, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).\n", + "# Source for \"Build a Large Language Model From Scratch\"\n", + "# - https://www.manning.com/books/build-a-large-language-model-from-scratch\n", + "# Code: 
https://github.com/rasbt/LLMs-from-scratch\n", + "\n", + "# This file collects all the relevant code that we covered thus far\n", + "# throughout Chapters 2-4.\n", + "# This file can be run as a standalone script.\n", + "\n", + "import tiktoken\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 2\n", + "#####################################\n", + "\n", + "class GPTDatasetV1(Dataset):\n", + " def __init__(self, txt, tokenizer, max_length, stride):\n", + " self.input_ids = []\n", + " self.target_ids = []\n", + "\n", + " # Tokenize the entire text\n", + " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", + "\n", + " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", + " for i in range(0, len(token_ids) - max_length, stride):\n", + " input_chunk = token_ids[i:i + max_length]\n", + " target_chunk = token_ids[i + 1: i + max_length + 1]\n", + " self.input_ids.append(torch.tensor(input_chunk))\n", + " self.target_ids.append(torch.tensor(target_chunk))\n", + "\n", + " def __len__(self):\n", + " return len(self.input_ids)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.input_ids[idx], self.target_ids[idx]\n", + "\n", + "\n", + "def create_dataloader_v1(txt, batch_size=4, max_length=256,\n", + " stride=128, shuffle=True, drop_last=True, num_workers=0):\n", + " # Initialize the tokenizer\n", + " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "\n", + " # Create dataset\n", + " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", + "\n", + " # Create dataloader\n", + " dataloader = DataLoader(\n", + " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", + "\n", + " return dataloader\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 3\n", + "#####################################\n", + "\n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", + " super().__init__()\n", + " assert d_out % num_heads == 0, \"d_out must be divisible by n_heads\"\n", + "\n", + " self.d_out = d_out\n", + " self.num_heads = num_heads\n", + " self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim\n", + "\n", + " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", + "\n", + " def forward(self, x):\n", + " b, num_tokens, d_in = x.shape\n", + "\n", + " keys = self.W_key(x) # Shape: (b, num_tokens, d_out)\n", + " queries = self.W_query(x)\n", + " values = self.W_value(x)\n", + "\n", + " # We implicitly split the matrix by adding a `num_heads` dimension\n", + " # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)\n", + " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", + " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", + " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", + "\n", + " # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, 
num_heads, num_tokens, head_dim)\n", + " keys = keys.transpose(1, 2)\n", + " queries = queries.transpose(1, 2)\n", + " values = values.transpose(1, 2)\n", + "\n", + " # Compute scaled dot-product attention (aka self-attention) with a causal mask\n", + " attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head\n", + "\n", + " # Original mask truncated to the number of tokens and converted to boolean\n", + " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", + "\n", + " # Use the mask to fill attention scores\n", + " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", + "\n", + " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", + " attn_weights = self.dropout(attn_weights)\n", + "\n", + " # Shape: (b, num_tokens, num_heads, head_dim)\n", + " context_vec = (attn_weights @ values).transpose(1, 2)\n", + "\n", + " # Combine heads, where self.d_out = self.num_heads * self.head_dim\n", + " context_vec = context_vec.reshape(b, num_tokens, self.d_out)\n", + " context_vec = self.out_proj(context_vec) # optional projection\n", + "\n", + " return context_vec\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 4\n", + "#####################################\n", + "\n", + "class LayerNorm(nn.Module):\n", + " def __init__(self, emb_dim):\n", + " super().__init__()\n", + " self.eps = 1e-5\n", + " self.scale = nn.Parameter(torch.ones(emb_dim))\n", + " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", + "\n", + " def forward(self, x):\n", + " mean = x.mean(dim=-1, keepdim=True)\n", + " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", + " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", + " return self.scale * norm_x + self.shift\n", + "\n", + "\n", + "class GELU(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " def forward(self, x):\n", + " return 0.5 * x * (1 + torch.tanh(\n", + " torch.sqrt(torch.tensor(2.0 / torch.pi)) *\n", + " (x + 0.044715 * torch.pow(x, 3))\n", + " ))\n", + "\n", + "\n", + "class FeedForward(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.layers = nn.Sequential(\n", + " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", + " GELU(),\n", + " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.layers(x)\n", + "\n", + "\n", + "class TransformerBlock(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.att = MultiHeadAttention(\n", + " d_in=cfg[\"emb_dim\"],\n", + " d_out=cfg[\"emb_dim\"],\n", + " context_length=cfg[\"context_length\"],\n", + " num_heads=cfg[\"n_heads\"],\n", + " dropout=cfg[\"drop_rate\"],\n", + " qkv_bias=cfg[\"qkv_bias\"])\n", + " self.ff = FeedForward(cfg)\n", + " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " def forward(self, x):\n", + " # Shortcut connection for attention block\n", + " shortcut = x\n", + " x = self.norm1(x)\n", + " x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut # Add the original input back\n", + "\n", + " # Shortcut connection for feed-forward block\n", + " shortcut = x\n", + " x = self.norm2(x)\n", + " x = self.ff(x)\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut # Add the original input back\n", + "\n", + " return x\n", + "\n", + "\n", + "class GPTModel(nn.Module):\n", + " def __init__(self, 
cfg):\n", + " super().__init__()\n", + " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", + " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", + " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " self.trf_blocks = nn.Sequential(\n", + " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", + "\n", + " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", + " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", + "\n", + " def forward(self, in_idx):\n", + " batch_size, seq_len = in_idx.shape\n", + " tok_embeds = self.tok_emb(in_idx)\n", + " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", + " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n", + " x = self.drop_emb(x)\n", + " x = self.trf_blocks(x)\n", + " x = self.final_norm(x)\n", + " logits = self.out_head(x)\n", + " return logits\n", + "\n", + "\n", + "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", + " # idx is (B, T) array of indices in the current context\n", + " for _ in range(max_new_tokens):\n", + "\n", + " # Crop current context if it exceeds the supported context size\n", + " # E.g., if LLM supports only 5 tokens, and the context size is 10\n", + " # then only the last 5 tokens are used as context\n", + " idx_cond = idx[:, -context_size:]\n", + "\n", + " # Get the predictions\n", + " with torch.no_grad():\n", + " logits = model(idx_cond)\n", + "\n", + " # Focus only on the last time step\n", + " # (batch, n_token, vocab_size) becomes (batch, vocab_size)\n", + " logits = logits[:, -1, :]\n", + "\n", + " # Get the idx of the vocab entry with the highest logits value\n", + " idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1)\n", + "\n", + " # Append sampled index to the running sequence\n", + " idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)\n", + "\n", + " return idx\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 5\n", + "####################################\n", + "\n", + "\n", + "def calc_loss_batch(input_batch, target_batch, model, device):\n", + " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", + " logits = model(input_batch)\n", + " loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", + " return loss\n", + "\n", + "\n", + "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", + " total_loss = 0.\n", + " if len(data_loader) == 0:\n", + " return float(\"nan\")\n", + " elif num_batches is None:\n", + " num_batches = len(data_loader)\n", + " else:\n", + " num_batches = min(num_batches, len(data_loader))\n", + " for i, (input_batch, target_batch) in enumerate(data_loader):\n", + " if i < num_batches:\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " total_loss += loss.item()\n", + " else:\n", + " break\n", + " return total_loss / num_batches\n", + "\n", + "\n", + "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", + " model.eval()\n", + " with torch.no_grad():\n", + " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", + " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", + " model.train()\n", + " return train_loss, val_loss\n", + "\n", + "\n", + "def generate_and_print_sample(model, tokenizer, device, start_context):\n", + " model.eval()\n", + " context_size = model.pos_emb.weight.shape[0]\n", + " 
encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", + " with torch.no_grad():\n", + " token_ids = generate_text_simple(\n", + " model=model, idx=encoded,\n", + " max_new_tokens=50, context_size=context_size)\n", + " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", + " print(decoded_text.replace(\"\\n\", \" \")) # Compact print format\n", + " model.train()\n", + "\n", + "\n", + "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", + " fig, ax1 = plt.subplots(figsize=(5, 3))\n", + "\n", + " # Plot training and validation loss against epochs\n", + " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", + " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", + " ax1.set_xlabel(\"Epochs\")\n", + " ax1.set_ylabel(\"Loss\")\n", + " ax1.legend(loc=\"upper right\")\n", + "\n", + " # Create a second x-axis for tokens seen\n", + " ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis\n", + " ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks\n", + " ax2.set_xlabel(\"Tokens seen\")\n", + "\n", + " fig.tight_layout() # Adjust layout to make room\n", + " # plt.show()\n", + "\n", + "\n", + "def text_to_token_ids(text, tokenizer):\n", + " encoded = tokenizer.encode(text)\n", + " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", + " return encoded_tensor\n", + "\n", + "\n", + "def token_ids_to_text(token_ids, tokenizer):\n", + " flat = token_ids.squeeze(0) # remove batch dimension\n", + " return tokenizer.decode(flat.tolist())\n" + ], + "metadata": { + "trusted": true, + "execution": { + "iopub.status.busy": "2025-02-12T22:03:47.011719Z", + "iopub.execute_input": "2025-02-12T22:03:47.01204Z", + "iopub.status.idle": "2025-02-12T22:03:47.049181Z", + "shell.execute_reply.started": "2025-02-12T22:03:47.012016Z", + "shell.execute_reply": "2025-02-12T22:03:47.0478Z" }, - { - "cell_type": "code", - "source": [ - "import time\n", - "import math\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "from torch.utils.data import Dataset, DataLoader\n", - "import matplotlib.pyplot as plt\n", - "import tiktoken\n", - "from datasets import load_dataset # Hugging Face datasets library\n", - "import re\n", - "\n", - "#####################################\n", - "# Rotary Positional Embeddings (ROPE) Implementation\n", - "#####################################\n", - "def apply_rotary_pos_emb(x):\n", - " \"\"\"\n", - " Apply Rotary Positional Embeddings (ROPE) to the input tensor.\n", - "\n", - " Args:\n", - " x (torch.Tensor): Input tensor of shape (batch, num_heads, seq_len, head_dim).\n", - "\n", - " Returns:\n", - " torch.Tensor: Tensor with ROPE applied.\n", - " \"\"\"\n", - " batch, n_heads, seq_len, head_dim = x.shape\n", - " assert head_dim % 2 == 0, \"head_dim must be even for ROPE\"\n", - "\n", - " # Calculate inverse frequencies and positions\n", - " inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", - " positions = torch.arange(seq_len, device=x.device).float()\n", - " sinusoid_inp = torch.einsum(\"i,j->ij\", positions, inv_freq) # (seq_len, head_dim/2)\n", - " sin = torch.sin(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", - " cos = torch.cos(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", - "\n", - " # Split the input tensor into two halves and apply ROPE\n", - " x1, x2 = x[..., :head_dim//2], x[..., head_dim//2:]\n", - " x_rotated 
= torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", - " return x_rotated\n", - "\n", - "#####################################\n", - "# Dataset and DataLoader: Wikitext (Hugging Face)\n", - "#####################################\n", - "class GPTDatasetV1(Dataset):\n", - " def __init__(self, text, tokenizer, max_length, stride):\n", - " self.input_ids = []\n", - " self.target_ids = []\n", - "\n", - " # Tokenize the text\n", - " token_ids = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", - "\n", - " # Create input-target pairs\n", - " for i in range(0, len(token_ids) - max_length, stride):\n", - " input_chunk = token_ids[i:i + max_length]\n", - " target_chunk = token_ids[i + 1: i + max_length + 1]\n", - " self.input_ids.append(torch.tensor(input_chunk))\n", - " self.target_ids.append(torch.tensor(target_chunk))\n", - "\n", - " def __len__(self):\n", - " return len(self.input_ids)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.input_ids[idx], self.target_ids[idx]\n", - "\n", - "def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " dataset = GPTDatasetV1(text, tokenizer, max_length, stride)\n", - " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", - " return dataloader\n", - "\n", - "def load_wikitext_data(num_lines=10000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\"):\n", - " \"\"\"\n", - " Load Wikitext data from Hugging Face and concatenate the first `num_lines` lines into a single text.\n", - "\n", - " Args:\n", - " num_lines (int): Number of lines to load.\n", - " dataset_name (str): Name of the dataset.\n", - " subset (str): Subset of the dataset.\n", - "\n", - " Returns:\n", - " str: Concatenated text.\n", - " \"\"\"\n", - " ds = load_dataset(dataset_name, subset)\n", - " text_lines = ds[\"train\"][\"text\"][:num_lines]\n", - " text = \"\\n\".join(text_lines)\n", - " return text\n", - "\n", - "def preprocess_text(text):\n", - " \"\"\"\n", - " Preprocess the text data by removing unwanted characters and normalizing whitespace.\n", - "\n", - " Args:\n", - " text (str): Input text.\n", - "\n", - " Returns:\n", - " str: Preprocessed text.\n", - " \"\"\"\n", - " # Remove special characters and digits, and normalize whitespace\n", - " text = re.sub(r'[^A-Za-z\\s]', '', text)\n", - " text = re.sub(r'\\s+', ' ', text).strip()\n", - " return text\n", - "\n", - "#####################################\n", - "# Advanced Model Components (GPTModel)\n", - "#####################################\n", - "\n", - "class MultiHeadAttention(nn.Module):\n", - " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, use_rope=False):\n", - " super().__init__()\n", - " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", - " self.d_out = d_out\n", - " self.num_heads = num_heads\n", - " self.head_dim = d_out // num_heads\n", - "\n", - " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.out_proj = nn.Linear(d_out, d_out)\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.use_rope = use_rope\n", - " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", - "\n", - " def forward(self, x):\n", - " b, num_tokens, _ = x.shape\n", - 
"\n", - " keys = self.W_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - " queries = self.W_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - " values = self.W_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - "\n", - " if self.use_rope:\n", - " queries = apply_rotary_pos_emb(queries)\n", - " keys = apply_rotary_pos_emb(keys)\n", - "\n", - " attn_scores = queries @ keys.transpose(2, 3)\n", - " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", - " attn_scores.masked_fill_(mask_bool, float(\"-inf\"))\n", - "\n", - " attn_weights = torch.softmax(attn_scores / math.sqrt(self.head_dim), dim=-1)\n", - " attn_weights = self.dropout(attn_weights)\n", - "\n", - " context_vec = (attn_weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)\n", - " context_vec = self.out_proj(context_vec)\n", - " return context_vec\n", - "\n", - "class LayerNorm(nn.Module):\n", - " def __init__(self, emb_dim):\n", - " super().__init__()\n", - " self.eps = 1e-5\n", - " self.scale = nn.Parameter(torch.ones(emb_dim))\n", - " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", - "\n", - " def forward(self, x):\n", - " mean = x.mean(dim=-1, keepdim=True)\n", - " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", - " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", - " return self.scale * norm_x + self.shift\n", - "\n", - "class GELU(nn.Module):\n", - " def forward(self, x):\n", - " return 0.5 * x * (1 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n", - "\n", - "class FeedForward(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.layers = nn.Sequential(\n", - " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", - " GELU(),\n", - " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.layers(x)\n", - "\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.att = MultiHeadAttention(\n", - " d_in=cfg[\"emb_dim\"],\n", - " d_out=cfg[\"emb_dim\"],\n", - " context_length=cfg[\"context_length\"],\n", - " num_heads=cfg[\"n_heads\"],\n", - " dropout=cfg[\"drop_rate\"],\n", - " qkv_bias=cfg[\"qkv_bias\"],\n", - " use_rope=cfg.get(\"use_rope\", False)\n", - " )\n", - " self.ff = FeedForward(cfg)\n", - " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " def forward(self, x):\n", - " shortcut = x\n", - " x = self.norm1(x)\n", - " x = self.att(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut\n", - "\n", - " shortcut = x\n", - " x = self.norm2(x)\n", - " x = self.ff(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut\n", - " return x\n", - "\n", - "class GPTModel(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", - " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", - " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", - "\n", - " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", - " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", - "\n", - " def forward(self, in_idx):\n", - " batch_size, seq_len = in_idx.shape\n", - " tok_embeds 
= self.tok_emb(in_idx)\n", - " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", - " x = tok_embeds + pos_embeds\n", - " x = self.drop_emb(x)\n", - " x = self.trf_blocks(x)\n", - " x = self.final_norm(x)\n", - " logits = self.out_head(x)\n", - " return logits\n", - "\n", - "#####################################\n", - "# Training and Evaluation Functions\n", - "#####################################\n", - "def calc_loss_batch(input_batch, target_batch, model, device):\n", - " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", - " logits = model(input_batch)\n", - " loss = nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", - " return loss\n", - "\n", - "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", - " total_loss = 0.0\n", - " if len(data_loader) == 0:\n", - " return float(\"nan\")\n", - " num_batches = num_batches if num_batches is not None else len(data_loader)\n", - " num_batches = min(num_batches, len(data_loader))\n", - " for i, (input_batch, target_batch) in enumerate(data_loader):\n", - " if i < num_batches:\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " total_loss += loss.item()\n", - " else:\n", - " break\n", - " return total_loss / num_batches\n", - "\n", - "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", - " model.eval()\n", - " with torch.no_grad():\n", - " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", - " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", - " model.train()\n", - " return train_loss, val_loss\n", - "\n", - "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", - " for _ in range(max_new_tokens):\n", - " idx_cond = idx[:, -context_size:]\n", - " with torch.no_grad():\n", - " logits = model(idx_cond)\n", - " logits = logits[:, -1, :]\n", - " idx_next = torch.argmax(logits, dim=-1, keepdim=True)\n", - " idx = torch.cat((idx, idx_next), dim=1)\n", - " return idx\n", - "\n", - "def generate_and_print_sample(model, tokenizer, device, start_context):\n", - " model.eval()\n", - " context_size = model.pos_emb.weight.shape[0]\n", - " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", - " with torch.no_grad():\n", - " token_ids = generate_text_simple(model=model, idx=encoded, max_new_tokens=50, context_size=context_size)\n", - " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", - " print(decoded_text.replace(\"\\n\", \" \"))\n", - " model.train()\n", - "\n", - "def text_to_token_ids(text, tokenizer):\n", - " encoded = tokenizer.encode(text)\n", - " return torch.tensor(encoded).unsqueeze(0)\n", - "\n", - "def token_ids_to_text(token_ids, tokenizer):\n", - " flat = token_ids.squeeze(0)\n", - " return tokenizer.decode(flat.tolist())\n", - "\n", - "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", - " fig, ax1 = plt.subplots(figsize=(5, 3))\n", - " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", - " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", - " ax1.set_xlabel(\"Epochs\")\n", - " ax1.set_ylabel(\"Loss\")\n", - " ax1.legend(loc=\"upper right\")\n", - " ax2 = ax1.twiny()\n", - " ax2.plot(tokens_seen, train_losses, alpha=0)\n", - " ax2.set_xlabel(\"Tokens seen\")\n", - " fig.tight_layout()\n", - " plt.show()\n", - "\n", - "#####################################\n", - "# Model Training\n", - 
"#####################################\n", - "def train_model(model, train_loader, val_loader, device, epochs=30, eval_iter=20, lr=1e-4):\n", - " optimizer = optim.Adam(model.parameters(), lr=lr)\n", - " model.to(device)\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " epoch_loss = 0.0\n", - " start_time = time.time()\n", - "\n", - " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", - " optimizer.zero_grad()\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " loss.backward()\n", - " optimizer.step()\n", - " epoch_loss += loss.item()\n", - " if (batch_idx + 1) % 10 == 0:\n", - " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", - "\n", - " avg_loss = epoch_loss / len(train_loader)\n", - " elapsed = time.time() - start_time\n", - " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. loss: {avg_loss:.4f}\")\n", - " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", - " print(f\"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", - "\n", - "#####################################\n", - "# Text Generation\n", - "#####################################\n", - "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", - " print(\"Generated text sample:\\n\")\n", - " generate_and_print_sample(model, tokenizer, device, prompt)\n", - "\n", - "#####################################\n", - "# Main Function\n", - "#####################################\n", - "def main():\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " print(f\"Using device: {device}\")\n", - "\n", - " # Load Wikitext data from Hugging Face and use the first 50k lines\n", - " text = load_wikitext_data(num_lines=50000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\")\n", - "\n", - " # Preprocess the text data\n", - " text = preprocess_text(text)\n", - "\n", - " # Split data into training and validation sets (e.g., 90% train, 10% validation)\n", - " split_idx = int(0.9 * len(text))\n", - " train_text = text[:split_idx]\n", - " val_text = text[split_idx:]\n", - " train_loader = create_dataloader_v1(train_text, batch_size=8, max_length=256, stride=128)\n", - " val_loader = create_dataloader_v1(val_text, batch_size=8, max_length=256, stride=128)\n", - "\n", - " # Tokenizer and advanced model configuration\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " cfg = {\n", - " \"vocab_size\": tokenizer.n_vocab,\n", - " \"emb_dim\": 256,\n", - " \"context_length\": 256,\n", - " \"drop_rate\": 0.1,\n", - " \"n_layers\": 6,\n", - " \"n_heads\": 8,\n", - " \"qkv_bias\": True,\n", - " \"use_rope\": True,\n", - " }\n", - "\n", - " model = GPTModel(cfg)\n", - " train_model(model, train_loader, val_loader, device, epochs=1, eval_iter=25, lr=1e-5)\n", - "\n", - " # Generate text after training with a given prompt\n", - " prompt = \"Valkyria Chronicles III \"\n", - " generate_sample(model, tokenizer, device, prompt, max_new_tokens=100)\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T20:33:49.198352Z", - "iopub.execute_input": "2025-02-12T20:33:49.198767Z", - "iopub.status.idle": "2025-02-12T20:38:21.836553Z", - "shell.execute_reply.started": "2025-02-12T20:33:49.198738Z", - "shell.execute_reply": "2025-02-12T20:38:21.83572Z" - }, - "id": "2E8o3nAXw_I3" - }, - "outputs": [], - 
"execution_count": null + "id": "ML-KCXoIw_I0" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import time\n", + "import torch\n", + "import torch.optim as optim\n", + "\n", + "#####################################\n", + "# Settings and data preparation\n", + "#####################################\n", + "\n", + "def load_data():\n", + " # Small sample text for demonstration. Use a larger corpus in real applications.\n", + " text = (\"Once upon a time, in a land far, far away, there was a kingdom where magic was common \"\n", + " \"and adventure awaited around every corner. \") * 100 # metni tekrarlayarak uzunluyoruz\n", + " return text\n", + "\n", + "def prepare_dataloaders(text, batch_size=4, max_length=128, stride=64):\n", + " # Split the data for training and validation (e.g., 90% training, 10% validation)\n", + " split_idx = int(0.9 * len(text))\n", + " train_text = text[:split_idx]\n", + " val_text = text[split_idx:]\n", + " train_loader = create_dataloader_v1(train_text, batch_size=batch_size,\n", + " max_length=max_length, stride=stride)\n", + " val_loader = create_dataloader_v1(val_text, batch_size=batch_size,\n", + " max_length=max_length, stride=stride)\n", + " return train_loader, val_loader\n", + "\n", + "#####################################\n", + "# Model Trainingi\n", + "#####################################\n", + "\n", + "def train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10):\n", + " optimizer = optim.Adam(model.parameters(), lr=3e-4)\n", + " model.to(device)\n", + "\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " epoch_loss = 0.0\n", + " start_time = time.time()\n", + "\n", + " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " loss.backward()\n", + " optimizer.step()\n", + " epoch_loss += loss.item()\n", + "\n", + " if (batch_idx + 1) % 10 == 0:\n", + " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", + "\n", + " avg_loss = epoch_loss / len(train_loader)\n", + " elapsed = time.time() - start_time\n", + " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. 
+ "\n",
+ "        # Brief evaluation: compute training and validation losses\n",
+ "        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n",
+ "        print(f\"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n",
+ "\n",
+ "#####################################\n",
+ "# Text generation\n",
+ "#####################################\n",
+ "\n",
+ "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n",
+ "    print(\"Generated text example:\\n\")\n",
+ "    generate_and_print_sample(model, tokenizer, device, prompt)\n",
+ "\n",
+ "#####################################\n",
+ "# Main Function\n",
+ "#####################################\n",
+ "\n",
+ "def main():\n",
+ "    # Device selection (use GPU if available)\n",
+ "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "    print(f\"Device in use: {device}\")\n",
+ "\n",
+ "    # Data preparation\n",
+ "    text = load_data()\n",
+ "    train_loader, val_loader = prepare_dataloaders(text, batch_size=4, max_length=128, stride=64)\n",
+ "\n",
+ "    # Tokenizer and model configuration\n",
+ "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+ "    cfg = {\n",
+ "        \"vocab_size\": tokenizer.n_vocab,  # Number of tokens in the tokenizer vocabulary\n",
+ "        \"emb_dim\": 128,         # Small embedding dimension (for demo purposes)\n",
+ "        \"context_length\": 128,  # Maximum sequence length\n",
+ "        \"drop_rate\": 0.1,\n",
+ "        \"n_layers\": 8,          # Number of layers\n",
+ "        \"n_heads\": 4,           # Number of heads (must divide emb_dim exactly)\n",
+ "        \"qkv_bias\": True,\n",
+ "    }\n",
+ "\n",
+ "    # Model construction\n",
+ "    model = GPTModel(cfg)\n",
+ "\n",
+ "    # Model training\n",
+ "    train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10)\n",
+ "\n",
+ "    # After training, generate text with an initial prompt\n",
+ "    prompt = \"Once upon a time \"\n",
+ "    generate_sample(model, tokenizer, device, prompt, max_new_tokens=50)\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    main()\n"
+ ],
+ "metadata": {
+ "trusted": true,
+ "execution": {
+ "iopub.status.busy": "2025-02-12T18:01:02.043298Z",
+ "iopub.execute_input": "2025-02-12T18:01:02.043586Z",
+ "iopub.status.idle": "2025-02-12T18:01:04.598147Z",
+ "shell.execute_reply.started": "2025-02-12T18:01:02.043567Z",
+ "shell.execute_reply": "2025-02-12T18:01:04.597344Z"
},
- {
- "cell_type": "code",
- "source": [
- "import torch\n",
- "import torch.nn as nn\n",
- "import torch.optim as optim\n",
- "import math\n",
- "import time\n",
- "\n",
- "#############################################\n",
- "# 1. Alternatif Normalizasyon: RMSNorm\n",
- "#############################################\n",
- "class RMSNorm(nn.Module):\n",
- "    def __init__(self, emb_dim, eps=1e-8):\n",
- "        super().__init__()\n",
- "        self.eps = eps\n",
- "        self.scale = nn.Parameter(torch.ones(emb_dim))\n",
- "    def forward(self, x):\n",
- "        # x shape: (..., emb_dim)\n",
- "        norm_x = x / torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)\n",
- "        return self.scale * norm_x\n",
- "\n",
- "def get_norm(norm_type, emb_dim):\n",
- "    if norm_type == 'layernorm':\n",
- "        return nn.LayerNorm(emb_dim)\n",
- "    elif norm_type == 'rmsnorm':\n",
- "        return RMSNorm(emb_dim)\n",
- "    else:\n",
- "        raise ValueError(\"Unknown normalization type\")\n",
- "\n",
- "#############################################\n",
- "# 2. 
Ortak Konfigürasyon\n", - "#############################################\n", - "class Config:\n", - " def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,\n", - " dropout=0.1, norm_type='layernorm'):\n", - " self.vocab_size = vocab_size\n", - " self.emb_dim = emb_dim\n", - " self.max_length = max_length\n", - " self.n_layers = n_layers\n", - " self.n_heads = n_heads\n", - " self.dropout = dropout\n", - " self.norm_type = norm_type # 'layernorm' veya 'rmsnorm'\n", - " # Advanced varyantlar için ek parametreler:\n", - " self.latent_dim = emb_dim // 2 # RoPE ve latent projeksiyon için\n", - " self.num_experts = 4 # MoE FFN’de kullanılacak uzman sayısı\n", - "\n", - "#############################################\n", - "# --- Attention Modülleri ---\n", - "#############################################\n", - "# 1. Standard Dot-Product Attention\n", - "class StandardAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 2. 
RoPE Attention\n", - "def apply_rope(x, base=10000):\n", - " # x: (batch, n_heads, seq_len, head_dim)\n", - " batch, n_heads, seq_len, head_dim = x.shape\n", - " inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", - " pos = torch.arange(seq_len, device=x.device).float()\n", - " sinusoid_inp = torch.einsum(\"i,j->ij\", pos, inv_freq) # (seq_len, head_dim/2)\n", - " sin = torch.sin(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", - " cos = torch.cos(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", - " x1 = x[..., :head_dim//2]\n", - " x2 = x[..., head_dim//2:]\n", - " return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", - "\n", - "class RoPEAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " Q = apply_rope(Q)\n", - " K = apply_rope(K)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 3. FlashAttention benzeri Attention (placeholder)\n", - "def flash_attention(Q, K, V):\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))\n", - " attn = torch.softmax(scores, dim=-1)\n", - " return torch.matmul(attn, V)\n", - "\n", - "class FlashAttentionModule(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " context = flash_attention(Q, K, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 4. 
Multi-Query Attention: Keys & Values tek projeksiyon\n", - "class MultiQueryAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, self.head_dim)\n", - " self.v_proj = nn.Linear(emb_dim, self.head_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", - " V = self.v_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 5. ALiBi Attention: Lineer bias ekleyerek göreceli pozisyon bilgisini entegre eder (Press et al., 2021)\n", - "class ALiBiAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout, alibi_scaling=-1.0):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu, baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.alibi_scaling = alibi_scaling\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " # ALiBi bias: B[i,j] = (j - i) * scale\n", - " bias = torch.arange(seq_len, device=x.device).unsqueeze(0) - torch.arange(seq_len, device=x.device).unsqueeze(1)\n", - " bias = self.alibi_scaling * bias.float()\n", - " scores = scores + bias.unsqueeze(0).unsqueeze(0)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "#############################################\n", - "# --- FFN Varyantları ---\n", - "#############################################\n", - "# 1. Standart FFN\n", - "class StandardFFN(nn.Module):\n", - " def __init__(self, emb_dim, expansion=4, dropout=0.1):\n", - " super().__init__()\n", - " self.net = nn.Sequential(\n", - " nn.Linear(emb_dim, expansion * emb_dim),\n", - " nn.GELU(),\n", - " nn.Dropout(dropout),\n", - " nn.Linear(expansion * emb_dim, emb_dim)\n", - " )\n", - " def forward(self, x):\n", - " return self.net(x)\n", - "\n", - "# 2. 
MoE FFN\n", - "class MoEFFN(nn.Module):\n", - " def __init__(self, emb_dim, num_experts, expansion=4, dropout=0.1):\n", - " super().__init__()\n", - " self.num_experts = num_experts\n", - " self.experts = nn.ModuleList([\n", - " nn.Sequential(\n", - " nn.Linear(emb_dim, expansion * emb_dim),\n", - " nn.GELU(),\n", - " nn.Dropout(dropout),\n", - " nn.Linear(expansion * emb_dim, emb_dim)\n", - " ) for _ in range(num_experts)\n", - " ])\n", - " self.gate = nn.Linear(emb_dim, num_experts)\n", - "\n", - " def forward(self, x):\n", - " gate_scores = torch.softmax(self.gate(x), dim=-1) # (batch, seq_len, num_experts)\n", - " expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1) # (batch, seq_len, emb_dim, num_experts)\n", - " gate_scores = gate_scores.unsqueeze(2) # (batch, seq_len, 1, num_experts)\n", - " return (expert_outputs * gate_scores).sum(dim=-1)\n", - "\n", - "#############################################\n", - "# --- Transformer Bloğu: Seçilebilir Attention ve FFN varyantları, Dropout, Pre-Norm ---\n", - "#############################################\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, attn_module, ffn_module, dropout, norm_type):\n", - " super().__init__()\n", - " self.norm1 = get_norm(norm_type, emb_dim)\n", - " self.attn = attn_module(emb_dim, n_heads, dropout)\n", - " self.norm2 = get_norm(norm_type, emb_dim)\n", - " self.ffn = ffn_module(emb_dim, dropout=dropout) # ffn_module: StandardFFN or MoEFFN (for MoE, lambda is used)\n", - "\n", - " def forward(self, x):\n", - " x = x + self.attn(self.norm1(x))\n", - " x = x + self.ffn(self.norm2(x))\n", - " return x\n", - "\n", - "#############################################\n", - "# --- Transformer Modeli: Farklı varyantların seçilebildiği yapı ---\n", - "#############################################\n", - "class TransformerModel(nn.Module):\n", - " def __init__(self, config, attn_variant='standard', ffn_variant='standard'):\n", - " super().__init__()\n", - " self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)\n", - " self.pos_embed = nn.Embedding(config.max_length, config.emb_dim)\n", - "\n", - " attn_dict = {\n", - " 'standard': StandardAttention,\n", - " 'rope': RoPEAttention,\n", - " 'flash': FlashAttentionModule,\n", - " 'multiquery': MultiQueryAttention,\n", - " 'alibi': ALiBiAttention\n", - " }\n", - " ffn_dict = {\n", - " 'standard': StandardFFN,\n", - " 'moe': lambda emb_dim, dropout: MoEFFN(emb_dim, config.num_experts, dropout=dropout)\n", - " }\n", - " self.layers = nn.ModuleList([\n", - " TransformerBlock(config.emb_dim, config.n_heads, attn_dict[attn_variant], ffn_dict[ffn_variant], config.dropout, config.norm_type)\n", - " for _ in range(config.n_layers)\n", - " ])\n", - " self.norm = get_norm(config.norm_type, config.emb_dim)\n", - " self.output_proj = nn.Linear(config.emb_dim, config.vocab_size, bias=False)\n", - "\n", - " def forward(self, x):\n", - " seq_len = x.size(1)\n", - " x = self.token_embed(x) + self.pos_embed(torch.arange(seq_len, device=x.device))\n", - " for layer in self.layers:\n", - " x = layer(x)\n", - " x = self.norm(x)\n", - " return self.output_proj(x)\n", - "\n", - "#############################################\n", - "# --- Ek: Model Özeti ve Parametre Sayısı Fonksiyonu ---\n", - "#############################################\n", - "def model_summary(model):\n", - " total_params = sum(p.numel() for p in model.parameters())\n", - " trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - " 
print(f\"Toplam Parametre: {total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre: {trainable:,}\")\n", - "\n", - "#############################################\n", - "# --- Ek: Greedy Decoding Fonksiyonu ---\n", - "#############################################\n", - "def greedy_decode(model, start_token, max_length, device):\n", - " model.eval()\n", - " generated = [start_token]\n", - " input_seq = torch.tensor([generated], device=device)\n", - " with torch.no_grad():\n", - " for _ in range(max_length - 1):\n", - " logits = model(input_seq) # (batch, seq_len, vocab_size)\n", - " next_token = torch.argmax(logits[0, -1, :]).item()\n", - " generated.append(next_token)\n", - " input_seq = torch.tensor([generated], device=device)\n", - " model.train()\n", - " return generated\n", - "\n", - "#############################################\n", - "# --- Ek: Basit Eğitim Döngüsü (Training Loop) ---\n", - "#############################################\n", - "def train_model(model, config, epochs=3):\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " model.to(device)\n", - " optimizer = optim.AdamW(model.parameters(), lr=1e-4)\n", - " loss_fn = nn.CrossEntropyLoss()\n", - " # Dummy dataset: rastgele token dizileri\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " dummy_input = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", - " dummy_target = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", - " optimizer.zero_grad()\n", - " logits = model(dummy_input) # (batch, seq_len, vocab_size)\n", - " loss = loss_fn(logits.view(-1, config.vocab_size), dummy_target.view(-1))\n", - " loss.backward()\n", - " optimizer.step()\n", - " print(f\"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}\")\n", - "\n", - "#############################################\n", - "# --- Detaylı Test Fonksiyonları (Önceki Versiyonun Geliştirilmiş Hali) ---\n", - "#############################################\n", - "def run_detailed_tests(config, variant_list):\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " for variant in variant_list:\n", - " attn_var = variant['attn']\n", - " ffn_var = variant['ffn']\n", - " print(f\"\\nTest: Attention = {attn_var}, FFN = {ffn_var}\")\n", - " model = TransformerModel(config, attn_variant=attn_var, ffn_variant=ffn_var).to(device)\n", - " model_summary(model)\n", - " model.train()\n", - " dummy_input = torch.randint(0, config.vocab_size, (4, config.max_length), device=device)\n", - " logits = model(dummy_input)\n", - " loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size),\n", - " torch.randint(0, config.vocab_size, (4 * config.max_length,), device=device))\n", - " loss.backward()\n", - " print(f\"Loss: {loss.item():.4f}, Output shape: {logits.shape}\")\n", - "\n", - " torch.cuda.synchronize() if device.type == 'cuda' else None\n", - " start_time = time.time()\n", - " for _ in range(10):\n", - " _ = model(dummy_input)\n", - " torch.cuda.synchronize() if device.type == 'cuda' else None\n", - " avg_time = (time.time() - start_time) / 10.0\n", - " print(f\"Ortalama ileri geçiş süresi: {avg_time:.6f} sn\")\n", - "\n", - " # Greedy decoding test (ilk 10 token üretiliyor)\n", - " start_token = dummy_input[0, 0].item()\n", - " generated = greedy_decode(model, start_token, max_length=10, device=device)\n", - " print(f\"Greedy Decode Çıktısı: {generated}\")\n", - "\n", - "#############################################\n", - "# --- Ana Çalışma 
Bölümü: Farklı varyantları deneyelim ---\n", - "#############################################\n", - "if __name__ == \"__main__\":\n", - " # Konfigürasyona norm tipi ve dropout eklenmiştir.\n", - " config = Config(vocab_size=30522, emb_dim=768, max_length=128, n_layers=4, n_heads=12, dropout=0.1, norm_type='rmsnorm')\n", - "\n", - " # Denenecek varyantlar: farklı attention ve FFN varyantları\n", - " variant_list = [\n", - " {'attn': 'standard', 'ffn': 'standard'},\n", - " {'attn': 'rope', 'ffn': 'standard'},\n", - " {'attn': 'flash', 'ffn': 'standard'},\n", - " {'attn': 'multiquery', 'ffn': 'standard'},\n", - " {'attn': 'alibi', 'ffn': 'standard'},\n", - " {'attn': 'standard', 'ffn': 'moe'},\n", - " {'attn': 'rope', 'ffn': 'moe'},\n", - " {'attn': 'flash', 'ffn': 'moe'},\n", - " {'attn': 'multiquery', 'ffn': 'moe'},\n", - " {'attn': 'alibi', 'ffn': 'moe'},\n", - " ]\n", - "\n", - " print(\"=== Detaylı Varyant Testleri ===\")\n", - " run_detailed_tests(config, variant_list)\n", - "\n", - " print(\"\\n=== Eğitim Döngüsü Testi ===\")\n", - " # Bir varyant seçelim (örneğin, gelişmiş varyant: RoPE + MoE FFN)\n", - " model = TransformerModel(config, attn_variant='rope', ffn_variant='moe').to(torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"))\n", - " train_model(model, config, epochs=3)\n", - "\n", - " print(\"\\n=== Greedy Decoding Testi ===\")\n", - " # Greedy decoding örneği: İlk tokenı dummy inputtan alıp 20 token üretelim\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " prompt_token = torch.randint(0, config.vocab_size, (1,)).item()\n", - " generated_tokens = greedy_decode(model, prompt_token, max_length=20, device=device)\n", - " print(\"Üretilen Tokenlar:\", generated_tokens)" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:04:05.352069Z", - "iopub.execute_input": "2025-02-12T22:04:05.352443Z", - "iopub.status.idle": "2025-02-12T22:06:42.132452Z", - "shell.execute_reply.started": "2025-02-12T22:04:05.352413Z", - "shell.execute_reply": "2025-02-12T22:06:42.131334Z" - }, - "id": "4Q_jYSvEw_I3" - }, - "outputs": [], - "execution_count": null + "id": "DEJyddCuw_I3" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import time\n", + "import math\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import matplotlib.pyplot as plt\n", + "import tiktoken\n", + "from datasets import load_dataset # Hugging Face datasets library\n", + "import re\n", + "\n", + "#####################################\n", + "# Rotary Positional Embeddings (ROPE) Implementation\n", + "#####################################\n", + "def apply_rotary_pos_emb(x):\n", + " \"\"\"\n", + " Apply Rotary Positional Embeddings (ROPE) to the input tensor.\n", + "\n", + " Args:\n", + " x (torch.Tensor): Input tensor of shape (batch, num_heads, seq_len, head_dim).\n", + "\n", + " Returns:\n", + " torch.Tensor: Tensor with ROPE applied.\n", + " \"\"\"\n", + " batch, n_heads, seq_len, head_dim = x.shape\n", + " assert head_dim % 2 == 0, \"head_dim must be even for ROPE\"\n", + "\n", + " # Calculate inverse frequencies and positions\n", + " inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", + " positions = torch.arange(seq_len, device=x.device).float()\n", + " sinusoid_inp = torch.einsum(\"i,j->ij\", positions, inv_freq) # (seq_len, head_dim/2)\n", + " 
sin = torch.sin(sinusoid_inp)[None, None, :, :]  # (1, 1, seq_len, head_dim/2)\n",
+ "    cos = torch.cos(sinusoid_inp)[None, None, :, :]  # (1, 1, seq_len, head_dim/2)\n",
+ "\n",
+ "    # Split the input tensor into two halves and apply ROPE\n",
+ "    x1, x2 = x[..., :head_dim//2], x[..., head_dim//2:]\n",
+ "    x_rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n",
+ "    return x_rotated\n",
+ "\n",
+ "#####################################\n",
+ "# Dataset and DataLoader: Wikitext (Hugging Face)\n",
+ "#####################################\n",
+ "class GPTDatasetV1(Dataset):\n",
+ "    def __init__(self, text, tokenizer, max_length, stride):\n",
+ "        self.input_ids = []\n",
+ "        self.target_ids = []\n",
+ "\n",
+ "        # Tokenize the text\n",
+ "        token_ids = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
+ "\n",
+ "        # Create input-target pairs\n",
+ "        for i in range(0, len(token_ids) - max_length, stride):\n",
+ "            input_chunk = token_ids[i:i + max_length]\n",
+ "            target_chunk = token_ids[i + 1: i + max_length + 1]\n",
+ "            self.input_ids.append(torch.tensor(input_chunk))\n",
+ "            self.target_ids.append(torch.tensor(target_chunk))\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        return len(self.input_ids)\n",
+ "\n",
+ "    def __getitem__(self, idx):\n",
+ "        return self.input_ids[idx], self.target_ids[idx]\n",
+ "\n",
+ "# Build a DataLoader over sliding windows of the tokenized text.\n",
+ "def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
+ "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+ "    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)\n",
+ "    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n",
+ "    return dataloader\n",
+ "\n",
+ "def load_wikitext_data(num_lines=10000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\"):\n",
+ "    \"\"\"\n",
+ "    Load Wikitext data from Hugging Face and concatenate the first `num_lines` lines into a single text.\n",
+ "\n",
+ "    Args:\n",
+ "        num_lines (int): Number of lines to load.\n",
+ "        dataset_name (str): Name of the dataset.\n",
+ "        subset (str): Subset of the dataset.\n",
+ "\n",
+ "    Returns:\n",
+ "        str: Concatenated text.\n",
+ "    \"\"\"\n",
+ "    ds = load_dataset(dataset_name, subset)\n",
+ "    text_lines = ds[\"train\"][\"text\"][:num_lines]\n",
+ "    text = \"\\n\".join(text_lines)\n",
+ "    return text\n",
+ "\n",
+ "def preprocess_text(text):\n",
+ "    \"\"\"\n",
+ "    Preprocess the text data by removing unwanted characters and normalizing whitespace.\n",
+ "\n",
+ "    Args:\n",
+ "        text (str): Input text.\n",
+ "\n",
+ "    Returns:\n",
+ "        str: Preprocessed text.\n",
+ "    \"\"\"\n",
+ "    # Remove special characters and digits, and normalize whitespace\n",
+ "    text = re.sub(r'[^A-Za-z\\s]', '', text)\n",
+ "    text = re.sub(r'\\s+', ' ', text).strip()\n",
+ "    return text\n",
+ "\n",
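+ "# Note (added): in GPTDatasetV1 above, a stride smaller than max_length produces\n",
+ "# overlapping training windows; with the defaults max_length=256 and stride=128,\n",
+ "# each token lands in roughly two chunks (more samples, some redundancy).\n",
+ "\n",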
+ "#####################################\n",
+ "# Advanced Model Components (GPTModel)\n",
+ "#####################################\n",
+ "\n",
+ "class MultiHeadAttention(nn.Module):\n",
+ "    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, use_rope=False):\n",
+ "        super().__init__()\n",
+ "        assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n",
+ "        self.d_out = d_out\n",
+ "        self.num_heads = num_heads\n",
+ "        self.head_dim = d_out // num_heads\n",
+ "\n",
+ "        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
+ "        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
+ "        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n",
+ "        self.out_proj = nn.Linear(d_out, d_out)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "        self.use_rope = use_rope\n",
+ "        # Upper-triangular matrix marks future positions; used below for causal masking.\n",
+ "        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        b, num_tokens, _ = x.shape\n",
+ "\n",
+ "        keys = self.W_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n",
+ "        queries = self.W_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n",
+ "        values = self.W_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n",
+ "\n",
+ "        if self.use_rope:\n",
+ "            queries = apply_rotary_pos_emb(queries)\n",
+ "            keys = apply_rotary_pos_emb(keys)\n",
+ "\n",
+ "        attn_scores = queries @ keys.transpose(2, 3)\n",
+ "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
+ "        attn_scores.masked_fill_(mask_bool, float(\"-inf\"))\n",
+ "\n",
+ "        attn_weights = torch.softmax(attn_scores / math.sqrt(self.head_dim), dim=-1)\n",
+ "        attn_weights = self.dropout(attn_weights)\n",
+ "\n",
+ "        context_vec = (attn_weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)\n",
+ "        context_vec = self.out_proj(context_vec)\n",
+ "        return context_vec\n",
+ "\n",
+ "class LayerNorm(nn.Module):\n",
+ "    def __init__(self, emb_dim):\n",
+ "        super().__init__()\n",
+ "        self.eps = 1e-5\n",
+ "        self.scale = nn.Parameter(torch.ones(emb_dim))\n",
+ "        self.shift = nn.Parameter(torch.zeros(emb_dim))\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        mean = x.mean(dim=-1, keepdim=True)\n",
+ "        var = x.var(dim=-1, keepdim=True, unbiased=False)\n",
+ "        norm_x = (x - mean) / torch.sqrt(var + self.eps)\n",
+ "        return self.scale * norm_x + self.shift\n",
+ "\n",
+ "class GELU(nn.Module):\n",
+ "    def forward(self, x):\n",
+ "        return 0.5 * x * (1 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n",
+ "\n",
+ "class FeedForward(nn.Module):\n",
+ "    def __init__(self, cfg):\n",
+ "        super().__init__()\n",
+ "        self.layers = nn.Sequential(\n",
+ "            nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n",
+ "            GELU(),\n",
+ "            nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n",
+ "        )\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        return self.layers(x)\n",
+ "\n",
+ "class TransformerBlock(nn.Module):\n",
+ "    def __init__(self, cfg):\n",
+ "        super().__init__()\n",
+ "        self.att = MultiHeadAttention(\n",
+ "            d_in=cfg[\"emb_dim\"],\n",
+ "            d_out=cfg[\"emb_dim\"],\n",
+ "            context_length=cfg[\"context_length\"],\n",
+ "            num_heads=cfg[\"n_heads\"],\n",
+ "            dropout=cfg[\"drop_rate\"],\n",
+ "            qkv_bias=cfg[\"qkv_bias\"],\n",
+ "            use_rope=cfg.get(\"use_rope\", False)\n",
+ "        )\n",
+ "        self.ff = FeedForward(cfg)\n",
+ "        self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
+ "        self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
+ "        self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        shortcut = x\n",
+ "        x = self.norm1(x)\n",
+ "        x = self.att(x)\n",
+ "        x = self.drop_shortcut(x)\n",
+ "        x = x + shortcut\n",
+ "\n",
+ "        shortcut = x\n",
+ "        x = self.norm2(x)\n",
+ "        x = self.ff(x)\n",
+ "        x = self.drop_shortcut(x)\n",
+ "        x = x + shortcut\n",
+ "        return x\n",
+ "\n",
+ "class GPTModel(nn.Module):\n",
+ "    def __init__(self, cfg):\n",
+ "        super().__init__()\n",
+ "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
+ "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
+ "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
+ "\n",
+ "        self.trf_blocks = 
nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", + "\n", + " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", + " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", + "\n", + " def forward(self, in_idx):\n", + " batch_size, seq_len = in_idx.shape\n", + " tok_embeds = self.tok_emb(in_idx)\n", + " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", + " x = tok_embeds + pos_embeds\n", + " x = self.drop_emb(x)\n", + " x = self.trf_blocks(x)\n", + " x = self.final_norm(x)\n", + " logits = self.out_head(x)\n", + " return logits\n", + "\n", + "#####################################\n", + "# Training and Evaluation Functions\n", + "#####################################\n", + "def calc_loss_batch(input_batch, target_batch, model, device):\n", + " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", + " logits = model(input_batch)\n", + " loss = nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", + " return loss\n", + "\n", + "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", + " total_loss = 0.0\n", + " if len(data_loader) == 0:\n", + " return float(\"nan\")\n", + " num_batches = num_batches if num_batches is not None else len(data_loader)\n", + " num_batches = min(num_batches, len(data_loader))\n", + " for i, (input_batch, target_batch) in enumerate(data_loader):\n", + " if i < num_batches:\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " total_loss += loss.item()\n", + " else:\n", + " break\n", + " return total_loss / num_batches\n", + "\n", + "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", + " model.eval()\n", + " with torch.no_grad():\n", + " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", + " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", + " model.train()\n", + " return train_loss, val_loss\n", + "\n", + "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", + " for _ in range(max_new_tokens):\n", + " idx_cond = idx[:, -context_size:]\n", + " with torch.no_grad():\n", + " logits = model(idx_cond)\n", + " logits = logits[:, -1, :]\n", + " idx_next = torch.argmax(logits, dim=-1, keepdim=True)\n", + " idx = torch.cat((idx, idx_next), dim=1)\n", + " return idx\n", + "\n", + "def generate_and_print_sample(model, tokenizer, device, start_context):\n", + " model.eval()\n", + " context_size = model.pos_emb.weight.shape[0]\n", + " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", + " with torch.no_grad():\n", + " token_ids = generate_text_simple(model=model, idx=encoded, max_new_tokens=50, context_size=context_size)\n", + " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", + " print(decoded_text.replace(\"\\n\", \" \"))\n", + " model.train()\n", + "\n", + "def text_to_token_ids(text, tokenizer):\n", + " encoded = tokenizer.encode(text)\n", + " return torch.tensor(encoded).unsqueeze(0)\n", + "\n", + "def token_ids_to_text(token_ids, tokenizer):\n", + " flat = token_ids.squeeze(0)\n", + " return tokenizer.decode(flat.tolist())\n", + "\n", + "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", + " fig, ax1 = plt.subplots(figsize=(5, 3))\n", + " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", + " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", + " ax1.set_xlabel(\"Epochs\")\n", + " 
ax1.set_ylabel(\"Loss\")\n", + " ax1.legend(loc=\"upper right\")\n", + " ax2 = ax1.twiny()\n", + " ax2.plot(tokens_seen, train_losses, alpha=0)\n", + " ax2.set_xlabel(\"Tokens seen\")\n", + " fig.tight_layout()\n", + " plt.show()\n", + "\n", + "#####################################\n", + "# Model Training\n", + "#####################################\n", + "def train_model(model, train_loader, val_loader, device, epochs=30, eval_iter=20, lr=1e-4):\n", + " optimizer = optim.Adam(model.parameters(), lr=lr)\n", + " model.to(device)\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " epoch_loss = 0.0\n", + " start_time = time.time()\n", + "\n", + " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " loss.backward()\n", + " optimizer.step()\n", + " epoch_loss += loss.item()\n", + " if (batch_idx + 1) % 10 == 0:\n", + " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", + "\n", + " avg_loss = epoch_loss / len(train_loader)\n", + " elapsed = time.time() - start_time\n", + " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. loss: {avg_loss:.4f}\")\n", + " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", + " print(f\"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", + "\n", + "#####################################\n", + "# Text Generation\n", + "#####################################\n", + "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", + " print(\"Generated text sample:\\n\")\n", + " generate_and_print_sample(model, tokenizer, device, prompt)\n", + "\n", + "#####################################\n", + "# Main Function\n", + "#####################################\n", + "def main():\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " print(f\"Using device: {device}\")\n", + "\n", + " # Load Wikitext data from Hugging Face and use the first 50k lines\n", + " text = load_wikitext_data(num_lines=50000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\")\n", + "\n", + " # Preprocess the text data\n", + " text = preprocess_text(text)\n", + "\n", + " # Split data into training and validation sets (e.g., 90% train, 10% validation)\n", + " split_idx = int(0.9 * len(text))\n", + " train_text = text[:split_idx]\n", + " val_text = text[split_idx:]\n", + " train_loader = create_dataloader_v1(train_text, batch_size=8, max_length=256, stride=128)\n", + " val_loader = create_dataloader_v1(val_text, batch_size=8, max_length=256, stride=128)\n", + "\n", + " # Tokenizer and advanced model configuration\n", + " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + " cfg = {\n", + " \"vocab_size\": tokenizer.n_vocab,\n", + " \"emb_dim\": 256,\n", + " \"context_length\": 256,\n", + " \"drop_rate\": 0.1,\n", + " \"n_layers\": 6,\n", + " \"n_heads\": 8,\n", + " \"qkv_bias\": True,\n", + " \"use_rope\": True,\n", + " }\n", + "\n", + " model = GPTModel(cfg)\n", + " train_model(model, train_loader, val_loader, device, epochs=1, eval_iter=25, lr=1e-5)\n", + "\n", + " # Generate text after training with a given prompt\n", + " prompt = \"Valkyria Chronicles III \"\n", + " generate_sample(model, tokenizer, device, prompt, max_new_tokens=100)\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n" + ], + "metadata": { + "trusted": true, + "execution": { + "iopub.status.busy": 
"2025-02-12T20:33:49.198352Z", + "iopub.execute_input": "2025-02-12T20:33:49.198767Z", + "iopub.status.idle": "2025-02-12T20:38:21.836553Z", + "shell.execute_reply.started": "2025-02-12T20:33:49.198738Z", + "shell.execute_reply": "2025-02-12T20:38:21.83572Z" }, - { - "cell_type": "code", - "source": [ - "#!pip install evaluate reportlab" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:06:55.854229Z", - "iopub.execute_input": "2025-02-12T22:06:55.854996Z", - "iopub.status.idle": "2025-02-12T22:06:55.861061Z", - "shell.execute_reply.started": "2025-02-12T22:06:55.854884Z", - "shell.execute_reply": "2025-02-12T22:06:55.859524Z" - }, - "id": "6ZXigQy4w_I4" - }, - "outputs": [], - "execution_count": null + "id": "2E8o3nAXw_I3" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import math\n", + "import time\n", + "\n", + "#############################################\n", + "# 1. Alternatif Normalizasyon: RMSNorm\n", + "#############################################\n", + "class RMSNorm(nn.Module):\n", + " def __init__(self, emb_dim, eps=1e-8):\n", + " super().__init__()\n", + " self.eps = eps\n", + " self.scale = nn.Parameter(torch.ones(emb_dim))\n", + " def forward(self, x):\n", + " # x shape: (..., emb_dim)\n", + " norm_x = x / torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)\n", + " return self.scale * norm_x\n", + "\n", + "def get_norm(norm_type, emb_dim):\n", + " if norm_type == 'layernorm':\n", + " return nn.LayerNorm(emb_dim)\n", + " elif norm_type == 'rmsnorm':\n", + " return RMSNorm(emb_dim)\n", + " else:\n", + " raise ValueError(\"Unknown normalization type\")\n", + "\n", + "#############################################\n", + "# 2. Shared configuration\n", + "#############################################\n", + "class Config:\n", + " def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,\n", + " dropout=0.1, norm_type='layernorm'):\n", + " self.vocab_size = vocab_size\n", + " self.emb_dim = emb_dim\n", + " self.max_length = max_length\n", + " self.n_layers = n_layers\n", + " self.n_heads = n_heads\n", + " self.dropout = dropout\n", + " self.norm_type = norm_type # 'layernorm' veya 'rmsnorm'\n", + " # Additional parameters for advanced variants:\n", + " self.latent_dim = emb_dim // 2 # For RoPE and latent projection\n", + " self.num_experts = 4 # Number of experts used in the MoE FFN\n", + "\n", + "#############################################\n", + "# --- Attention modules ---\n", + "#############################################\n", + "# 1. 
+ "#############################################\n",
+ "# 2. Shared configuration\n",
+ "#############################################\n",
+ "class Config:\n",
+ "    def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,\n",
+ "                 dropout=0.1, norm_type='layernorm'):\n",
+ "        self.vocab_size = vocab_size\n",
+ "        self.emb_dim = emb_dim\n",
+ "        self.max_length = max_length\n",
+ "        self.n_layers = n_layers\n",
+ "        self.n_heads = n_heads\n",
+ "        self.dropout = dropout\n",
+ "        self.norm_type = norm_type  # 'layernorm' or 'rmsnorm'\n",
+ "        # Additional parameters for advanced variants:\n",
+ "        self.latent_dim = emb_dim // 2  # For RoPE and latent projection\n",
+ "        self.num_experts = 4  # Number of experts used in the MoE FFN\n",
+ "\n",
+ "#############################################\n",
+ "# --- Attention modules ---\n",
+ "#############################################\n",
+ "# (Note, added: these variant modules apply no causal mask; attention is bidirectional.)\n",
+ "# 1. Standard Dot-Product Attention\n",
+ "class StandardAttention(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, dropout):\n",
+ "        super().__init__()\n",
+ "        assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n",
+ "        self.n_heads = n_heads\n",
+ "        self.head_dim = emb_dim // n_heads\n",
+ "        self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.k_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.v_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        batch, seq_len, emb_dim = x.size()\n",
+ "        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n",
+ "        attn = torch.softmax(scores, dim=-1)\n",
+ "        attn = self.dropout(attn)\n",
+ "        context = torch.matmul(attn, V)\n",
+ "        context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ "        return self.out_proj(context)\n",
+ "\n",
+ "# 2. RoPE Attention\n",
+ "def apply_rope(x, base=10000):\n",
+ "    # x: (batch, n_heads, seq_len, head_dim)\n",
+ "    batch, n_heads, seq_len, head_dim = x.shape\n",
+ "    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n",
+ "    pos = torch.arange(seq_len, device=x.device).float()\n",
+ "    sinusoid_inp = torch.einsum(\"i,j->ij\", pos, inv_freq)  # (seq_len, head_dim/2)\n",
+ "    sin = torch.sin(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n",
+ "    cos = torch.cos(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n",
+ "    x1 = x[..., :head_dim//2]\n",
+ "    x2 = x[..., head_dim//2:]\n",
+ "    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n",
+ "\n",
+ "class RoPEAttention(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, dropout):\n",
+ "        super().__init__()\n",
+ "        assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n",
+ "        self.n_heads = n_heads\n",
+ "        self.head_dim = emb_dim // n_heads\n",
+ "        self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.k_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.v_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        batch, seq_len, emb_dim = x.size()\n",
+ "        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        Q = apply_rope(Q)\n",
+ "        K = apply_rope(K)\n",
+ "        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n",
+ "        attn = torch.softmax(scores, dim=-1)\n",
+ "        attn = self.dropout(attn)\n",
+ "        context = torch.matmul(attn, V)\n",
+ "        context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ "        return self.out_proj(context)\n",
+ "\n",
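+ "# Note (added): apply_rope rotates each (x1, x2) feature pair by an angle that\n",
+ "# grows linearly with token position, so the Q·K dot products used in attention\n",
+ "# depend only on relative offsets between positions, not on absolute positions.\n",
+ "\n",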
+ "# 3. FlashAttention-like attention (placeholder)\n",
+ "# Placeholder: this computes standard attention; a real FlashAttention kernel fuses\n",
+ "# these steps to avoid materializing the full score matrix.\n",
+ "def flash_attention(Q, K, V):\n",
+ "    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))\n",
+ "    attn = torch.softmax(scores, dim=-1)\n",
+ "    return torch.matmul(attn, V)\n",
+ "\n",
+ "class FlashAttentionModule(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, dropout):\n",
+ "        super().__init__()\n",
+ "        assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n",
+ "        self.n_heads = n_heads\n",
+ "        self.head_dim = emb_dim // n_heads\n",
+ "        self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.k_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.v_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        batch, seq_len, emb_dim = x.size()\n",
+ "        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        context = flash_attention(Q, K, V)\n",
+ "        context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ "        return self.out_proj(context)\n",
+ "\n",
+ "# 4. Multi-Query Attention: a single projection for keys & values\n",
+ "class MultiQueryAttention(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, dropout):\n",
+ "        super().__init__()\n",
+ "        self.n_heads = n_heads\n",
+ "        self.head_dim = emb_dim // n_heads\n",
+ "        self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.k_proj = nn.Linear(emb_dim, self.head_dim)\n",
+ "        self.v_proj = nn.Linear(emb_dim, self.head_dim)\n",
+ "        self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        batch, seq_len, emb_dim = x.size()\n",
+ "        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        K = self.k_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n",
+ "        V = self.v_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n",
+ "        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n",
+ "        attn = torch.softmax(scores, dim=-1)\n",
+ "        attn = self.dropout(attn)\n",
+ "        context = torch.matmul(attn, V)\n",
+ "        context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ "        return self.out_proj(context)\n",
+ "\n",
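+ "# Note (added): MultiQueryAttention above keeps n_heads query projections but\n",
+ "# shares a single key/value head across them, shrinking the K/V projections (and\n",
+ "# any KV cache at inference time) by a factor of n_heads (Shazeer, 2019).\n",
+ "\n",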
+ "# 5. ALiBi Attention: integrates relative position information by adding a linear bias (Press et al., 2021)\n",
+ "class ALiBiAttention(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, dropout, alibi_scaling=-1.0):\n",
+ "        super().__init__()\n",
+ "        assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n",
+ "        self.n_heads = n_heads\n",
+ "        self.head_dim = emb_dim // n_heads\n",
+ "        self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.k_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.v_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "        self.alibi_scaling = alibi_scaling\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        batch, seq_len, emb_dim = x.size()\n",
+ "        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ "        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n",
+ "        # ALiBi bias: B[i,j] = (j - i) * scale\n",
+ "        # (Simplification: one shared slope; the paper uses a distinct slope per head.)\n",
+ "        bias = torch.arange(seq_len, device=x.device).unsqueeze(0) - torch.arange(seq_len, device=x.device).unsqueeze(1)\n",
+ "        bias = self.alibi_scaling * bias.float()\n",
+ "        scores = scores + bias.unsqueeze(0).unsqueeze(0)\n",
+ "        attn = torch.softmax(scores, dim=-1)\n",
+ "        attn = self.dropout(attn)\n",
+ "        context = torch.matmul(attn, V)\n",
+ "        context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ "        return self.out_proj(context)\n",
+ "\n",
+ "#############################################\n",
+ "# --- FFN variants ---\n",
+ "#############################################\n",
+ "# 1. Standard FFN\n",
+ "class StandardFFN(nn.Module):\n",
+ "    def __init__(self, emb_dim, expansion=4, dropout=0.1):\n",
+ "        super().__init__()\n",
+ "        self.net = nn.Sequential(\n",
+ "            nn.Linear(emb_dim, expansion * emb_dim),\n",
+ "            nn.GELU(),\n",
+ "            nn.Dropout(dropout),\n",
+ "            nn.Linear(expansion * emb_dim, emb_dim)\n",
+ "        )\n",
+ "    def forward(self, x):\n",
+ "        return self.net(x)\n",
+ "\n",
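+ "# Note (added): the MoEFFN below is a dense mixture: every expert processes every\n",
+ "# token and the gate's softmax weights average the results. Large-scale MoE layers\n",
+ "# typically route each token to only the top-k experts so compute stays roughly\n",
+ "# constant as experts are added; this version favors simplicity over that saving.\n",
+ "\n",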
+ "# 2. MoE FFN\n",
+ "class MoEFFN(nn.Module):\n",
+ "    def __init__(self, emb_dim, num_experts, expansion=4, dropout=0.1):\n",
+ "        super().__init__()\n",
+ "        self.num_experts = num_experts\n",
+ "        self.experts = nn.ModuleList([\n",
+ "            nn.Sequential(\n",
+ "                nn.Linear(emb_dim, expansion * emb_dim),\n",
+ "                nn.GELU(),\n",
+ "                nn.Dropout(dropout),\n",
+ "                nn.Linear(expansion * emb_dim, emb_dim)\n",
+ "            ) for _ in range(num_experts)\n",
+ "        ])\n",
+ "        self.gate = nn.Linear(emb_dim, num_experts)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        gate_scores = torch.softmax(self.gate(x), dim=-1)  # (batch, seq_len, num_experts)\n",
+ "        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1)  # (batch, seq_len, emb_dim, num_experts)\n",
+ "        gate_scores = gate_scores.unsqueeze(2)  # (batch, seq_len, 1, num_experts)\n",
+ "        return (expert_outputs * gate_scores).sum(dim=-1)\n",
+ "\n",
+ "#############################################\n",
+ "# --- Transformer block: selectable attention and FFN variants, dropout, pre-norm ---\n",
+ "#############################################\n",
+ "class TransformerBlock(nn.Module):\n",
+ "    def __init__(self, emb_dim, n_heads, attn_module, ffn_module, dropout, norm_type):\n",
+ "        super().__init__()\n",
+ "        self.norm1 = get_norm(norm_type, emb_dim)\n",
+ "        self.attn = attn_module(emb_dim, n_heads, dropout)\n",
+ "        self.norm2 = get_norm(norm_type, emb_dim)\n",
+ "        self.ffn = ffn_module(emb_dim, dropout=dropout)  # ffn_module: StandardFFN or MoEFFN (for MoE, a lambda is used)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        # Pre-norm residual connections\n",
+ "        x = x + self.attn(self.norm1(x))\n",
+ "        x = x + self.ffn(self.norm2(x))\n",
+ "        return x\n",
+ "\n",
+ "#############################################\n",
+ "# --- Transformer model: structure supporting different variants ---\n",
+ "#############################################\n",
+ "class TransformerModel(nn.Module):\n",
+ "    def __init__(self, config, attn_variant='standard', ffn_variant='standard'):\n",
+ "        super().__init__()\n",
+ "        self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)\n",
+ "        self.pos_embed = nn.Embedding(config.max_length, config.emb_dim)\n",
+ "\n",
+ "        attn_dict = {\n",
+ "            'standard': StandardAttention,\n",
+ "            'rope': RoPEAttention,\n",
+ "            'flash': FlashAttentionModule,\n",
+ "            'multiquery': MultiQueryAttention,\n",
+ "            'alibi': ALiBiAttention\n",
+ "        }\n",
+ "        ffn_dict = {\n",
+ "            'standard': StandardFFN,\n",
+ "            'moe': lambda emb_dim, dropout: MoEFFN(emb_dim, config.num_experts, dropout=dropout)\n",
+ "        }\n",
+ "        self.layers = nn.ModuleList([\n",
+ "            TransformerBlock(config.emb_dim, config.n_heads, attn_dict[attn_variant], ffn_dict[ffn_variant], config.dropout, config.norm_type)\n",
+ "            for _ in range(config.n_layers)\n",
+ "        ])\n",
+ "        self.norm = get_norm(config.norm_type, config.emb_dim)\n",
+ "        self.output_proj = nn.Linear(config.emb_dim, config.vocab_size, bias=False)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        seq_len = x.size(1)\n",
+ "        x = self.token_embed(x) + self.pos_embed(torch.arange(seq_len, device=x.device))\n",
+ "        for layer in self.layers:\n",
+ "            x = layer(x)\n",
+ "        x = self.norm(x)\n",
+ "        return self.output_proj(x)\n",
+ "\n",
+ "#############################################\n",
+ "# --- Extra: model summary and parameter count function ---\n",
+ "#############################################\n",
+ "def model_summary(model):\n",
+ "    total_params = sum(p.numel() for p in model.parameters())\n",
+ "    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
print(f\"Toplam Parametre: {total_params:,}\")\n", + " print(f\"Trainable parameters: {trainable:,}\")\n", + "\n", + "#############################################\n", + "# --- Ek: Greedy Decoding Fonksiyonu ---\n", + "#############################################\n", + "def greedy_decode(model, start_token, max_length, device):\n", + " model.eval()\n", + " generated = [start_token]\n", + " input_seq = torch.tensor([generated], device=device)\n", + " with torch.no_grad():\n", + " for _ in range(max_length - 1):\n", + " logits = model(input_seq) # (batch, seq_len, vocab_size)\n", + " next_token = torch.argmax(logits[0, -1, :]).item()\n", + " generated.append(next_token)\n", + " input_seq = torch.tensor([generated], device=device)\n", + " model.train()\n", + " return generated\n", + "\n", + "#############################################\n", + "# --- Extra: simple training loop ---\n", + "#############################################\n", + "def train_model(model, config, epochs=3):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " model.to(device)\n", + " optimizer = optim.AdamW(model.parameters(), lr=1e-4)\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " # Dummy dataset: rastgele token dizileri\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " dummy_input = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", + " dummy_target = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", + " optimizer.zero_grad()\n", + " logits = model(dummy_input) # (batch, seq_len, vocab_size)\n", + " loss = loss_fn(logits.view(-1, config.vocab_size), dummy_target.view(-1))\n", + " loss.backward()\n", + " optimizer.step()\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}\")\n", + "\n", + "#############################################\n", + "# --- Detailed test functions (improved version of the previous one) ---\n", + "#############################################\n", + "def run_detailed_tests(config, variant_list):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " for variant in variant_list:\n", + " attn_var = variant['attn']\n", + " ffn_var = variant['ffn']\n", + " print(f\"\\nTest: Attention = {attn_var}, FFN = {ffn_var}\")\n", + " model = TransformerModel(config, attn_variant=attn_var, ffn_variant=ffn_var).to(device)\n", + " model_summary(model)\n", + " model.train()\n", + " dummy_input = torch.randint(0, config.vocab_size, (4, config.max_length), device=device)\n", + " logits = model(dummy_input)\n", + " loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size),\n", + " torch.randint(0, config.vocab_size, (4 * config.max_length,), device=device))\n", + " loss.backward()\n", + " print(f\"Loss: {loss.item():.4f}, Output shape: {logits.shape}\")\n", + "\n", + " torch.cuda.synchronize() if device.type == 'cuda' else None\n", + " start_time = time.time()\n", + " for _ in range(10):\n", + " _ = model(dummy_input)\n", + " torch.cuda.synchronize() if device.type == 'cuda' else None\n", + " avg_time = (time.time() - start_time) / 10.0\n", + " print(f\"Average forward pass time: {avg_time:.6f} s\")\n", + "\n", + " # Greedy decoding test (generates the first 10 tokens)\n", + " start_token = dummy_input[0, 0].item()\n", + " generated = greedy_decode(model, start_token, max_length=10, device=device)\n", + " print(f\"Greedy Decode Output: {generated}\")\n", + "\n", + "#############################################\n", + "# --- Main Working Section: try 
different variants ---\n",
+ "#############################################\n",
+ "if __name__ == \"__main__\":\n",
+ "    # The configuration includes the norm type and dropout.\n",
+ "    config = Config(vocab_size=30522, emb_dim=768, max_length=128, n_layers=4, n_heads=12, dropout=0.1, norm_type='rmsnorm')\n",
+ "\n",
+ "    # Variants to try: different attention and FFN variants\n",
+ "    variant_list = [\n",
+ "        {'attn': 'standard', 'ffn': 'standard'},\n",
+ "        {'attn': 'rope', 'ffn': 'standard'},\n",
+ "        {'attn': 'flash', 'ffn': 'standard'},\n",
+ "        {'attn': 'multiquery', 'ffn': 'standard'},\n",
+ "        {'attn': 'alibi', 'ffn': 'standard'},\n",
+ "        {'attn': 'standard', 'ffn': 'moe'},\n",
+ "        {'attn': 'rope', 'ffn': 'moe'},\n",
+ "        {'attn': 'flash', 'ffn': 'moe'},\n",
+ "        {'attn': 'multiquery', 'ffn': 'moe'},\n",
+ "        {'attn': 'alibi', 'ffn': 'moe'},\n",
+ "    ]\n",
+ "\n",
+ "    print(\"=== Detailed Variant Tests ===\")\n",
+ "    run_detailed_tests(config, variant_list)\n",
+ "\n",
+ "    print(\"\\n=== Training Loop Test ===\")\n",
+ "    # Choose a variant (e.g., advanced variant: RoPE + MoE FFN)\n",
+ "    model = TransformerModel(config, attn_variant='rope', ffn_variant='moe').to(torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"))\n",
+ "    train_model(model, config, epochs=3)\n",
+ "\n",
+ "    print(\"\\n=== Greedy Decoding Test ===\")\n",
+ "    # Greedy decoding example: take the first token from dummy input and generate 20 tokens\n",
+ "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "    prompt_token = torch.randint(0, config.vocab_size, (1,)).item()\n",
+ "    generated_tokens = greedy_decode(model, prompt_token, max_length=20, device=device)\n",
+ "    print(\"Generated Tokens:\", generated_tokens)"
+ ],
+ "metadata": {
+ "trusted": true,
+ "execution": {
+ "iopub.status.busy": "2025-02-12T22:04:05.352069Z",
+ "iopub.execute_input": "2025-02-12T22:04:05.352443Z",
+ "iopub.status.idle": "2025-02-12T22:06:42.132452Z",
+ "shell.execute_reply.started": "2025-02-12T22:04:05.352413Z",
+ "shell.execute_reply": "2025-02-12T22:06:42.131334Z"
},
-    {
-      "cell_type": "code",
-      "source": [
-        "import torch\n",
-        "import torch.nn as nn\n",
-        "import torch.optim as optim\n",
-        "import math\n",
-        "from datasets import load_dataset\n",
-        "from collections import defaultdict\n",
-        "from reportlab.lib.pagesizes import A4\n",
-        "from reportlab.pdfgen import canvas\n",
-        "\n",
-        "#############################################\n",
-        "# Turkish-Alpaca Veri Seti ve Tokenizer\n",
-        "#############################################\n",
-        "class TurkishAlpacaDataset:\n",
-        "    def __init__(self, config):\n",
-        "        # Hugging Face'den veri setini yükle\n",
-        "        dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n",
-        "        self.instructions = dataset['train']['instruction'][:100]  # Limit to 100 samples\n",
-        "        self.outputs = dataset['train']['output'][:100]  # Limit to 100 samples\n",
-        "\n",
-        "        # Tokenizer oluştur\n",
-        "        self.vocab = defaultdict(lambda: len(self.vocab))\n",
-        "        self.vocab[''] = 0  # Padding token'i ekle\n",
-        "\n",
-        "        # Tüm veriyi tokenize et\n",
-        "        self.tokenize_data()\n",
-        "\n",
-        "        # Inverse vocab oluştur\n",
-        "        self.inverse_vocab = {v: k for k, v in self.vocab.items()}\n",
-        "\n",
-        "        # Dynamically update vocab_size in config\n",
-        "        config.vocab_size = len(self.vocab)\n",
-        "        self.config = config\n",
-        "\n",
-        "    def tokenize_data(self):\n",
-        "        # Instruction ve Output'u tokenize et\n",
-        "        self.tokenized_instructions = []\n",
-        "        self.tokenized_outputs = []\n",
-        "\n",
-        "        for inst, out in 
zip(self.instructions, self.outputs):\n", - " inst_tokens = [self.vocab[word] for word in inst.split()]\n", - " out_tokens = [self.vocab[word] for word in out.split()]\n", - " self.tokenized_instructions.append(inst_tokens)\n", - " self.tokenized_outputs.append(out_tokens)\n", - "\n", - " def get_batch(self, batch_size=4):\n", - " # Rastgele bir batch oluştur\n", - " indices = torch.randint(0, len(self.tokenized_instructions), (batch_size,))\n", - " inputs, targets = [], []\n", - "\n", - " for i in indices:\n", - " input_tokens = self.tokenized_instructions[i][:-1]\n", - " target_tokens = self.tokenized_outputs[i][1:]\n", - " inputs.append(torch.tensor(input_tokens, dtype=torch.long))\n", - " targets.append(torch.tensor(target_tokens, dtype=torch.long))\n", - "\n", - " # Padding işlemi\n", - " inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)\n", - " targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)\n", - " return inputs, targets\n", - "\n", - "\n", - "#############################################\n", - "# Greedy Decode Function\n", - "#############################################\n", - "def greedy_decode(model, start_token, max_length, device, temperature=1.0):\n", - " \"\"\"\n", - " Greedy decoding to generate sequences from a language model.\n", - "\n", - " Args:\n", - " model: The language model to use for generation.\n", - " start_token: The token ID to start decoding from.\n", - " max_length: Maximum length of the generated sequence.\n", - " device: The device (CPU/GPU) where the model resides.\n", - " temperature: Sampling temperature (optional, default=1.0).\n", - "\n", - " Returns:\n", - " List of generated token IDs.\n", - " \"\"\"\n", - " model.eval()\n", - " with torch.no_grad():\n", - " input_token = torch.tensor([[start_token]], dtype=torch.long).to(device)\n", - " generated_tokens = [start_token]\n", - "\n", - " for _ in range(max_length - 1):\n", - " logits = model(input_token)\n", - " next_token_logits = logits[:, -1, :] / temperature\n", - " next_token = torch.argmax(next_token_logits, dim=-1).item()\n", - "\n", - " if next_token == 0: # Stop if token is generated\n", - " break\n", - "\n", - " generated_tokens.append(next_token)\n", - " input_token = torch.cat([input_token, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)\n", - "\n", - " return generated_tokens\n", - "\n", - "\n", - "#############################################\n", - "# Geliştirilmiş Eğitim ve Değerlendirme\n", - "#############################################\n", - "def train_and_evaluate(model, config, epochs=10):\n", - " device = next(model.parameters()).device\n", - " dataset = TurkishAlpacaDataset(config)\n", - " optimizer = optim.AdamW(model.parameters(), lr=1e-4) # Learning rate artırıldı\n", - " loss_fn = nn.CrossEntropyLoss(ignore_index=0)\n", - "\n", - " print(f\"\\n{'='*40}\")\n", - " print(f\"🏁 {model.name} Eğitime Başlıyor...\")\n", - " print(f\"🔢 Toplam Token Sayısı: {len(dataset.vocab)}\")\n", - " print(f\"⚙️ Kullanılan Donanım: {'GPU' if device.type=='cuda' else 'CPU'}\")\n", - " print(f\"{'='*40}\\n\")\n", - "\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " inputs, targets = dataset.get_batch(batch_size=8) # Batch size artırıldı\n", - " inputs, targets = inputs.to(device), targets.to(device)\n", - "\n", - " optimizer.zero_grad()\n", - " logits = model(inputs)\n", - " logits = logits.view(-1, config.vocab_size) # Reshape logits\n", - " targets = targets.view(-1) # Reshape targets\n", - 
" loss = loss_fn(logits, targets)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # Eğitim Metrikleri\n", - " preds = torch.argmax(logits, dim=-1)\n", - " mask = targets != 0\n", - " correct = (preds[mask] == targets[mask]).sum().item()\n", - " total = mask.sum().item()\n", - " acc = correct / total if total > 0 else 0\n", - " ppl = math.exp(loss.item())\n", - "\n", - " print(f\"Epok {epoch+1}/{epochs} | \"\n", - " f\"Kayıp: {loss.item():.3f} | \"\n", - " f\"Doğruluk: {acc:.1%} | \"\n", - " f\"Perplexity: {ppl:.2f}\")\n", - "\n", - " # Son Değerlendirme\n", - " model.eval()\n", - " with torch.no_grad():\n", - " inputs, targets = dataset.get_batch(batch_size=8)\n", - " inputs, targets = inputs.to(device), targets.to(device)\n", - " logits = model(inputs)\n", - " logits = logits.view(-1, config.vocab_size) # Reshape logits\n", - " targets = targets.view(-1) # Reshape targets\n", - " loss = loss_fn(logits, targets)\n", - "\n", - " # Metrik Hesaplama\n", - " preds = torch.argmax(logits, dim=-1)\n", - " mask = targets != 0\n", - " correct = (preds[mask] == targets[mask]).sum().item()\n", - " total = mask.sum().item()\n", - " final_acc = correct / total if total > 0 else 0\n", - " final_ppl = math.exp(loss.item())\n", - "\n", - " # Örnek Üretim\n", - " start_word = dataset.instructions[0].split()[0]\n", - " input_token = dataset.vocab[start_word]\n", - " generated = greedy_decode(model, input_token, max_length=config.max_length, device=device)\n", - " generated_sentence = ' '.join([dataset.inverse_vocab.get(t, \"?\") for t in generated])\n", - "\n", - " print(f\"\\n⭐ Final Performans ⭐\")\n", - " print(f\"|{'Metric':<15}|{'Değer':<15}|\")\n", - " print(f\"|{'-'*15}|{'-'*15}|\")\n", - " print(f\"|{'Kayıp':<15}|{loss.item():.3f}|\")\n", - " print(f\"|{'Doğruluk':<15}|{final_acc:.1%}|\")\n", - " print(f\"|{'Perplexity':<15}|{final_ppl:.2f}|\")\n", - " print(f\"\\n🔮 Örnek Çıktı: {generated_sentence}\")\n", - "\n", - " # Metrikleri döndür\n", - " return {\n", - " 'parameters': sum(p.numel() for p in model.parameters()),\n", - " 'trainable_parameters': sum(p.numel() for p in model.parameters() if p.requires_grad),\n", - " 'loss': loss.item(),\n", - " 'accuracy': final_acc,\n", - " 'perplexity': final_ppl,\n", - " 'sample_outputs': [generated_sentence]\n", - " }\n", - "\n", - "\n", - "#############################################\n", - "# PDF Oluşturma Fonksiyonu (reportlab ile)\n", - "#############################################\n", - "def save_results_to_pdf(metrics, model_name):\n", - " # PDF dosyasını oluştur\n", - " pdf_path = f\"{model_name}_degerlendirme.pdf\"\n", - " c = canvas.Canvas(pdf_path, pagesize=A4)\n", - " width, height = A4\n", - "\n", - " # Başlık\n", - " c.setFont(\"Helvetica-Bold\", 16)\n", - " c.drawString(50, height - 50, f\"Model Değerlendirme Raporu: {model_name}\")\n", - "\n", - " # Metrikler\n", - " c.setFont(\"Helvetica\", 12)\n", - " y = height - 80\n", - " c.drawString(50, y, \"📊 Performans Metrikleri\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Toplam Parametre Sayısı: {metrics['parameters']:,}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Eğitilebilir Parametre Sayısı: {metrics['trainable_parameters']:,}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Kayıp: {metrics['loss']:.3f}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Doğruluk: {metrics['accuracy']:.1%}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Perplexity: {metrics['perplexity']:.2f}\")\n", - "\n", - " # Örnek Çıktılar\n", - " y -= 30\n", - " c.drawString(50, y, \"🔮 Örnek 
Çıktılar\")\n", - " y -= 20\n", - " for i, output in enumerate(metrics['sample_outputs']):\n", - " c.drawString(50, y, f\"Örnek {i+1}: {output}\")\n", - " y -= 20\n", - "\n", - " # PDF'i kaydet\n", - " c.save()\n", - " print(f\"📄 {model_name} için rapor PDF olarak kaydedildi: {pdf_path}\")\n", - "\n", - "\n", - "#############################################\n", - "# Dummy Transformer Model for Testing\n", - "#############################################\n", - "class TransformerModel(nn.Module):\n", - " def __init__(self, config, attn_type, ffn_type):\n", - " super().__init__()\n", - " self.name = f\"{attn_type}-{ffn_type}\"\n", - " self.embedding = nn.Embedding(config.vocab_size, config.emb_dim)\n", - " self.transformer = nn.Transformer(\n", - " d_model=config.emb_dim,\n", - " nhead=config.n_heads,\n", - " num_encoder_layers=config.n_layers,\n", - " num_decoder_layers=config.n_layers,\n", - " dim_feedforward=config.emb_dim * 4,\n", - " dropout=config.dropout\n", - " )\n", - " self.fc_out = nn.Linear(config.emb_dim, config.vocab_size)\n", - "\n", - " def forward(self, x):\n", - " x = self.embedding(x)\n", - " x = self.transformer(x, x)\n", - " x = self.fc_out(x)\n", - " return x\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Konfigürasyon\n", - " class Config:\n", - " def __init__(self):\n", - " self.vocab_size = 100 # Bu değer dinamik olarak güncellenecek\n", - " self.emb_dim = 256 # Embedding boyutu artırıldı\n", - " self.max_length = 32 # Maksimum uzunluk artırıldı\n", - "\n", - " # Veri setinden maksimum uzunluğu hesapla\n", - " dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n", - " instructions = dataset['train']['instruction'][:100] # Limit to 100 samples\n", - " outputs = dataset['train']['output'][:100] # Limit to 100 samples\n", - "\n", - " instruction_lengths = [len(inst.split()) for inst in instructions]\n", - " output_lengths = [len(out.split()) for out in outputs]\n", - "\n", - " max_instruction_length = max(instruction_lengths)\n", - " max_output_length = max(output_lengths)\n", - "\n", - " # max_length'ı instruction ve output'un maksimum uzunluğuna göre ayarla\n", - " self.max_length = max(max_instruction_length, max_output_length) + 10 # Ekstra pay bırak\n", - "\n", - " self.n_layers = 4 # Katman sayısı artırıldı\n", - " self.n_heads = 8 # Head sayısı artırıldı\n", - " self.dropout = 0.1\n", - " self.norm_type = 'rmsnorm'\n", - " self.num_experts = 2\n", - "\n", - " config = Config()\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "\n", - " # Test Edilecek Modeller\n", - " experiments = [\n", - " {'attn': 'standard', 'ffn': 'standard', 'name': 'Standart Model'},\n", - " {'attn': 'rope', 'ffn': 'standard', 'name': 'RoPE Dikkat'},\n", - " {'attn': 'alibi', 'ffn': 'moe', 'name': 'ALiBi + MoE'},\n", - " {'attn': 'multiquery', 'ffn': 'moe', 'name': 'Multi-Query MoE'}\n", - " ]\n", - "\n", - " # Deneyleri Çalıştır\n", - " results = []\n", - " for exp in experiments:\n", - " print(f\"\\n{'='*40}\")\n", - " print(f\"🧪 {exp['name']} Değerlendiriliyor...\")\n", - " print(f\"{'='*40}\")\n", - "\n", - " model = TransformerModel(config, exp['attn'], exp['ffn']).to(device)\n", - " model.name = exp['name']\n", - "\n", - " # Eğitim ve Değerlendirme\n", - " metrics = train_and_evaluate(model, config, epochs=20) # Epoch sayısı artırıldı\n", - " results.append((exp['name'], metrics))\n", - "\n", - " # PDF Raporu Oluştur\n", - " save_results_to_pdf(metrics, exp['name'])\n", - "\n", - " # Tüm Sonuçları Karşılaştır\n", - " print(\"\\n📊 Tüm 
Modellerin Karşılaştırması:\n",
-        "    print(f\"|{'Model':<20}|{'Parametre':<10}|{'Doğruluk':<10}|{'Perplexity':<12}|\")\n",
-        "    print(f\"|{'-'*20}|{'-'*10}|{'-'*10}|{'-'*12}|\")\n",
-        "    for name, metrics in results:\n",
-        "        print(f\"|{name:<20}|{metrics['parameters']:<10,}|{metrics['accuracy']:<10.1%}|{metrics['perplexity']:<12.2f}|\")"
-      ],
-      "metadata": {
-        "trusted": true,
-        "execution": {
-          "iopub.status.busy": "2025-02-12T22:11:53.299779Z",
-          "iopub.execute_input": "2025-02-12T22:11:53.300209Z",
-          "iopub.status.idle": "2025-02-12T22:11:55.659114Z",
-          "shell.execute_reply.started": "2025-02-12T22:11:53.300168Z",
-          "shell.execute_reply": "2025-02-12T22:11:55.657535Z"
-        },
-        "id": "Gdew4V4fw_I4"
-      },
-      "outputs": [],
-      "execution_count": null
+ "id": "4Q_jYSvEw_I3"
+ },
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#!pip install evaluate reportlab"
+ ],
+ "metadata": {
+ "trusted": true,
+ "execution": {
+ "iopub.status.busy": "2025-02-12T22:06:55.854229Z",
+ "iopub.execute_input": "2025-02-12T22:06:55.854996Z",
+ "iopub.status.idle": "2025-02-12T22:06:55.861061Z",
+ "shell.execute_reply.started": "2025-02-12T22:06:55.854884Z",
+ "shell.execute_reply": "2025-02-12T22:06:55.859524Z"
},
+ "id": "6ZXigQy4w_I4"
+ },
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import torch.optim as optim\n",
+ "import math\n",
+ "from datasets import load_dataset\n",
+ "from collections import defaultdict\n",
+ "from reportlab.lib.pagesizes import A4\n",
+ "from reportlab.pdfgen import canvas\n",
+ "\n",
+ "#############################################\n",
+ "# Turkish-Alpaca Dataset and Tokenizer\n",
+ "#############################################\n",
+ "class TurkishAlpacaDataset:\n",
+ "    def __init__(self, config):\n",
+ "        # Load the dataset from Hugging Face\n",
+ "        dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n",
+ "        self.instructions = dataset['train']['instruction'][:100]  # Limit to 100 samples\n",
+ "        self.outputs = dataset['train']['output'][:100]  # Limit to 100 samples\n",
+ "\n",
+ "        # Create tokenizer\n",
+ "        self.vocab = defaultdict(lambda: len(self.vocab))\n",
+ "        self.vocab[''] = 0  # Add the padding token\n",
+ "\n",
+ "        # Tokenize all the data\n",
+ "        self.tokenize_data()\n",
+ "\n",
+ "        # Build the inverse vocabulary\n",
+ "        self.inverse_vocab = {v: k for k, v in self.vocab.items()}\n",
+ "\n",
+ "        # Dynamically update vocab_size in config\n",
+ "        config.vocab_size = len(self.vocab)\n",
+ "        self.config = config\n",
+ "\n",
+ "    def tokenize_data(self):\n",
+ "        # Tokenize the instructions and outputs\n",
+ "        self.tokenized_instructions = []\n",
+ "        self.tokenized_outputs = []\n",
+ "\n",
+ "        for inst, out in zip(self.instructions, self.outputs):\n",
+ "            inst_tokens = [self.vocab[word] for word in inst.split()]\n",
+ "            out_tokens = [self.vocab[word] for word in out.split()]\n",
+ "            self.tokenized_instructions.append(inst_tokens)\n",
+ "            self.tokenized_outputs.append(out_tokens)\n",
+ "\n",
+ "    def get_batch(self, batch_size=4):\n",
+ "        # Create a random batch\n",
+ "        indices = torch.randint(0, len(self.tokenized_instructions), (batch_size,))\n",
+ "        inputs, targets = [], []\n",
+ "\n",
+ "        for i in indices:\n",
+ "            input_tokens = self.tokenized_instructions[i][:-1]\n",
+ "            target_tokens = 
self.tokenized_outputs[i][1:]\n",
+ "            inputs.append(torch.tensor(input_tokens, dtype=torch.long))\n",
+ "            targets.append(torch.tensor(target_tokens, dtype=torch.long))\n",
+ "\n",
+ "        # Padding operation\n",
+ "        inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)\n",
+ "        targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)\n",
+ "        return inputs, targets\n",
+ "\n",
+ "\n",
+ "#############################################\n",
+ "# Greedy Decode Function\n",
+ "#############################################\n",
+ "def greedy_decode(model, start_token, max_length, device, temperature=1.0):\n",
+ "    \"\"\"\n",
+ "    Greedy decoding to generate sequences from a language model.\n",
+ "\n",
+ "    Args:\n",
+ "        model: The language model to use for generation.\n",
+ "        start_token: The token ID to start decoding from.\n",
+ "        max_length: Maximum length of the generated sequence.\n",
+ "        device: The device (CPU/GPU) where the model resides.\n",
+ "        temperature: Sampling temperature (optional, default=1.0).\n",
+ "\n",
+ "    Returns:\n",
+ "        List of generated token IDs.\n",
+ "    \"\"\"\n",
+ "    model.eval()\n",
+ "    with torch.no_grad():\n",
+ "        input_token = torch.tensor([[start_token]], dtype=torch.long).to(device)\n",
+ "        generated_tokens = [start_token]\n",
+ "\n",
+ "        for _ in range(max_length - 1):\n",
+ "            logits = model(input_token)\n",
+ "            next_token_logits = logits[:, -1, :] / temperature\n",
+ "            next_token = torch.argmax(next_token_logits, dim=-1).item()\n",
+ "\n",
+ "            if next_token == 0:  # Stop if the padding token (ID 0) is generated\n",
+ "                break\n",
+ "\n",
+ "            generated_tokens.append(next_token)\n",
+ "            input_token = torch.cat([input_token, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)\n",
+ "\n",
+ "    return generated_tokens\n",
+ "\n",
+ "\n",
+ "#############################################\n",
+ "# Improved training and evaluation\n",
+ "#############################################\n",
+ "def train_and_evaluate(model, config, epochs=10):\n",
+ "    device = next(model.parameters()).device\n",
+ "    dataset = TurkishAlpacaDataset(config)\n",
+ "    optimizer = optim.AdamW(model.parameters(), lr=1e-4)  # Increased learning rate\n",
+ "    loss_fn = nn.CrossEntropyLoss(ignore_index=0)\n",
+ "\n",
+ "    print(f\"\\n{'='*40}\")\n",
+ "    print(f\"🏁 {model.name} is starting training...\")\n",
+ "    print(f\"🔢 Total Token Count: {len(dataset.vocab)}\")\n",
+ "    print(f\"⚙️ Hardware in use: {'GPU' if device.type=='cuda' else 'CPU'}\")\n",
+ "    print(f\"{'='*40}\\n\")\n",
+ "\n",
+ "    for epoch in range(epochs):\n",
+ "        model.train()\n",
+ "        inputs, targets = dataset.get_batch(batch_size=8)  # Increased batch size\n",
+ "        inputs, targets = inputs.to(device), targets.to(device)\n",
+ "\n",
+ "        optimizer.zero_grad()\n",
+ "        logits = model(inputs)\n",
+ "        logits = logits.view(-1, config.vocab_size)  # Reshape logits\n",
+ "        targets = targets.view(-1)  # Reshape targets\n",
+ "        loss = loss_fn(logits, targets)\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "\n",
+ "        # Training metrics\n",
+ "        preds = torch.argmax(logits, dim=-1)\n",
+ "        mask = targets != 0\n",
+ "        correct = (preds[mask] == targets[mask]).sum().item()\n",
+ "        total = mask.sum().item()\n",
+ "        acc = correct / total if total > 0 else 0\n",
+ "        ppl = math.exp(loss.item())\n",
+ "\n",
+ "        print(f\"Epoch {epoch+1}/{epochs} | \"\n",
+ "              f\"Loss: {loss.item():.3f} | \"\n",
+ "              f\"Accuracy: {acc:.1%} | \"\n",
+ "              f\"Perplexity: {ppl:.2f}\")\n",
+ "\n",
+ "    # Final evaluation\n",
+ "    
model.eval()\n",
+ "    with torch.no_grad():\n",
+ "        inputs, targets = dataset.get_batch(batch_size=8)\n",
+ "        inputs, targets = inputs.to(device), targets.to(device)\n",
+ "        logits = model(inputs)\n",
+ "        logits = logits.view(-1, config.vocab_size)  # Reshape logits\n",
+ "        targets = targets.view(-1)  # Reshape targets\n",
+ "        loss = loss_fn(logits, targets)\n",
+ "\n",
+ "        # Compute the metrics\n",
+ "        preds = torch.argmax(logits, dim=-1)\n",
+ "        mask = targets != 0\n",
+ "        correct = (preds[mask] == targets[mask]).sum().item()\n",
+ "        total = mask.sum().item()\n",
+ "        final_acc = correct / total if total > 0 else 0\n",
+ "        final_ppl = math.exp(loss.item())\n",
+ "\n",
+ "    # Example generation\n",
+ "    start_word = dataset.instructions[0].split()[0]\n",
+ "    input_token = dataset.vocab[start_word]\n",
+ "    generated = greedy_decode(model, input_token, max_length=config.max_length, device=device)\n",
+ "    generated_sentence = ' '.join([dataset.inverse_vocab.get(t, \"?\") for t in generated])\n",
+ "\n",
+ "    print(f\"\\n⭐ Final Performance ⭐\")\n",
+ "    print(f\"|{'Metric':<15}|{'Value':<15}|\")\n",
+ "    print(f\"|{'-'*15}|{'-'*15}|\")\n",
+ "    print(f\"|{'Loss':<15}|{loss.item():.3f}|\")\n",
+ "    print(f\"|{'Accuracy':<15}|{final_acc:.1%}|\")\n",
+ "    print(f\"|{'Perplexity':<15}|{final_ppl:.2f}|\")\n",
+ "    print(f\"\\n🔮 Example Output: {generated_sentence}\")\n",
+ "\n",
+ "    # Return the metrics\n",
+ "    return {\n",
+ "        'parameters': sum(p.numel() for p in model.parameters()),\n",
+ "        'trainable_parameters': sum(p.numel() for p in model.parameters() if p.requires_grad),\n",
+ "        'loss': loss.item(),\n",
+ "        'accuracy': final_acc,\n",
+ "        'perplexity': final_ppl,\n",
+ "        'sample_outputs': [generated_sentence]\n",
+ "    }\n",
+ "\n",
+ "\n",
+ "#############################################\n",
+ "# PDF generation function (with reportlab)\n",
+ "#############################################\n",
+ "def save_results_to_pdf(metrics, model_name):\n",
+ "    # Create the PDF file\n",
+ "    pdf_path = f\"{model_name}_degerlendirme.pdf\"\n",
+ "    c = canvas.Canvas(pdf_path, pagesize=A4)\n",
+ "    width, height = A4\n",
+ "\n",
+ "    # Title\n",
+ "    c.setFont(\"Helvetica-Bold\", 16)\n",
+ "    c.drawString(50, height - 50, f\"Model Evaluation Report: {model_name}\")\n",
+ "\n",
+ "    # Metrics\n",
+ "    c.setFont(\"Helvetica\", 12)\n",
+ "    y = height - 80\n",
+ "    c.drawString(50, y, \"📊 Performance Metrics\")\n",
+ "    y -= 20\n",
+ "    c.drawString(50, y, f\"Total Number of Parameters: {metrics['parameters']:,}\")\n",
+ "    y -= 20\n",
+ "    c.drawString(50, y, f\"Number of Trainable Parameters: {metrics['trainable_parameters']:,}\")\n",
+ "    y -= 20\n",
+ "    c.drawString(50, y, f\"Loss: {metrics['loss']:.3f}\")\n",
+ "    y -= 20\n",
+ "    c.drawString(50, y, f\"Accuracy: {metrics['accuracy']:.1%}\")\n",
+ "    y -= 20\n",
+ "    c.drawString(50, y, f\"Perplexity: {metrics['perplexity']:.2f}\")\n",
+ "\n",
+ "    # Example outputs\n",
+ "    y -= 30\n",
+ "    c.drawString(50, y, \"🔮 Example Outputs\")\n",
+ "    y -= 20\n",
+ "    for i, output in enumerate(metrics['sample_outputs']):\n",
+ "        c.drawString(50, y, f\"Example {i+1}: {output}\")\n",
+ "        y -= 20\n",
+ "\n",
+ "    # Save the PDF\n",
+ "    c.save()\n",
+ "    print(f\"📄 Saved the report for {model_name} as a PDF: {pdf_path}\")\n",
+ "\n",
+ "\n",
+ "#############################################\n",
+ "# Dummy Transformer Model for Testing\n",
+ "#############################################\n",
+ "class TransformerModel(nn.Module):\n",
+ "    def __init__(self, config, attn_type, ffn_type):\n",
+ "        
super().__init__()\n",
+ "        self.name = f\"{attn_type}-{ffn_type}\"\n",
+ "        self.embedding = nn.Embedding(config.vocab_size, config.emb_dim)\n",
+ "        self.transformer = nn.Transformer(\n",
+ "            d_model=config.emb_dim,\n",
+ "            nhead=config.n_heads,\n",
+ "            num_encoder_layers=config.n_layers,\n",
+ "            num_decoder_layers=config.n_layers,\n",
+ "            dim_feedforward=config.emb_dim * 4,\n",
+ "            dropout=config.dropout\n",
+ "        )\n",
+ "        self.fc_out = nn.Linear(config.emb_dim, config.vocab_size)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        x = self.embedding(x)\n",
+ "        x = self.transformer(x, x)\n",
+ "        x = self.fc_out(x)\n",
+ "        return x\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    # Configuration\n",
+ "    class Config:\n",
+ "        def __init__(self):\n",
+ "            self.vocab_size = 100  # This value will be updated dynamically\n",
+ "            self.emb_dim = 256  # Embedding dimension increased\n",
+ "            self.max_length = 32  # Maximum length increased\n",
+ "\n",
+ "            # Compute the maximum length from the dataset\n",
+ "            dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n",
+ "            instructions = dataset['train']['instruction'][:100]  # Limit to 100 samples\n",
+ "            outputs = dataset['train']['output'][:100]  # Limit to 100 samples\n",
+ "\n",
+ "            instruction_lengths = [len(inst.split()) for inst in instructions]\n",
+ "            output_lengths = [len(out.split()) for out in outputs]\n",
+ "\n",
+ "            max_instruction_length = max(instruction_lengths)\n",
+ "            max_output_length = max(output_lengths)\n",
+ "\n",
+ "            # Set max_length based on the maximum instruction and output length\n",
+ "            self.max_length = max(max_instruction_length, max_output_length) + 10  # Leave additional buffer\n",
+ "\n",
+ "            self.n_layers = 4  # Number of layers increased\n",
+ "            self.n_heads = 8  # Number of heads increased\n",
+ "            self.dropout = 0.1\n",
+ "            self.norm_type = 'rmsnorm'\n",
+ "            self.num_experts = 2\n",
+ "\n",
+ "    config = Config()\n",
+ "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ "    # Models to be tested\n",
+ "    experiments = [\n",
+ "        {'attn': 'standard', 'ffn': 'standard', 'name': 'Standard Model'},\n",
+ "        {'attn': 'rope', 'ffn': 'standard', 'name': 'RoPE Attention'},\n",
+ "        {'attn': 'alibi', 'ffn': 'moe', 'name': 'ALiBi + MoE'},\n",
+ "        {'attn': 'multiquery', 'ffn': 'moe', 'name': 'Multi-Query MoE'}\n",
+ "    ]\n",
+ "\n",
+ "    # Run the experiments\n",
+ "    results = []\n",
+ "    for exp in experiments:\n",
+ "        print(f\"\\n{'='*40}\")\n",
+ "        print(f\"🧪 Evaluating {exp['name']}...\")\n",
+ "        print(f\"{'='*40}\")\n",
+ "\n",
+ "        model = TransformerModel(config, exp['attn'], exp['ffn']).to(device)\n",
+ "        model.name = exp['name']\n",
+ "\n",
+ "        # Training and evaluation\n",
+ "        metrics = train_and_evaluate(model, config, epochs=20)  # Increased number of epochs\n",
+ "        results.append((exp['name'], metrics))\n",
+ "\n",
+ "        # Generate the PDF report\n",
+ "        save_results_to_pdf(metrics, exp['name'])\n",
+ "\n",
+ "    # Compare all results\n",
+ "    print(\"\\n📊 Comparison of All Models:\")\n",
+ "    print(f\"|{'Model':<20}|{'Parameters':<10}|{'Accuracy':<10}|{'Perplexity':<12}|\")\n",
+ "    print(f\"|{'-'*20}|{'-'*10}|{'-'*10}|{'-'*12}|\")\n",
+ "    for name, metrics in results:\n",
+ "        print(f\"|{name:<20}|{metrics['parameters']:<10,}|{metrics['accuracy']:<10.1%}|{metrics['perplexity']:<12.2f}|\")"
+ ],
+ "metadata": {
+ "trusted": true,
+ "execution": {
+ "iopub.status.busy": "2025-02-12T22:11:53.299779Z",
+ "iopub.execute_input": "2025-02-12T22:11:53.300209Z",
+ "iopub.status.idle": 
"2025-02-12T22:11:55.659114Z", + "shell.execute_reply.started": "2025-02-12T22:11:53.300168Z", + "shell.execute_reply": "2025-02-12T22:11:55.657535Z" }, - { - "cell_type": "code", - "source": [], - "metadata": { - "trusted": true, - "id": "HhxJ5m_Dw_I5" - }, - "outputs": [], - "execution_count": null - } - ] + "id": "Gdew4V4fw_I4" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "This warning appears because both the CPU and GPU are insufficient for the operation" + ], + "metadata": { + "id": "6f-KvTbfw_I4" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "trusted": true, + "id": "HhxJ5m_Dw_I5" + }, + "outputs": [], + "execution_count": null + } + ] } \ No newline at end of file diff --git a/Genel-4/llada.ipynb b/Genel-4/llada.ipynb index cd17eb2..0f003e1 100644 --- a/Genel-4/llada.ipynb +++ b/Genel-4/llada.ipynb @@ -5,8 +5,8 @@ "id": "9e726e7e", "metadata": {}, "source": [ - "# Difüzyon Temelli Metin Üretimi (SE Data Set ile)\n", - "Bu notebook, HuggingFace'den alınan `salihturkoglu/se_data_set` veri setinin `instruction` ve `response` sütunlarını kullanarak difüzyon temelli metin üretimini gösterir." + "# Diffusion-based text generation (with the SE data set)\n", + "This notebook demonstrates diffusion-based text generation using the `instruction` and `response` columns of the `salihturkoglu/se_data_set` dataset from HuggingFace." ] }, { @@ -14,13 +14,13 @@ "id": "fed42a70", "metadata": {}, "source": [ - "## 1. Veri Setini İndir ve Hazırla\n", - "Veri seti HuggingFace Datasets ile yüklenir. Her bir örnek için `instruction` giriş, `response` ise hedef metindir." + "## 1. Download and prepare the dataset\n", + "Load the dataset with HuggingFace Datasets. For each sample, `instruction` is the input and `response` is the target text." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d67b069a", "metadata": {}, "outputs": [], @@ -32,10 +32,10 @@ "from torch.utils.data import DataLoader, Dataset\n", "import numpy as np\n", "\n", - "# HuggingFace veri setini yükle\n", + "# Load the HuggingFace dataset\n", "dataset = load_dataset('salihturkoglu/se_data_set', split='train')\n", "\n", - "# Tüm örnekleri kullan (877 satır var)\n", + "# Use all examples (877 rows)\n", "instructions = [ex['instruction'] for ex in dataset]\n", "responses = [ex['response'] for ex in dataset]" ] @@ -45,13 +45,13 @@ "id": "5f0ef5f0", "metadata": {}, "source": [ - "## 2. Tokenizer ve Sözlük Oluşturma\n", - "Tüm metinlerden bir kelime sözlüğü oluşturulur ve metinler tokenlara çevrilir." + "## 2. Build the tokenizer and vocabulary\n", + "Build a vocabulary from all text and convert the sentences into tokens." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "447747d9", "metadata": {}, "outputs": [], @@ -61,7 +61,7 @@ "def tokenize(text):\n", " return text.lower().strip().split()\n", "\n", - "# Sözlük oluştur\n", + "# Build the vocabulary\n", "PAD_TOKEN = ''\n", "UNK_TOKEN = ''\n", "all_texts = instructions + responses\n", @@ -86,12 +86,12 @@ "metadata": {}, "source": [ "## 3. PyTorch Dataset ve DataLoader\n", - "Instruction ve response çiftlerini uygun şekilde tensor haline getirir." + "Convert instruction and response pairs into tensors." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bb7245eb", "metadata": {}, "outputs": [], @@ -123,12 +123,12 @@ "id": "651c71c8", "metadata": {}, "source": [ - "## 4. 
Difüzyon Süreci: Gürültü Ekleme ve Çıkarma" + "## 4. Diffusion process: add and remove noise" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d4b3c522", "metadata": {}, "outputs": [], @@ -146,12 +146,12 @@ "id": "5997f6c3", "metadata": {}, "source": [ - "## 5. Model Tanımı" + "## 5. Model definition" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "69d48719", "metadata": {}, "outputs": [], @@ -178,37 +178,15 @@ "id": "e7b7055b", "metadata": {}, "source": [ - "## 6. Eğitim Süreci" + "## 6. Training process" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "4bc574d4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1, Loss: 7.2675\n", - "Epoch 2, Loss: 6.5326\n", - "Epoch 3, Loss: 5.9995\n", - "Epoch 4, Loss: 5.6588\n", - "Epoch 5, Loss: 5.4110\n", - "Epoch 6, Loss: 5.2345\n", - "Epoch 7, Loss: 5.0627\n", - "Epoch 8, Loss: 4.9098\n", - "Epoch 9, Loss: 4.7601\n", - "Epoch 10, Loss: 4.6605\n", - "Epoch 11, Loss: 4.5139\n", - "Epoch 12, Loss: 4.4244\n", - "Epoch 13, Loss: 4.3298\n", - "Epoch 14, Loss: 4.2664\n", - "Epoch 15, Loss: 4.1858\n" - ] - } - ], + "outputs": [], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model = DiffusionTextModel(len(vocab)).to(device)\n", @@ -240,40 +218,22 @@ "id": "6edb3735", "metadata": {}, "source": [ - "## 7. Metin Üretimi (Difüzyon ile Response Oluşturma)" + "## 7. Text generation (producing a response with diffusion)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "a29fd606", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Instruction: Yazılım Mühendisliği bölümünün ders programını görebilir miyim?\n", - "Gerçek Response: Güncel ders programını bölüm web sitesindeki duyurular bölümündeki en güncel ders programı duyurusuna ulaşarak görebilirsiniz.\n", - "Model Response: öğrenci ve eğitimi ve fazla i̇şyeri i̇şyeri öğrenci eğitimi i̇şyeri bilgi i̇şyeri ise, ile ders ile ders en ve en eğitimi ile ders bölümün ile eğitimi öğrenci ders için i̇şyeri ve eğitimi\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\transformer.py:508: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. We recommend specifying layout=torch.jagged when constructing a nested tensor, as this layout receives active development, has better operator coverage, and works with torch.compile. 
(Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\aten\\src\\ATen\\NestedTensorImpl.cpp:182.)\n", - " output = torch._nested_tensor_from_mask(\n" - ] - } - ], + "outputs": [], "source": [ "def generate_response(model, instruction, steps=8, max_len=32):\n", " model.eval()\n", " inp = encode(instruction)[:max_len]\n", " inp = inp + [vocab['']] * (max_len - len(inp))\n", " inp_tensor = torch.tensor([inp], dtype=torch.long, device=device)\n", - " # Başlangıçta tamamen rastgele bir dizi\n", + " # Initially a completely random sequence\n", " generated = torch.randint(2, len(vocab), (1, max_len), device=device)\n", " for step in range(steps):\n", " mask = (generated == vocab[''])\n", @@ -286,10 +246,10 @@ " tokens = generated[0].tolist()\n", " return decode(tokens)\n", "\n", - "# Örnek bir instruction ile response üret\n", + "# Generate a response with a sample instruction\n", "test_instruction = instructions[0]\n", "print('Instruction:', test_instruction)\n", - "print('Gerçek Response:', responses[0])\n", + "print('Ground-truth response:', responses[0])\n", "print('Model Response:', generate_response(model, test_instruction))" ] }, @@ -299,31 +259,21 @@ "metadata": {}, "source": [ "## 9. Test: Herhangi Bir Soru ile Modeli Deneyin\n", - "Aşağıdaki hücrede, istediğiniz bir soruyu `test_instruction` değişkenine yazarak modelin cevabını görebilirsiniz." + "In the cell below, set the `test_instruction` variable to any question to view the model's answer." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "a9e1eacd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Instruction: Ders kaydı yaparken üst sınıftan ders alabilir miyim?\n", - "Gerçek Response: Yok\n", - "Model Response: akademik veya yapılır. öğrenci öğrenci ve ders ders ve ders olan üzerinden ve öğrenci akademik veya öğrenci teknoloji ve veya yönetimi için ders öğrenci veya ve ve bölümün bilgi öğrenci veya ders en olan resmi öğrenci akademik akademik akademik sistemi öğrenci en öğrenci ders öğrenci ile bir öğrenci öğrenci ve bir istediği öğrenci ve ve genellikle sayfasından tüm sistemi öğrenci olmak ilgili değişim öğrenci ders ders öğrenci ve sistemi onaylandıktan öğrenci ders genellikle akademik ve veya ders ders üniversite öğrenci hizmetlerinden fakültesi ve içindeki ve ile bilgi öğrenci ders ve olanakları bir öğrenci akademik veya ve ders bilgi ders öğrenci (katkı ve 4. için ve için da bilgi istediği bilgiye ve fazla ve bir sağlamak ve akademik olan ve ve akademik ve ve sağlanır. öğrenci bir web öğrenci\n" - ] - } - ], + "outputs": [], "source": [ - "# Test etmek istediğiniz soruyu buradan değiştirebilirsiniz.\n", - "test_instruction = \"Ders kaydı yaparken üst sınıftan ders alabilir miyim?\"\n", + "# You can change the question to test here.\n", + "test_instruction = \"Can I take a course from an upper grade during course registration?\"\n", "\n", "print('Instruction:', test_instruction)\n", - "print('Gerçek Response:', responses[instructions.index(test_instruction)] if test_instruction in instructions else \"Yok\")\n", + "print('Ground-truth response:', responses[instructions.index(test_instruction)] if test_instruction in instructions else \"None\")\n", "print('Model Response:', generate_response(model, test_instruction, max_len=max_len))" ] }, @@ -333,23 +283,15 @@ "metadata": {}, "source": [ "## 10. Modelin Test Edilmesi\n", - "Aşağıdaki hücrede modelin test verisi üzerinde ne kadar doğru response üretebildiği ölçülür. 
Basit bir doğruluk metriği olarak, modelin response üretiminde orijinal response ile token bazında ne kadar örtüştüğü hesaplanır." + "The cell below measures how accurately the model can produce responses on the test data. As a simple accuracy metric, it computes how closely the generated response matches the original response token by token." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "83d0357a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test doğruluğu: 1.71% (60/3509)\n" - ] - } - ], + "outputs": [], "source": [ "def evaluate_diffusion_model(model, dataset, n_samples=100, steps=8):\n", " model.eval()\n", @@ -372,7 +314,7 @@ " total += mask.sum().item()\n", " correct += ((generated == tgt) & mask).sum().item()\n", " accuracy = correct / total if total > 0 else 0.0\n", - " print(f\"Test doğruluğu: {accuracy:.2%} ({correct}/{total})\")\n", + " print(f\"Test accuracy: {accuracy:.2%} ({correct}/{total})\")\n", "\n", "# Test et\n", "evaluate_diffusion_model(model, dataset, n_samples=100, steps=8)" @@ -400,4 +342,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git "a/Genel-5/Projeksiyon_Katmanlar\304\261.ipynb" "b/Genel-5/Projeksiyon_Katmanlar\304\261.ipynb" index f8610b8..90d75e0 100644 --- "a/Genel-5/Projeksiyon_Katmanlar\304\261.ipynb" +++ "b/Genel-5/Projeksiyon_Katmanlar\304\261.ipynb" @@ -1,133 +1,124 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyMR0Xdi2l6f+Bd4d9tZW0gS", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMR0Xdi2l6f+Bd4d9tZW0gS", + "include_colab_link": true }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "# Model parametreleri\n", - "d_model = 64 # Modelin gizli boyutu\n", - "d_ff = 256 # Besleme ileri (feed-forward) boyutu\n", - "seq_len = 10 # Girdi sırasının uzunluğu\n", - "batch_size = 8 # Batch boyutu" - ], - "metadata": { - "id": "wpPyAbV0zfeE" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Girdi tensorü (örnek veriler)\n", - "input_tensor = torch.rand(batch_size, seq_len, d_model) # Rastgele veri" - ], - "metadata": { - "id": "kDYRBKL6zgmz" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Projeksiyon Katmanları\n", - "class ProjectionLayers(nn.Module):\n", - " def __init__(self, d_model, d_ff):\n", - " super().__init__()\n", - " # Query, Key, Value Projections\n", - " self.q_proj = nn.Linear(d_model, d_model)\n", - " self.k_proj = nn.Linear(d_model, d_model)\n", - " self.v_proj = nn.Linear(d_model, d_model)\n", - " self.o_proj = nn.Linear(d_model, d_model)\n", - "\n", - " # Feed-forward Projections\n", - " self.gate_proj = nn.Linear(d_model, d_model)\n", - " self.up_proj = nn.Linear(d_model, d_ff)\n", - " self.down_proj = nn.Linear(d_ff, d_model)\n", - "\n", - " def forward(self, x):\n", - " # Attention Projections\n", - " q = self.q_proj(x) # Sorgu\n", - " k = self.k_proj(x) # Anahtar\n", - " v = self.v_proj(x) # Değer\n", - "\n", - " # Dot-product 
attention için hazırlık\n", - " attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (d_model ** 0.5)\n", - " attention_weights = torch.softmax(attention_scores, dim=-1)\n", - " attention_output = torch.matmul(attention_weights, v)\n", - "\n", - " # Output Projeksiyon\n", - " output = self.o_proj(attention_output)\n", - "\n", - " # Feed-forward katmanı\n", - " gated = torch.sigmoid(self.gate_proj(output)) * output # Gated mekanizma\n", - " upsampled = self.up_proj(gated) # Daha yüksek boyut\n", - " downsampled = self.down_proj(upsampled) # Boyut küçültme\n", - " return downsampled" - ], - "metadata": { - "id": "0wAkujy5zjME" - }, - "execution_count": 4, - "outputs": [] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# Model parametreleri\n", + "d_model = 64 # Modelin gizli boyutu\n", + "d_ff = 256 # Besleme ileri (feed-forward) boyutu\n", + "seq_len = 10 # Length of the input sequence\n", + "batch_size = 8 # Batch boyutu" + ], + "metadata": { + "id": "wpPyAbV0zfeE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Input tensor (example data)\n", + "input_tensor = torch.rand(batch_size, seq_len, d_model) # Rastgele veri" + ], + "metadata": { + "id": "kDYRBKL6zgmz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Projection layers\n", + "class ProjectionLayers(nn.Module):\n", + " def __init__(self, d_model, d_ff):\n", + " super().__init__()\n", + " # Query, Key, Value Projections\n", + " self.q_proj = nn.Linear(d_model, d_model)\n", + " self.k_proj = nn.Linear(d_model, d_model)\n", + " self.v_proj = nn.Linear(d_model, d_model)\n", + " self.o_proj = nn.Linear(d_model, d_model)\n", + "\n", + " # Feed-forward Projections\n", + " self.gate_proj = nn.Linear(d_model, d_model)\n", + " self.up_proj = nn.Linear(d_model, d_ff)\n", + " self.down_proj = nn.Linear(d_ff, d_model)\n", + "\n", + " def forward(self, x):\n", + " # Attention Projections\n", + " q = self.q_proj(x) # Sorgu\n", + " k = self.k_proj(x) # Anahtar\n", + " v = self.v_proj(x) # Value\n", + "\n", + " # Prepare for dot-product attention\n", + " attention_scores = torch.matmul(q, k.transpose(-2, -1)) / (d_model ** 0.5)\n", + " attention_weights = torch.softmax(attention_scores, dim=-1)\n", + " attention_output = torch.matmul(attention_weights, v)\n", + "\n", + " # Output Projeksiyon\n", + " output = self.o_proj(attention_output)\n", + "\n", + " # Feed-forward layer\n", + " gated = torch.sigmoid(self.gate_proj(output)) * output # Gated mekanizma\n", + " upsampled = self.up_proj(gated) # Higher dimensional projection\n", + " downsampled = self.down_proj(upsampled) # Reduce the dimensionality\n", + " return downsampled" + ], + "metadata": { + "id": "0wAkujy5zjME" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fBZsytFczPvM", - "outputId": "7f0ddf0f-90e7-4f2e-f63b-a3f6d2007797" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - 
"Girdi Boyutu: torch.Size([8, 10, 64])\n", - "Çıktı Boyutu: torch.Size([8, 10, 64])\n" - ] - } - ], - "source": [ - "# Modeli tanımla ve çalıştır\n", - "model = ProjectionLayers(d_model=d_model, d_ff=d_ff)\n", - "output_tensor = model(input_tensor)\n", - "\n", - "print(\"Girdi Boyutu: \", input_tensor.shape)\n", - "print(\"Çıktı Boyutu: \", output_tensor.shape)\n" - ] - } - ] + "id": "fBZsytFczPvM", + "outputId": "7f0ddf0f-90e7-4f2e-f63b-a3f6d2007797" + }, + "outputs": [], + "source": [ + "# Define and run the model\n", + "model = ProjectionLayers(d_model=d_model, d_ff=d_ff)\n", + "output_tensor = model(input_tensor)\n", + "\n", + "print(\"Input Boyutu: \", input_tensor.shape)\n", + "print(\"Output Boyutu: \", output_tensor.shape)\n" + ] + } + ] } \ No newline at end of file diff --git a/Genel-5/SLM_+_COT_FINETUNE.ipynb b/Genel-5/SLM_+_COT_FINETUNE.ipynb index 24f1c68..f95ec86 100644 --- a/Genel-5/SLM_+_COT_FINETUNE.ipynb +++ b/Genel-5/SLM_+_COT_FINETUNE.ipynb @@ -1,272 +1,272 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyPiG+IsBhY9lUPrholCiKS5", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyPiG+IsBhY9lUPrholCiKS5", + "include_colab_link": true }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "from datasets import load_dataset\n", - "from transformers import (\n", - " AutoModelForCausalLM,\n", - " AutoTokenizer,\n", - " TrainingArguments,\n", - " Trainer,\n", - " DataCollatorForLanguageModeling\n", - ")\n", - "import torch\n", - "from torch.utils.data import Dataset\n", - "from tqdm import tqdm" - ], - "metadata": { - "id": "Di-Z5MYNDD6o" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "class ConversationDataset(Dataset):\n", - " def __init__(self, dataset, tokenizer, max_length=512):\n", - " self.examples = []\n", - "\n", - " print(\"Processing conversations...\")\n", - " for item in tqdm(dataset):\n", - " # Format conversation\n", - " conversation = \"\"\n", - " for turn in item['chosen']:\n", - " role = turn['role']\n", - " content = turn['content']\n", - " if role == 'user':\n", - " conversation += f\"Human: {content}\\n\"\n", - " else:\n", - " conversation += f\"Assistant: {content}\\n\"\n", - "\n", - " # Tokenize\n", - " encodings = tokenizer(\n", - " conversation,\n", - " truncation=True,\n", - " max_length=max_length,\n", - " padding=\"max_length\",\n", - " return_tensors=\"pt\"\n", - " )\n", - "\n", - " self.examples.append({\n", - " \"input_ids\": encodings[\"input_ids\"][0],\n", - " \"attention_mask\": encodings[\"attention_mask\"][0],\n", - " \"labels\": encodings[\"input_ids\"][0].clone()\n", - " })\n", - "\n", - " def __len__(self):\n", - " return len(self.examples)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.examples[idx]" - ], - "metadata": { - "id": "uLZ6RwUwcs_2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "def main():\n", - " # Load dataset\n", - " print(\"Loading dataset...\")\n", - " dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", - "\n", - " # Kullan küçük bir subset (test için)\n", - " 
dataset['train'] = dataset['train'].select(range(1000))\n", - "\n", - " # Load model and tokenizer\n", - " print(\"Loading model and tokenizer...\")\n", - " model_name = \"HuggingFaceTB/SmolLM-135M\"\n", - " model = AutoModelForCausalLM.from_pretrained(model_name)\n", - " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "\n", - " if tokenizer.pad_token is None:\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - " model.config.pad_token_id = tokenizer.pad_token_id\n", - "\n", - " # Split dataset\n", - " print(\"Splitting dataset...\")\n", - " train_size = int(0.9 * len(dataset['train']))\n", - " val_size = len(dataset['train']) - train_size\n", - " train_dataset, val_dataset = torch.utils.data.random_split(\n", - " dataset['train'],\n", - " [train_size, val_size]\n", - " )\n", - "\n", - " # Prepare datasets\n", - " print(\"Preparing training dataset...\")\n", - " train_dataset = ConversationDataset(train_dataset, tokenizer)\n", - " print(\"Preparing validation dataset...\")\n", - " eval_dataset = ConversationDataset(val_dataset, tokenizer)\n", - "\n", - " # Training arguments\n", - " # Training arguments\n", - " training_args = TrainingArguments(\n", - " output_dir=\"./results\",\n", - " num_train_epochs=3,\n", - " per_device_train_batch_size=4,\n", - " per_device_eval_batch_size=4,\n", - " warmup_steps=500,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=100,\n", - " evaluation_strategy=\"steps\",\n", - " eval_steps=500,\n", - " save_strategy=\"steps\",\n", - " save_steps=500,\n", - " load_best_model_at_end=True,\n", - " gradient_accumulation_steps=4,\n", - " fp16=True,\n", - " report_to=\"none\"\n", - " )\n", - "\n", - " # Initialize trainer\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", - " )\n", - "\n", - " # Start training\n", - " trainer.train()\n", - "\n", - " # Save model\n", - " model_save_path = \"./fine_tuned_smolLM\"\n", - " trainer.save_model(model_save_path)\n", - " tokenizer.save_pretrained(model_save_path)" - ], - "metadata": { - "id": "4qdPDUcXdboZ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "if __name__ == \"__main__\":\n", - " main()" - ], - "metadata": { - "id": "Y9a_q7mxdfLA" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Model değerlendirme için gerekli fonksiyonlar ve test kodu\n", - "from transformers import AutoModelForCausalLM, AutoTokenizer\n", - "import torch\n", - "\n", - "def model_yukle():\n", - " model_path = \"./fine_tuned_smolLM\"\n", - " model = AutoModelForCausalLM.from_pretrained(model_path)\n", - " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", - " return model, tokenizer\n", - "\n", - "def yanit_uret(prompt, model, tokenizer, max_length=250):\n", - " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " model = model.to(device)\n", - "\n", - " # CoT formatında prompt hazırla\n", - " formatted_prompt = f\"Question: {prompt}\\nLet's solve this step by step:\\n\"\n", - "\n", - " inputs = tokenizer(formatted_prompt, return_tensors=\"pt\").to(device)\n", - "\n", - " with torch.no_grad():\n", - " outputs = model.generate(\n", - " inputs[\"input_ids\"],\n", - " max_length=max_length,\n", - " num_return_sequences=1,\n", - " temperature=0.7,\n", - " top_p=0.9,\n", - " do_sample=True,\n", - " 
pad_token_id=tokenizer.pad_token_id\n", - " )\n", - "\n", - " return tokenizer.decode(outputs[0], skip_special_tokens=True)\n", - "\n", - "# Chain-of-Thought tarzında test soruları\n", - "test_ornekleri = [\n", - " \"What is the sum of the first 5 prime numbers?\",\n", - " \"What is 1 + 1?\"\n", - "]\n", - "\n", - "# Modeli yükle ve test et\n", - "print(\"Model Evaluation Results:\")\n", - "print(\"-\" * 70)\n", - "\n", - "model, tokenizer = model_yukle()\n", - "\n", - "for ornek in test_ornekleri:\n", - " print(f\"\\nQuestion: {ornek}\")\n", - " yanit = yanit_uret(ornek, model, tokenizer)\n", - " print(f\"Response:\\n{yanit}\")\n", - " print(\"-\" * 70)" - ], - "metadata": { - "id": "3lrV75_AlX14" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", - "dataset[\"train\"][0]" - ], - "metadata": { - "id": "67UiY-lbqrBV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "dataset[\"train\"].shape" - ], - "metadata": { - "id": "hDrgPIDwqtYF" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from datasets import load_dataset\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " TrainingArguments,\n", + " Trainer,\n", + " DataCollatorForLanguageModeling\n", + ")\n", + "import torch\n", + "from torch.utils.data import Dataset\n", + "from tqdm import tqdm" + ], + "metadata": { + "id": "Di-Z5MYNDD6o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class ConversationDataset(Dataset):\n", + " def __init__(self, dataset, tokenizer, max_length=512):\n", + " self.examples = []\n", + "\n", + " print(\"Processing conversations...\")\n", + " for item in tqdm(dataset):\n", + " # Format conversation\n", + " conversation = \"\"\n", + " for turn in item['chosen']:\n", + " role = turn['role']\n", + " content = turn['content']\n", + " if role == 'user':\n", + " conversation += f\"Human: {content}\\n\"\n", + " else:\n", + " conversation += f\"Assistant: {content}\\n\"\n", + "\n", + " # Tokenize\n", + " encodings = tokenizer(\n", + " conversation,\n", + " truncation=True,\n", + " max_length=max_length,\n", + " padding=\"max_length\",\n", + " return_tensors=\"pt\"\n", + " )\n", + "\n", + " self.examples.append({\n", + " \"input_ids\": encodings[\"input_ids\"][0],\n", + " \"attention_mask\": encodings[\"attention_mask\"][0],\n", + " \"labels\": encodings[\"input_ids\"][0].clone()\n", + " })\n", + "\n", + " def __len__(self):\n", + " return len(self.examples)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.examples[idx]" + ], + "metadata": { + "id": "uLZ6RwUwcs_2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def main():\n", + " # Load dataset\n", + " print(\"Loading dataset...\")\n", + " dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", + "\n", + " # Use a small subset (for testing)\n", + " dataset['train'] = dataset['train'].select(range(1000))\n", + "\n", + " # Load model and tokenizer\n", + " print(\"Loading model and tokenizer...\")\n", + " model_name = 
\"HuggingFaceTB/SmolLM-135M\"\n", + " model = AutoModelForCausalLM.from_pretrained(model_name)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "\n", + " if tokenizer.pad_token is None:\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " model.config.pad_token_id = tokenizer.pad_token_id\n", + "\n", + " # Split dataset\n", + " print(\"Splitting dataset...\")\n", + " train_size = int(0.9 * len(dataset['train']))\n", + " val_size = len(dataset['train']) - train_size\n", + " train_dataset, val_dataset = torch.utils.data.random_split(\n", + " dataset['train'],\n", + " [train_size, val_size]\n", + " )\n", + "\n", + " # Prepare datasets\n", + " print(\"Preparing training dataset...\")\n", + " train_dataset = ConversationDataset(train_dataset, tokenizer)\n", + " print(\"Preparing validation dataset...\")\n", + " eval_dataset = ConversationDataset(val_dataset, tokenizer)\n", + "\n", + " # Training arguments\n", + " # Training arguments\n", + " training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " warmup_steps=500,\n", + " weight_decay=0.01,\n", + " logging_dir=\"./logs\",\n", + " logging_steps=100,\n", + " evaluation_strategy=\"steps\",\n", + " eval_steps=500,\n", + " save_strategy=\"steps\",\n", + " save_steps=500,\n", + " load_best_model_at_end=True,\n", + " gradient_accumulation_steps=4,\n", + " fp16=True,\n", + " report_to=\"none\"\n", + " )\n", + "\n", + " # Initialize trainer\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", + " )\n", + "\n", + " # Start training\n", + " trainer.train()\n", + "\n", + " # Save model\n", + " model_save_path = \"./fine_tuned_smolLM\"\n", + " trainer.save_model(model_save_path)\n", + " tokenizer.save_pretrained(model_save_path)" + ], + "metadata": { + "id": "4qdPDUcXdboZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "if __name__ == \"__main__\":\n", + " main()" + ], + "metadata": { + "id": "Y9a_q7mxdfLA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Functions and test code needed for model evaluation\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "import torch\n", + "\n", + "def model_yukle():\n", + " model_path = \"./fine_tuned_smolLM\"\n", + " model = AutoModelForCausalLM.from_pretrained(model_path)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + " return model, tokenizer\n", + "\n", + "def yanit_uret(prompt, model, tokenizer, max_length=250):\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " model = model.to(device)\n", + "\n", + " # Prepare a CoT-style prompt\n", + " formatted_prompt = f\"Question: {prompt}\\nLet's solve this step by step:\\n\"\n", + "\n", + " inputs = tokenizer(formatted_prompt, return_tensors=\"pt\").to(device)\n", + "\n", + " with torch.no_grad():\n", + " outputs = model.generate(\n", + " inputs[\"input_ids\"],\n", + " max_length=max_length,\n", + " num_return_sequences=1,\n", + " temperature=0.7,\n", + " top_p=0.9,\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.pad_token_id\n", + " )\n", + "\n", + " return tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "# Chain-of-Thought style test questions\n", + 
"test_ornekleri = [\n", + " \"What is the sum of the first 5 prime numbers?\",\n", + " \"What is 1 + 1?\"\n", + "]\n", + "\n", + "# Load and test the model\n", + "print(\"Model Evaluation Results:\")\n", + "print(\"-\" * 70)\n", + "\n", + "model, tokenizer = model_yukle()\n", + "\n", + "for ornek in test_ornekleri:\n", + " print(f\"\\nQuestion: {ornek}\")\n", + " yanit = yanit_uret(ornek, model, tokenizer)\n", + " print(f\"Response:\\n{yanit}\")\n", + " print(\"-\" * 70)" + ], + "metadata": { + "id": "3lrV75_AlX14" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset = load_dataset(\"kenhktsui/longtalk-cot-v0.1\")\n", + "dataset[\"train\"][0]" + ], + "metadata": { + "id": "67UiY-lbqrBV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset[\"train\"].shape" + ], + "metadata": { + "id": "hDrgPIDwqtYF" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file diff --git a/Genel-5/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb b/Genel-5/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb index 7b5c5df..45d3d85 100644 --- a/Genel-5/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb +++ b/Genel-5/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb @@ -1,1673 +1,1673 @@ { - "metadata": { - "kernelspec": { - "language": "python", - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.12", - "mimetype": "text/x-python", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "pygments_lexer": "ipython3", - "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kaggle": { - "accelerator": "none", - "dataSources": [], - "dockerImageVersionId": 30887, - "isInternetEnabled": true, - "language": "python", - "sourceType": "notebook", - "isGpuEnabled": false - }, - "colab": { - "provenance": [], - "include_colab_link": true - } + "metadata": { + "kernelspec": { + "language": "python", + "display_name": "Python 3", + "name": "python3" }, - "nbformat_minor": 0, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "source": [ - "# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).\n", - "# Source for \"Build a Large Language Model From Scratch\"\n", - "# - https://www.manning.com/books/build-a-large-language-model-from-scratch\n", - "# Code: https://github.com/rasbt/LLMs-from-scratch\n", - "\n", - "# This file collects all the relevant code that we covered thus far\n", - "# throughout Chapters 2-4.\n", - "# This file can be run as a standalone script.\n", - "\n", - "import tiktoken\n", - "import torch\n", - "import torch.nn as nn\n", - "from torch.utils.data import Dataset, DataLoader\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 2\n", - "#####################################\n", - "\n", - "class GPTDatasetV1(Dataset):\n", - " def __init__(self, txt, tokenizer, max_length, stride):\n", - " self.input_ids = []\n", - " self.target_ids = []\n", - "\n", - " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", - "\n", - " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", - " for i in range(0, len(token_ids) - max_length, stride):\n", - " input_chunk = 
token_ids[i:i + max_length]\n", - " target_chunk = token_ids[i + 1: i + max_length + 1]\n", - " self.input_ids.append(torch.tensor(input_chunk))\n", - " self.target_ids.append(torch.tensor(target_chunk))\n", - "\n", - " def __len__(self):\n", - " return len(self.input_ids)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.input_ids[idx], self.target_ids[idx]\n", - "\n", - "\n", - "def create_dataloader_v1(txt, batch_size=4, max_length=256,\n", - " stride=128, shuffle=True, drop_last=True, num_workers=0):\n", - " # Initialize the tokenizer\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - "\n", - " # Create dataset\n", - " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", - "\n", - " # Create dataloader\n", - " dataloader = DataLoader(\n", - " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", - "\n", - " return dataloader\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 3\n", - "#####################################\n", - "\n", - "class MultiHeadAttention(nn.Module):\n", - " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", - " super().__init__()\n", - " assert d_out % num_heads == 0, \"d_out must be divisible by n_heads\"\n", - "\n", - " self.d_out = d_out\n", - " self.num_heads = num_heads\n", - " self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim\n", - "\n", - " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", - "\n", - " def forward(self, x):\n", - " b, num_tokens, d_in = x.shape\n", - "\n", - " keys = self.W_key(x) # Shape: (b, num_tokens, d_out)\n", - " queries = self.W_query(x)\n", - " values = self.W_value(x)\n", - "\n", - " # We implicitly split the matrix by adding a `num_heads` dimension\n", - " # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)\n", - " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", - " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", - " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", - "\n", - " # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)\n", - " keys = keys.transpose(1, 2)\n", - " queries = queries.transpose(1, 2)\n", - " values = values.transpose(1, 2)\n", - "\n", - " # Compute scaled dot-product attention (aka self-attention) with a causal mask\n", - " attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head\n", - "\n", - " # Original mask truncated to the number of tokens and converted to boolean\n", - " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", - "\n", - " # Use the mask to fill attention scores\n", - " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", - "\n", - " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", - " attn_weights = self.dropout(attn_weights)\n", - "\n", - " # Shape: (b, num_tokens, num_heads, head_dim)\n", - " context_vec = (attn_weights @ values).transpose(1, 2)\n", - "\n", - " # Combine heads, where self.d_out = self.num_heads * self.head_dim\n", - " context_vec = context_vec.reshape(b, 
num_tokens, self.d_out)\n", - " context_vec = self.out_proj(context_vec) # optional projection\n", - "\n", - " return context_vec\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 4\n", - "#####################################\n", - "\n", - "class LayerNorm(nn.Module):\n", - " def __init__(self, emb_dim):\n", - " super().__init__()\n", - " self.eps = 1e-5\n", - " self.scale = nn.Parameter(torch.ones(emb_dim))\n", - " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", - "\n", - " def forward(self, x):\n", - " mean = x.mean(dim=-1, keepdim=True)\n", - " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", - " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", - " return self.scale * norm_x + self.shift\n", - "\n", - "\n", - "class GELU(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - "\n", - " def forward(self, x):\n", - " return 0.5 * x * (1 + torch.tanh(\n", - " torch.sqrt(torch.tensor(2.0 / torch.pi)) *\n", - " (x + 0.044715 * torch.pow(x, 3))\n", - " ))\n", - "\n", - "\n", - "class FeedForward(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.layers = nn.Sequential(\n", - " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", - " GELU(),\n", - " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.layers(x)\n", - "\n", - "\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.att = MultiHeadAttention(\n", - " d_in=cfg[\"emb_dim\"],\n", - " d_out=cfg[\"emb_dim\"],\n", - " context_length=cfg[\"context_length\"],\n", - " num_heads=cfg[\"n_heads\"],\n", - " dropout=cfg[\"drop_rate\"],\n", - " qkv_bias=cfg[\"qkv_bias\"])\n", - " self.ff = FeedForward(cfg)\n", - " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " def forward(self, x):\n", - " # Shortcut connection for attention block\n", - " shortcut = x\n", - " x = self.norm1(x)\n", - " x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut # Add the original input back\n", - "\n", - " # Shortcut connection for feed-forward block\n", - " shortcut = x\n", - " x = self.norm2(x)\n", - " x = self.ff(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut # Add the original input back\n", - "\n", - " return x\n", - "\n", - "\n", - "class GPTModel(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", - " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", - " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " self.trf_blocks = nn.Sequential(\n", - " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", - "\n", - " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", - " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", - "\n", - " def forward(self, in_idx):\n", - " batch_size, seq_len = in_idx.shape\n", - " tok_embeds = self.tok_emb(in_idx)\n", - " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", - " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n", - " x = self.drop_emb(x)\n", - " x = self.trf_blocks(x)\n", - " x = self.final_norm(x)\n", - " logits = self.out_head(x)\n", - " return logits\n", - "\n", - "\n", - "def 
generate_text_simple(model, idx, max_new_tokens, context_size):\n", - " # idx is (B, T) array of indices in the current context\n", - " for _ in range(max_new_tokens):\n", - "\n", - " # Crop current context if it exceeds the supported context size\n", - " # E.g., if LLM supports only 5 tokens, and the context size is 10\n", - " # then only the last 5 tokens are used as context\n", - " idx_cond = idx[:, -context_size:]\n", - "\n", - " # Get the predictions\n", - " with torch.no_grad():\n", - " logits = model(idx_cond)\n", - "\n", - " # Focus only on the last time step\n", - " # (batch, n_token, vocab_size) becomes (batch, vocab_size)\n", - " logits = logits[:, -1, :]\n", - "\n", - " # Get the idx of the vocab entry with the highest logits value\n", - " idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1)\n", - "\n", - " # Append sampled index to the running sequence\n", - " idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)\n", - "\n", - " return idx\n", - "\n", - "\n", - "#####################################\n", - "# Chapter 5\n", - "####################################\n", - "\n", - "\n", - "def calc_loss_batch(input_batch, target_batch, model, device):\n", - " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", - " logits = model(input_batch)\n", - " loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", - " return loss\n", - "\n", - "\n", - "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", - " total_loss = 0.\n", - " if len(data_loader) == 0:\n", - " return float(\"nan\")\n", - " elif num_batches is None:\n", - " num_batches = len(data_loader)\n", - " else:\n", - " num_batches = min(num_batches, len(data_loader))\n", - " for i, (input_batch, target_batch) in enumerate(data_loader):\n", - " if i < num_batches:\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " total_loss += loss.item()\n", - " else:\n", - " break\n", - " return total_loss / num_batches\n", - "\n", - "\n", - "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", - " model.eval()\n", - " with torch.no_grad():\n", - " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", - " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", - " model.train()\n", - " return train_loss, val_loss\n", - "\n", - "\n", - "def generate_and_print_sample(model, tokenizer, device, start_context):\n", - " model.eval()\n", - " context_size = model.pos_emb.weight.shape[0]\n", - " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", - " with torch.no_grad():\n", - " token_ids = generate_text_simple(\n", - " model=model, idx=encoded,\n", - " max_new_tokens=50, context_size=context_size)\n", - " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", - " print(decoded_text.replace(\"\\n\", \" \")) # Compact print format\n", - " model.train()\n", - "\n", - "\n", - "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", - " fig, ax1 = plt.subplots(figsize=(5, 3))\n", - "\n", - " # Plot training and validation loss against epochs\n", - " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", - " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", - " ax1.set_xlabel(\"Epochs\")\n", - " ax1.set_ylabel(\"Loss\")\n", - " ax1.legend(loc=\"upper right\")\n", - "\n", - " # Create a second x-axis for tokens seen\n", - " ax2 = ax1.twiny() # Create a second x-axis 
that shares the same y-axis\n", - " ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks\n", - " ax2.set_xlabel(\"Tokens seen\")\n", - "\n", - " fig.tight_layout() # Adjust layout to make room\n", - " # plt.show()\n", - "\n", - "\n", - "def text_to_token_ids(text, tokenizer):\n", - " encoded = tokenizer.encode(text)\n", - " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", - " return encoded_tensor\n", - "\n", - "\n", - "def token_ids_to_text(token_ids, tokenizer):\n", - " flat = token_ids.squeeze(0) # remove batch dimension\n", - " return tokenizer.decode(flat.tolist())\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:03:47.011719Z", - "iopub.execute_input": "2025-02-12T22:03:47.01204Z", - "iopub.status.idle": "2025-02-12T22:03:47.049181Z", - "shell.execute_reply.started": "2025-02-12T22:03:47.012016Z", - "shell.execute_reply": "2025-02-12T22:03:47.0478Z" - }, - "id": "ML-KCXoIw_I0" - }, - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "source": [ - "import time\n", - "import torch\n", - "import torch.optim as optim\n", - "\n", - "#####################################\n", - "# Ayarlar ve Veri Hazırlığı\n", - "#####################################\n", - "\n", - "def load_data():\n", - " # Demo amaçlı küçük bir metin. Gerçek uygulamada daha büyük bir corpus kullanılmalı.\n", - " text = (\"Once upon a time, in a land far, far away, there was a kingdom where magic was common \"\n", - " \"and adventure awaited around every corner. \") * 100 # metni tekrarlayarak uzunluyoruz\n", - " return text\n", - "\n", - "def prepare_dataloaders(text, batch_size=4, max_length=128, stride=64):\n", - " # Eğitim ve doğrulama için veriyi bölelim (örneğin, %90 eğitim, %10 doğrulama)\n", - " split_idx = int(0.9 * len(text))\n", - " train_text = text[:split_idx]\n", - " val_text = text[split_idx:]\n", - " train_loader = create_dataloader_v1(train_text, batch_size=batch_size,\n", - " max_length=max_length, stride=stride)\n", - " val_loader = create_dataloader_v1(val_text, batch_size=batch_size,\n", - " max_length=max_length, stride=stride)\n", - " return train_loader, val_loader\n", - "\n", - "#####################################\n", - "# Model Eğitimi\n", - "#####################################\n", - "\n", - "def train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10):\n", - " optimizer = optim.Adam(model.parameters(), lr=3e-4)\n", - " model.to(device)\n", - "\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " epoch_loss = 0.0\n", - " start_time = time.time()\n", - "\n", - " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", - " optimizer.zero_grad()\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " loss.backward()\n", - " optimizer.step()\n", - " epoch_loss += loss.item()\n", - "\n", - " if (batch_idx + 1) % 10 == 0:\n", - " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", - "\n", - " avg_loss = epoch_loss / len(train_loader)\n", - " elapsed = time.time() - start_time\n", - " print(f\"Epoch {epoch+1} tamamlandı (süre: {elapsed:.2f}s), ort. 
loss: {avg_loss:.4f}\")\n", - "\n", - " # Kısa bir değerlendirme: eğitim ve doğrulama loss değerlerini hesapla\n", - " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", - " print(f\"Epoch {epoch+1} değerlendirme: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", - "\n", - "#####################################\n", - "# Metin Üretimi\n", - "#####################################\n", - "\n", - "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", - " print(\"Üretilen metin örneği:\\n\")\n", - " generate_and_print_sample(model, tokenizer, device, prompt)\n", - "\n", - "#####################################\n", - "# Ana Fonksiyon\n", - "#####################################\n", - "\n", - "def main():\n", - " # Cihaz seçimi (GPU varsa kullanılır)\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " print(f\"Kullanılan cihaz: {device}\")\n", - "\n", - " # Veri hazırlığı\n", - " text = load_data()\n", - " train_loader, val_loader = prepare_dataloaders(text, batch_size=4, max_length=128, stride=64)\n", - "\n", - " # Tokenizer ve model konfigürasyonu\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " cfg = {\n", - " \"vocab_size\": tokenizer.n_vocab, # Tokenizer'ın sözlüğündeki kelime sayısı\n", - " \"emb_dim\": 128, # Küçük bir embedding boyutu (demo amaçlı)\n", - " \"context_length\": 128, # Maksimum dizi uzunluğu\n", - " \"drop_rate\": 0.1,\n", - " \"n_layers\": 8, # Katman sayısı\n", - " \"n_heads\": 4, # Çoklu başlık sayısı (emb_dim'in tam böleni olmalı)\n", - " \"qkv_bias\": True,\n", - " }\n", - "\n", - " # Model oluşturulması\n", - " model = GPTModel(cfg)\n", - "\n", - " # Modelin eğitimi\n", - " train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10)\n", - "\n", - " # Eğitim bittikten sonra, bir başlangıç prompt'u ile metin üretimi yapalım\n", - " prompt = \"Once upon a time \"\n", - " generate_sample(model, tokenizer, device, prompt, max_new_tokens=50)\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T18:01:02.043298Z", - "iopub.execute_input": "2025-02-12T18:01:02.043586Z", - "iopub.status.idle": "2025-02-12T18:01:04.598147Z", - "shell.execute_reply.started": "2025-02-12T18:01:02.043567Z", - "shell.execute_reply": "2025-02-12T18:01:04.597344Z" - }, - "id": "DEJyddCuw_I3" - }, - "outputs": [], - "execution_count": null + "language_info": { + "name": "python", + "version": "3.10.12", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kaggle": { + "accelerator": "none", + "dataSources": [], + "dockerImageVersionId": 30887, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook", + "isGpuEnabled": false + }, + "colab": { + "provenance": [], + "include_colab_link": true + } + }, + "nbformat_minor": 0, + "nbformat": 4, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).\n", + "# Source for \"Build a Large Language Model From Scratch\"\n", + "# - https://www.manning.com/books/build-a-large-language-model-from-scratch\n", + "# Code: 
https://github.com/rasbt/LLMs-from-scratch\n", + "\n", + "# This file collects all the relevant code that we covered thus far\n", + "# throughout Chapters 2-4.\n", + "# This file can be run as a standalone script.\n", + "\n", + "import tiktoken\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 2\n", + "#####################################\n", + "\n", + "class GPTDatasetV1(Dataset):\n", + " def __init__(self, txt, tokenizer, max_length, stride):\n", + " self.input_ids = []\n", + " self.target_ids = []\n", + "\n", + " # Tokenize the entire text\n", + " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", + "\n", + " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", + " for i in range(0, len(token_ids) - max_length, stride):\n", + " input_chunk = token_ids[i:i + max_length]\n", + " target_chunk = token_ids[i + 1: i + max_length + 1]\n", + " self.input_ids.append(torch.tensor(input_chunk))\n", + " self.target_ids.append(torch.tensor(target_chunk))\n", + "\n", + " def __len__(self):\n", + " return len(self.input_ids)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.input_ids[idx], self.target_ids[idx]\n", + "\n", + "\n", + "def create_dataloader_v1(txt, batch_size=4, max_length=256,\n", + " stride=128, shuffle=True, drop_last=True, num_workers=0):\n", + " # Initialize the tokenizer\n", + " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "\n", + " # Create dataset\n", + " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", + "\n", + " # Create dataloader\n", + " dataloader = DataLoader(\n", + " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", + "\n", + " return dataloader\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 3\n", + "#####################################\n", + "\n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):\n", + " super().__init__()\n", + " assert d_out % num_heads == 0, \"d_out must be divisible by n_heads\"\n", + "\n", + " self.d_out = d_out\n", + " self.num_heads = num_heads\n", + " self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim\n", + "\n", + " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", + "\n", + " def forward(self, x):\n", + " b, num_tokens, d_in = x.shape\n", + "\n", + " keys = self.W_key(x) # Shape: (b, num_tokens, d_out)\n", + " queries = self.W_query(x)\n", + " values = self.W_value(x)\n", + "\n", + " # We implicitly split the matrix by adding a `num_heads` dimension\n", + " # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)\n", + " keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)\n", + " values = values.view(b, num_tokens, self.num_heads, self.head_dim)\n", + " queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)\n", + "\n", + " # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, 
num_heads, num_tokens, head_dim)\n", + " keys = keys.transpose(1, 2)\n", + " queries = queries.transpose(1, 2)\n", + " values = values.transpose(1, 2)\n", + "\n", + " # Compute scaled dot-product attention (aka self-attention) with a causal mask\n", + " attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head\n", + "\n", + " # Original mask truncated to the number of tokens and converted to boolean\n", + " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", + "\n", + " # Use the mask to fill attention scores\n", + " attn_scores.masked_fill_(mask_bool, -torch.inf)\n", + "\n", + " attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n", + " attn_weights = self.dropout(attn_weights)\n", + "\n", + " # Shape: (b, num_tokens, num_heads, head_dim)\n", + " context_vec = (attn_weights @ values).transpose(1, 2)\n", + "\n", + " # Combine heads, where self.d_out = self.num_heads * self.head_dim\n", + " context_vec = context_vec.reshape(b, num_tokens, self.d_out)\n", + " context_vec = self.out_proj(context_vec) # optional projection\n", + "\n", + " return context_vec\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 4\n", + "#####################################\n", + "\n", + "class LayerNorm(nn.Module):\n", + " def __init__(self, emb_dim):\n", + " super().__init__()\n", + " self.eps = 1e-5\n", + " self.scale = nn.Parameter(torch.ones(emb_dim))\n", + " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", + "\n", + " def forward(self, x):\n", + " mean = x.mean(dim=-1, keepdim=True)\n", + " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", + " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", + " return self.scale * norm_x + self.shift\n", + "\n", + "\n", + "class GELU(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " def forward(self, x):\n", + " return 0.5 * x * (1 + torch.tanh(\n", + " torch.sqrt(torch.tensor(2.0 / torch.pi)) *\n", + " (x + 0.044715 * torch.pow(x, 3))\n", + " ))\n", + "\n", + "\n", + "class FeedForward(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.layers = nn.Sequential(\n", + " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", + " GELU(),\n", + " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.layers(x)\n", + "\n", + "\n", + "class TransformerBlock(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.att = MultiHeadAttention(\n", + " d_in=cfg[\"emb_dim\"],\n", + " d_out=cfg[\"emb_dim\"],\n", + " context_length=cfg[\"context_length\"],\n", + " num_heads=cfg[\"n_heads\"],\n", + " dropout=cfg[\"drop_rate\"],\n", + " qkv_bias=cfg[\"qkv_bias\"])\n", + " self.ff = FeedForward(cfg)\n", + " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " def forward(self, x):\n", + " # Shortcut connection for attention block\n", + " shortcut = x\n", + " x = self.norm1(x)\n", + " x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut # Add the original input back\n", + "\n", + " # Shortcut connection for feed-forward block\n", + " shortcut = x\n", + " x = self.norm2(x)\n", + " x = self.ff(x)\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut # Add the original input back\n", + "\n", + " return x\n", + "\n", + "\n", + "class GPTModel(nn.Module):\n", + " def __init__(self, 
cfg):\n", + " super().__init__()\n", + " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", + " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", + " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " self.trf_blocks = nn.Sequential(\n", + " *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", + "\n", + " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", + " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", + "\n", + " def forward(self, in_idx):\n", + " batch_size, seq_len = in_idx.shape\n", + " tok_embeds = self.tok_emb(in_idx)\n", + " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", + " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n", + " x = self.drop_emb(x)\n", + " x = self.trf_blocks(x)\n", + " x = self.final_norm(x)\n", + " logits = self.out_head(x)\n", + " return logits\n", + "\n", + "\n", + "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", + " # idx is (B, T) array of indices in the current context\n", + " for _ in range(max_new_tokens):\n", + "\n", + " # Crop current context if it exceeds the supported context size\n", + " # E.g., if LLM supports only 5 tokens, and the context size is 10\n", + " # then only the last 5 tokens are used as context\n", + " idx_cond = idx[:, -context_size:]\n", + "\n", + " # Get the predictions\n", + " with torch.no_grad():\n", + " logits = model(idx_cond)\n", + "\n", + " # Focus only on the last time step\n", + " # (batch, n_token, vocab_size) becomes (batch, vocab_size)\n", + " logits = logits[:, -1, :]\n", + "\n", + " # Get the idx of the vocab entry with the highest logits value\n", + " idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1)\n", + "\n", + " # Append sampled index to the running sequence\n", + " idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)\n", + "\n", + " return idx\n", + "\n", + "\n", + "#####################################\n", + "# Chapter 5\n", + "####################################\n", + "\n", + "\n", + "def calc_loss_batch(input_batch, target_batch, model, device):\n", + " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", + " logits = model(input_batch)\n", + " loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", + " return loss\n", + "\n", + "\n", + "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", + " total_loss = 0.\n", + " if len(data_loader) == 0:\n", + " return float(\"nan\")\n", + " elif num_batches is None:\n", + " num_batches = len(data_loader)\n", + " else:\n", + " num_batches = min(num_batches, len(data_loader))\n", + " for i, (input_batch, target_batch) in enumerate(data_loader):\n", + " if i < num_batches:\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " total_loss += loss.item()\n", + " else:\n", + " break\n", + " return total_loss / num_batches\n", + "\n", + "\n", + "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", + " model.eval()\n", + " with torch.no_grad():\n", + " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", + " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", + " model.train()\n", + " return train_loss, val_loss\n", + "\n", + "\n", + "def generate_and_print_sample(model, tokenizer, device, start_context):\n", + " model.eval()\n", + " context_size = model.pos_emb.weight.shape[0]\n", + " 
encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", + " with torch.no_grad():\n", + " token_ids = generate_text_simple(\n", + " model=model, idx=encoded,\n", + " max_new_tokens=50, context_size=context_size)\n", + " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", + " print(decoded_text.replace(\"\\n\", \" \")) # Compact print format\n", + " model.train()\n", + "\n", + "\n", + "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", + " fig, ax1 = plt.subplots(figsize=(5, 3))\n", + "\n", + " # Plot training and validation loss against epochs\n", + " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", + " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", + " ax1.set_xlabel(\"Epochs\")\n", + " ax1.set_ylabel(\"Loss\")\n", + " ax1.legend(loc=\"upper right\")\n", + "\n", + " # Create a second x-axis for tokens seen\n", + " ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis\n", + " ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks\n", + " ax2.set_xlabel(\"Tokens seen\")\n", + "\n", + " fig.tight_layout() # Adjust layout to make room\n", + " # plt.show()\n", + "\n", + "\n", + "def text_to_token_ids(text, tokenizer):\n", + " encoded = tokenizer.encode(text)\n", + " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", + " return encoded_tensor\n", + "\n", + "\n", + "def token_ids_to_text(token_ids, tokenizer):\n", + " flat = token_ids.squeeze(0) # remove batch dimension\n", + " return tokenizer.decode(flat.tolist())\n" + ], + "metadata": { + "trusted": true, + "execution": { + "iopub.status.busy": "2025-02-12T22:03:47.011719Z", + "iopub.execute_input": "2025-02-12T22:03:47.01204Z", + "iopub.status.idle": "2025-02-12T22:03:47.049181Z", + "shell.execute_reply.started": "2025-02-12T22:03:47.012016Z", + "shell.execute_reply": "2025-02-12T22:03:47.0478Z" }, - { - "cell_type": "code", - "source": [ - "import time\n", - "import math\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "from torch.utils.data import Dataset, DataLoader\n", - "import matplotlib.pyplot as plt\n", - "import tiktoken\n", - "from datasets import load_dataset # Hugging Face datasets library\n", - "import re\n", - "\n", - "#####################################\n", - "# Rotary Positional Embeddings (ROPE) Implementation\n", - "#####################################\n", - "def apply_rotary_pos_emb(x):\n", - " \"\"\"\n", - " Apply Rotary Positional Embeddings (ROPE) to the input tensor.\n", - "\n", - " Args:\n", - " x (torch.Tensor): Input tensor of shape (batch, num_heads, seq_len, head_dim).\n", - "\n", - " Returns:\n", - " torch.Tensor: Tensor with ROPE applied.\n", - " \"\"\"\n", - " batch, n_heads, seq_len, head_dim = x.shape\n", - " assert head_dim % 2 == 0, \"head_dim must be even for ROPE\"\n", - "\n", - " # Calculate inverse frequencies and positions\n", - " inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", - " positions = torch.arange(seq_len, device=x.device).float()\n", - " sinusoid_inp = torch.einsum(\"i,j->ij\", positions, inv_freq) # (seq_len, head_dim/2)\n", - " sin = torch.sin(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", - " cos = torch.cos(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", - "\n", - " # Split the input tensor into two halves and apply ROPE\n", - " x1, x2 = x[..., :head_dim//2], x[..., head_dim//2:]\n", - " x_rotated 
= torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", - " return x_rotated\n", - "\n", - "#####################################\n", - "# Dataset and DataLoader: Wikitext (Hugging Face)\n", - "#####################################\n", - "class GPTDatasetV1(Dataset):\n", - " def __init__(self, text, tokenizer, max_length, stride):\n", - " self.input_ids = []\n", - " self.target_ids = []\n", - "\n", - " # Tokenize the text\n", - " token_ids = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", - "\n", - " # Create input-target pairs\n", - " for i in range(0, len(token_ids) - max_length, stride):\n", - " input_chunk = token_ids[i:i + max_length]\n", - " target_chunk = token_ids[i + 1: i + max_length + 1]\n", - " self.input_ids.append(torch.tensor(input_chunk))\n", - " self.target_ids.append(torch.tensor(target_chunk))\n", - "\n", - " def __len__(self):\n", - " return len(self.input_ids)\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.input_ids[idx], self.target_ids[idx]\n", - "\n", - "def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " dataset = GPTDatasetV1(text, tokenizer, max_length, stride)\n", - " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", - " return dataloader\n", - "\n", - "def load_wikitext_data(num_lines=10000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\"):\n", - " \"\"\"\n", - " Load Wikitext data from Hugging Face and concatenate the first `num_lines` lines into a single text.\n", - "\n", - " Args:\n", - " num_lines (int): Number of lines to load.\n", - " dataset_name (str): Name of the dataset.\n", - " subset (str): Subset of the dataset.\n", - "\n", - " Returns:\n", - " str: Concatenated text.\n", - " \"\"\"\n", - " ds = load_dataset(dataset_name, subset)\n", - " text_lines = ds[\"train\"][\"text\"][:num_lines]\n", - " text = \"\\n\".join(text_lines)\n", - " return text\n", - "\n", - "def preprocess_text(text):\n", - " \"\"\"\n", - " Preprocess the text data by removing unwanted characters and normalizing whitespace.\n", - "\n", - " Args:\n", - " text (str): Input text.\n", - "\n", - " Returns:\n", - " str: Preprocessed text.\n", - " \"\"\"\n", - " # Remove special characters and digits, and normalize whitespace\n", - " text = re.sub(r'[^A-Za-z\\s]', '', text)\n", - " text = re.sub(r'\\s+', ' ', text).strip()\n", - " return text\n", - "\n", - "#####################################\n", - "# Advanced Model Components (GPTModel)\n", - "#####################################\n", - "\n", - "class MultiHeadAttention(nn.Module):\n", - " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, use_rope=False):\n", - " super().__init__()\n", - " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", - " self.d_out = d_out\n", - " self.num_heads = num_heads\n", - " self.head_dim = d_out // num_heads\n", - "\n", - " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)\n", - " self.out_proj = nn.Linear(d_out, d_out)\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.use_rope = use_rope\n", - " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", - "\n", - " def forward(self, x):\n", - " b, num_tokens, _ = x.shape\n", - 
"\n", - " keys = self.W_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - " queries = self.W_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - " values = self.W_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", - "\n", - " if self.use_rope:\n", - " queries = apply_rotary_pos_emb(queries)\n", - " keys = apply_rotary_pos_emb(keys)\n", - "\n", - " attn_scores = queries @ keys.transpose(2, 3)\n", - " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", - " attn_scores.masked_fill_(mask_bool, float(\"-inf\"))\n", - "\n", - " attn_weights = torch.softmax(attn_scores / math.sqrt(self.head_dim), dim=-1)\n", - " attn_weights = self.dropout(attn_weights)\n", - "\n", - " context_vec = (attn_weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)\n", - " context_vec = self.out_proj(context_vec)\n", - " return context_vec\n", - "\n", - "class LayerNorm(nn.Module):\n", - " def __init__(self, emb_dim):\n", - " super().__init__()\n", - " self.eps = 1e-5\n", - " self.scale = nn.Parameter(torch.ones(emb_dim))\n", - " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", - "\n", - " def forward(self, x):\n", - " mean = x.mean(dim=-1, keepdim=True)\n", - " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", - " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", - " return self.scale * norm_x + self.shift\n", - "\n", - "class GELU(nn.Module):\n", - " def forward(self, x):\n", - " return 0.5 * x * (1 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n", - "\n", - "class FeedForward(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.layers = nn.Sequential(\n", - " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", - " GELU(),\n", - " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " return self.layers(x)\n", - "\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.att = MultiHeadAttention(\n", - " d_in=cfg[\"emb_dim\"],\n", - " d_out=cfg[\"emb_dim\"],\n", - " context_length=cfg[\"context_length\"],\n", - " num_heads=cfg[\"n_heads\"],\n", - " dropout=cfg[\"drop_rate\"],\n", - " qkv_bias=cfg[\"qkv_bias\"],\n", - " use_rope=cfg.get(\"use_rope\", False)\n", - " )\n", - " self.ff = FeedForward(cfg)\n", - " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", - " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " def forward(self, x):\n", - " shortcut = x\n", - " x = self.norm1(x)\n", - " x = self.att(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut\n", - "\n", - " shortcut = x\n", - " x = self.norm2(x)\n", - " x = self.ff(x)\n", - " x = self.drop_shortcut(x)\n", - " x = x + shortcut\n", - " return x\n", - "\n", - "class GPTModel(nn.Module):\n", - " def __init__(self, cfg):\n", - " super().__init__()\n", - " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", - " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", - " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", - "\n", - " self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n", - "\n", - " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n", - " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n", - "\n", - " def forward(self, in_idx):\n", - " batch_size, seq_len = in_idx.shape\n", - " tok_embeds 
= self.tok_emb(in_idx)\n", - " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n", - " x = tok_embeds + pos_embeds\n", - " x = self.drop_emb(x)\n", - " x = self.trf_blocks(x)\n", - " x = self.final_norm(x)\n", - " logits = self.out_head(x)\n", - " return logits\n", - "\n", - "#####################################\n", - "# Training and Evaluation Functions\n", - "#####################################\n", - "def calc_loss_batch(input_batch, target_batch, model, device):\n", - " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n", - " logits = model(input_batch)\n", - " loss = nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n", - " return loss\n", - "\n", - "def calc_loss_loader(data_loader, model, device, num_batches=None):\n", - " total_loss = 0.0\n", - " if len(data_loader) == 0:\n", - " return float(\"nan\")\n", - " num_batches = num_batches if num_batches is not None else len(data_loader)\n", - " num_batches = min(num_batches, len(data_loader))\n", - " for i, (input_batch, target_batch) in enumerate(data_loader):\n", - " if i < num_batches:\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " total_loss += loss.item()\n", - " else:\n", - " break\n", - " return total_loss / num_batches\n", - "\n", - "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n", - " model.eval()\n", - " with torch.no_grad():\n", - " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n", - " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n", - " model.train()\n", - " return train_loss, val_loss\n", - "\n", - "def generate_text_simple(model, idx, max_new_tokens, context_size):\n", - " for _ in range(max_new_tokens):\n", - " idx_cond = idx[:, -context_size:]\n", - " with torch.no_grad():\n", - " logits = model(idx_cond)\n", - " logits = logits[:, -1, :]\n", - " idx_next = torch.argmax(logits, dim=-1, keepdim=True)\n", - " idx = torch.cat((idx, idx_next), dim=1)\n", - " return idx\n", - "\n", - "def generate_and_print_sample(model, tokenizer, device, start_context):\n", - " model.eval()\n", - " context_size = model.pos_emb.weight.shape[0]\n", - " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n", - " with torch.no_grad():\n", - " token_ids = generate_text_simple(model=model, idx=encoded, max_new_tokens=50, context_size=context_size)\n", - " decoded_text = token_ids_to_text(token_ids, tokenizer)\n", - " print(decoded_text.replace(\"\\n\", \" \"))\n", - " model.train()\n", - "\n", - "def text_to_token_ids(text, tokenizer):\n", - " encoded = tokenizer.encode(text)\n", - " return torch.tensor(encoded).unsqueeze(0)\n", - "\n", - "def token_ids_to_text(token_ids, tokenizer):\n", - " flat = token_ids.squeeze(0)\n", - " return tokenizer.decode(flat.tolist())\n", - "\n", - "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n", - " fig, ax1 = plt.subplots(figsize=(5, 3))\n", - " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n", - " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n", - " ax1.set_xlabel(\"Epochs\")\n", - " ax1.set_ylabel(\"Loss\")\n", - " ax1.legend(loc=\"upper right\")\n", - " ax2 = ax1.twiny()\n", - " ax2.plot(tokens_seen, train_losses, alpha=0)\n", - " ax2.set_xlabel(\"Tokens seen\")\n", - " fig.tight_layout()\n", - " plt.show()\n", - "\n", - "#####################################\n", - "# Model Training\n", - 
"#####################################\n", - "def train_model(model, train_loader, val_loader, device, epochs=30, eval_iter=20, lr=1e-4):\n", - " optimizer = optim.Adam(model.parameters(), lr=lr)\n", - " model.to(device)\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " epoch_loss = 0.0\n", - " start_time = time.time()\n", - "\n", - " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", - " optimizer.zero_grad()\n", - " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", - " loss.backward()\n", - " optimizer.step()\n", - " epoch_loss += loss.item()\n", - " if (batch_idx + 1) % 10 == 0:\n", - " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", - "\n", - " avg_loss = epoch_loss / len(train_loader)\n", - " elapsed = time.time() - start_time\n", - " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. loss: {avg_loss:.4f}\")\n", - " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", - " print(f\"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", - "\n", - "#####################################\n", - "# Text Generation\n", - "#####################################\n", - "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", - " print(\"Generated text sample:\\n\")\n", - " generate_and_print_sample(model, tokenizer, device, prompt)\n", - "\n", - "#####################################\n", - "# Main Function\n", - "#####################################\n", - "def main():\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " print(f\"Using device: {device}\")\n", - "\n", - " # Load Wikitext data from Hugging Face and use the first 50k lines\n", - " text = load_wikitext_data(num_lines=50000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\")\n", - "\n", - " # Preprocess the text data\n", - " text = preprocess_text(text)\n", - "\n", - " # Split data into training and validation sets (e.g., 90% train, 10% validation)\n", - " split_idx = int(0.9 * len(text))\n", - " train_text = text[:split_idx]\n", - " val_text = text[split_idx:]\n", - " train_loader = create_dataloader_v1(train_text, batch_size=8, max_length=256, stride=128)\n", - " val_loader = create_dataloader_v1(val_text, batch_size=8, max_length=256, stride=128)\n", - "\n", - " # Tokenizer and advanced model configuration\n", - " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", - " cfg = {\n", - " \"vocab_size\": tokenizer.n_vocab,\n", - " \"emb_dim\": 256,\n", - " \"context_length\": 256,\n", - " \"drop_rate\": 0.1,\n", - " \"n_layers\": 6,\n", - " \"n_heads\": 8,\n", - " \"qkv_bias\": True,\n", - " \"use_rope\": True,\n", - " }\n", - "\n", - " model = GPTModel(cfg)\n", - " train_model(model, train_loader, val_loader, device, epochs=1, eval_iter=25, lr=1e-5)\n", - "\n", - " # Generate text after training with a given prompt\n", - " prompt = \"Valkyria Chronicles III \"\n", - " generate_sample(model, tokenizer, device, prompt, max_new_tokens=100)\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()\n" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T20:33:49.198352Z", - "iopub.execute_input": "2025-02-12T20:33:49.198767Z", - "iopub.status.idle": "2025-02-12T20:38:21.836553Z", - "shell.execute_reply.started": "2025-02-12T20:33:49.198738Z", - "shell.execute_reply": "2025-02-12T20:38:21.83572Z" - }, - "id": "2E8o3nAXw_I3" - }, - "outputs": [], - 
"execution_count": null + "id": "ML-KCXoIw_I0" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import time\n", + "import torch\n", + "import torch.optim as optim\n", + "\n", + "#####################################\n", + "# Settings and data preparation\n", + "#####################################\n", + "\n", + "def load_data():\n", + " # Small sample text for demonstration. Use a larger corpus in real applications.\n", + " text = (\"Once upon a time, in a land far, far away, there was a kingdom where magic was common \"\n", + " \"and adventure awaited around every corner. \") * 100 # metni tekrarlayarak uzunluyoruz\n", + " return text\n", + "\n", + "def prepare_dataloaders(text, batch_size=4, max_length=128, stride=64):\n", + " # Split the data for training and validation (e.g., 90% training, 10% validation)\n", + " split_idx = int(0.9 * len(text))\n", + " train_text = text[:split_idx]\n", + " val_text = text[split_idx:]\n", + " train_loader = create_dataloader_v1(train_text, batch_size=batch_size,\n", + " max_length=max_length, stride=stride)\n", + " val_loader = create_dataloader_v1(val_text, batch_size=batch_size,\n", + " max_length=max_length, stride=stride)\n", + " return train_loader, val_loader\n", + "\n", + "#####################################\n", + "# Model Trainingi\n", + "#####################################\n", + "\n", + "def train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10):\n", + " optimizer = optim.Adam(model.parameters(), lr=3e-4)\n", + " model.to(device)\n", + "\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " epoch_loss = 0.0\n", + " start_time = time.time()\n", + "\n", + " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " loss.backward()\n", + " optimizer.step()\n", + " epoch_loss += loss.item()\n", + "\n", + " if (batch_idx + 1) % 10 == 0:\n", + " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", + "\n", + " avg_loss = epoch_loss / len(train_loader)\n", + " elapsed = time.time() - start_time\n", + " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. 
+ "\n",
+ "    # Model construction\n",
+ "    model = GPTModel(cfg)\n",
+ "\n",
+ "    # Model training\n",
+ "    train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10)\n",
+ "\n",
+ "    # After training, generate text with an initial prompt\n",
+ "    prompt = \"Once upon a time \"\n",
+ "    generate_sample(model, tokenizer, device, prompt, max_new_tokens=50)\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ "    main()\n"
+ ],
+ "metadata": {
+ "trusted": true,
+ "execution": {
+ "iopub.status.busy": "2025-02-12T18:01:02.043298Z",
+ "iopub.execute_input": "2025-02-12T18:01:02.043586Z",
+ "iopub.status.idle": "2025-02-12T18:01:04.598147Z",
+ "shell.execute_reply.started": "2025-02-12T18:01:02.043567Z",
+ "shell.execute_reply": "2025-02-12T18:01:04.597344Z"
+ },
- {
- "cell_type": "code",
- "source": [
- "import torch\n",
- "import torch.nn as nn\n",
- "import torch.optim as optim\n",
- "import math\n",
- "import time\n",
- "\n",
- "#############################################\n",
- "# 1. Alternatif Normalizasyon: RMSNorm\n",
- "#############################################\n",
- "class RMSNorm(nn.Module):\n",
- "    def __init__(self, emb_dim, eps=1e-8):\n",
- "        super().__init__()\n",
- "        self.eps = eps\n",
- "        self.scale = nn.Parameter(torch.ones(emb_dim))\n",
- "    def forward(self, x):\n",
- "        # x shape: (..., emb_dim)\n",
- "        norm_x = x / torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)\n",
- "        return self.scale * norm_x\n",
- "\n",
- "def get_norm(norm_type, emb_dim):\n",
- "    if norm_type == 'layernorm':\n",
- "        return nn.LayerNorm(emb_dim)\n",
- "    elif norm_type == 'rmsnorm':\n",
- "        return RMSNorm(emb_dim)\n",
- "    else:\n",
- "        raise ValueError(\"Unknown normalization type\")\n",
- "\n",
- "#############################################\n",
- "# 2. 
Ortak Konfigürasyon\n", - "#############################################\n", - "class Config:\n", - " def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,\n", - " dropout=0.1, norm_type='layernorm'):\n", - " self.vocab_size = vocab_size\n", - " self.emb_dim = emb_dim\n", - " self.max_length = max_length\n", - " self.n_layers = n_layers\n", - " self.n_heads = n_heads\n", - " self.dropout = dropout\n", - " self.norm_type = norm_type # 'layernorm' veya 'rmsnorm'\n", - " # Advanced varyantlar için ek parametreler:\n", - " self.latent_dim = emb_dim // 2 # RoPE ve latent projeksiyon için\n", - " self.num_experts = 4 # MoE FFN’de kullanılacak uzman sayısı\n", - "\n", - "#############################################\n", - "# --- Attention Modülleri ---\n", - "#############################################\n", - "# 1. Standard Dot-Product Attention\n", - "class StandardAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 2. 
RoPE Attention\n", - "def apply_rope(x, base=10000):\n", - " # x: (batch, n_heads, seq_len, head_dim)\n", - " batch, n_heads, seq_len, head_dim = x.shape\n", - " inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", - " pos = torch.arange(seq_len, device=x.device).float()\n", - " sinusoid_inp = torch.einsum(\"i,j->ij\", pos, inv_freq) # (seq_len, head_dim/2)\n", - " sin = torch.sin(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", - " cos = torch.cos(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", - " x1 = x[..., :head_dim//2]\n", - " x2 = x[..., head_dim//2:]\n", - " return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", - "\n", - "class RoPEAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " Q = apply_rope(Q)\n", - " K = apply_rope(K)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 3. FlashAttention benzeri Attention (placeholder)\n", - "def flash_attention(Q, K, V):\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))\n", - " attn = torch.softmax(scores, dim=-1)\n", - " return torch.matmul(attn, V)\n", - "\n", - "class FlashAttentionModule(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " context = flash_attention(Q, K, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 4. 
Multi-Query Attention: Keys & Values tek projeksiyon\n", - "class MultiQueryAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout):\n", - " super().__init__()\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, self.head_dim)\n", - " self.v_proj = nn.Linear(emb_dim, self.head_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", - " V = self.v_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "# 5. ALiBi Attention: Lineer bias ekleyerek göreceli pozisyon bilgisini entegre eder (Press et al., 2021)\n", - "class ALiBiAttention(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, dropout, alibi_scaling=-1.0):\n", - " super().__init__()\n", - " assert emb_dim % n_heads == 0, \"Embedding boyutu, baş sayısına tam bölünmeli.\"\n", - " self.n_heads = n_heads\n", - " self.head_dim = emb_dim // n_heads\n", - " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - " self.alibi_scaling = alibi_scaling\n", - "\n", - " def forward(self, x):\n", - " batch, seq_len, emb_dim = x.size()\n", - " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", - " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", - " # ALiBi bias: B[i,j] = (j - i) * scale\n", - " bias = torch.arange(seq_len, device=x.device).unsqueeze(0) - torch.arange(seq_len, device=x.device).unsqueeze(1)\n", - " bias = self.alibi_scaling * bias.float()\n", - " scores = scores + bias.unsqueeze(0).unsqueeze(0)\n", - " attn = torch.softmax(scores, dim=-1)\n", - " attn = self.dropout(attn)\n", - " context = torch.matmul(attn, V)\n", - " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", - " return self.out_proj(context)\n", - "\n", - "#############################################\n", - "# --- FFN Varyantları ---\n", - "#############################################\n", - "# 1. Standart FFN\n", - "class StandardFFN(nn.Module):\n", - " def __init__(self, emb_dim, expansion=4, dropout=0.1):\n", - " super().__init__()\n", - " self.net = nn.Sequential(\n", - " nn.Linear(emb_dim, expansion * emb_dim),\n", - " nn.GELU(),\n", - " nn.Dropout(dropout),\n", - " nn.Linear(expansion * emb_dim, emb_dim)\n", - " )\n", - " def forward(self, x):\n", - " return self.net(x)\n", - "\n", - "# 2. 
MoE FFN\n", - "class MoEFFN(nn.Module):\n", - " def __init__(self, emb_dim, num_experts, expansion=4, dropout=0.1):\n", - " super().__init__()\n", - " self.num_experts = num_experts\n", - " self.experts = nn.ModuleList([\n", - " nn.Sequential(\n", - " nn.Linear(emb_dim, expansion * emb_dim),\n", - " nn.GELU(),\n", - " nn.Dropout(dropout),\n", - " nn.Linear(expansion * emb_dim, emb_dim)\n", - " ) for _ in range(num_experts)\n", - " ])\n", - " self.gate = nn.Linear(emb_dim, num_experts)\n", - "\n", - " def forward(self, x):\n", - " gate_scores = torch.softmax(self.gate(x), dim=-1) # (batch, seq_len, num_experts)\n", - " expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1) # (batch, seq_len, emb_dim, num_experts)\n", - " gate_scores = gate_scores.unsqueeze(2) # (batch, seq_len, 1, num_experts)\n", - " return (expert_outputs * gate_scores).sum(dim=-1)\n", - "\n", - "#############################################\n", - "# --- Transformer Bloğu: Seçilebilir Attention ve FFN varyantları, Dropout, Pre-Norm ---\n", - "#############################################\n", - "class TransformerBlock(nn.Module):\n", - " def __init__(self, emb_dim, n_heads, attn_module, ffn_module, dropout, norm_type):\n", - " super().__init__()\n", - " self.norm1 = get_norm(norm_type, emb_dim)\n", - " self.attn = attn_module(emb_dim, n_heads, dropout)\n", - " self.norm2 = get_norm(norm_type, emb_dim)\n", - " self.ffn = ffn_module(emb_dim, dropout=dropout) # ffn_module: StandardFFN or MoEFFN (for MoE, lambda is used)\n", - "\n", - " def forward(self, x):\n", - " x = x + self.attn(self.norm1(x))\n", - " x = x + self.ffn(self.norm2(x))\n", - " return x\n", - "\n", - "#############################################\n", - "# --- Transformer Modeli: Farklı varyantların seçilebildiği yapı ---\n", - "#############################################\n", - "class TransformerModel(nn.Module):\n", - " def __init__(self, config, attn_variant='standard', ffn_variant='standard'):\n", - " super().__init__()\n", - " self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)\n", - " self.pos_embed = nn.Embedding(config.max_length, config.emb_dim)\n", - "\n", - " attn_dict = {\n", - " 'standard': StandardAttention,\n", - " 'rope': RoPEAttention,\n", - " 'flash': FlashAttentionModule,\n", - " 'multiquery': MultiQueryAttention,\n", - " 'alibi': ALiBiAttention\n", - " }\n", - " ffn_dict = {\n", - " 'standard': StandardFFN,\n", - " 'moe': lambda emb_dim, dropout: MoEFFN(emb_dim, config.num_experts, dropout=dropout)\n", - " }\n", - " self.layers = nn.ModuleList([\n", - " TransformerBlock(config.emb_dim, config.n_heads, attn_dict[attn_variant], ffn_dict[ffn_variant], config.dropout, config.norm_type)\n", - " for _ in range(config.n_layers)\n", - " ])\n", - " self.norm = get_norm(config.norm_type, config.emb_dim)\n", - " self.output_proj = nn.Linear(config.emb_dim, config.vocab_size, bias=False)\n", - "\n", - " def forward(self, x):\n", - " seq_len = x.size(1)\n", - " x = self.token_embed(x) + self.pos_embed(torch.arange(seq_len, device=x.device))\n", - " for layer in self.layers:\n", - " x = layer(x)\n", - " x = self.norm(x)\n", - " return self.output_proj(x)\n", - "\n", - "#############################################\n", - "# --- Ek: Model Özeti ve Parametre Sayısı Fonksiyonu ---\n", - "#############################################\n", - "def model_summary(model):\n", - " total_params = sum(p.numel() for p in model.parameters())\n", - " trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - " 
print(f\"Toplam Parametre: {total_params:,}\")\n", - " print(f\"Eğitilebilir Parametre: {trainable:,}\")\n", - "\n", - "#############################################\n", - "# --- Ek: Greedy Decoding Fonksiyonu ---\n", - "#############################################\n", - "def greedy_decode(model, start_token, max_length, device):\n", - " model.eval()\n", - " generated = [start_token]\n", - " input_seq = torch.tensor([generated], device=device)\n", - " with torch.no_grad():\n", - " for _ in range(max_length - 1):\n", - " logits = model(input_seq) # (batch, seq_len, vocab_size)\n", - " next_token = torch.argmax(logits[0, -1, :]).item()\n", - " generated.append(next_token)\n", - " input_seq = torch.tensor([generated], device=device)\n", - " model.train()\n", - " return generated\n", - "\n", - "#############################################\n", - "# --- Ek: Basit Eğitim Döngüsü (Training Loop) ---\n", - "#############################################\n", - "def train_model(model, config, epochs=3):\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " model.to(device)\n", - " optimizer = optim.AdamW(model.parameters(), lr=1e-4)\n", - " loss_fn = nn.CrossEntropyLoss()\n", - " # Dummy dataset: rastgele token dizileri\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " dummy_input = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", - " dummy_target = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", - " optimizer.zero_grad()\n", - " logits = model(dummy_input) # (batch, seq_len, vocab_size)\n", - " loss = loss_fn(logits.view(-1, config.vocab_size), dummy_target.view(-1))\n", - " loss.backward()\n", - " optimizer.step()\n", - " print(f\"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}\")\n", - "\n", - "#############################################\n", - "# --- Detaylı Test Fonksiyonları (Önceki Versiyonun Geliştirilmiş Hali) ---\n", - "#############################################\n", - "def run_detailed_tests(config, variant_list):\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " for variant in variant_list:\n", - " attn_var = variant['attn']\n", - " ffn_var = variant['ffn']\n", - " print(f\"\\nTest: Attention = {attn_var}, FFN = {ffn_var}\")\n", - " model = TransformerModel(config, attn_variant=attn_var, ffn_variant=ffn_var).to(device)\n", - " model_summary(model)\n", - " model.train()\n", - " dummy_input = torch.randint(0, config.vocab_size, (4, config.max_length), device=device)\n", - " logits = model(dummy_input)\n", - " loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size),\n", - " torch.randint(0, config.vocab_size, (4 * config.max_length,), device=device))\n", - " loss.backward()\n", - " print(f\"Loss: {loss.item():.4f}, Output shape: {logits.shape}\")\n", - "\n", - " torch.cuda.synchronize() if device.type == 'cuda' else None\n", - " start_time = time.time()\n", - " for _ in range(10):\n", - " _ = model(dummy_input)\n", - " torch.cuda.synchronize() if device.type == 'cuda' else None\n", - " avg_time = (time.time() - start_time) / 10.0\n", - " print(f\"Ortalama ileri geçiş süresi: {avg_time:.6f} sn\")\n", - "\n", - " # Greedy decoding test (ilk 10 token üretiliyor)\n", - " start_token = dummy_input[0, 0].item()\n", - " generated = greedy_decode(model, start_token, max_length=10, device=device)\n", - " print(f\"Greedy Decode Çıktısı: {generated}\")\n", - "\n", - "#############################################\n", - "# --- Ana Çalışma 
Bölümü: Farklı varyantları deneyelim ---\n", - "#############################################\n", - "if __name__ == \"__main__\":\n", - " # Konfigürasyona norm tipi ve dropout eklenmiştir.\n", - " config = Config(vocab_size=30522, emb_dim=768, max_length=128, n_layers=4, n_heads=12, dropout=0.1, norm_type='rmsnorm')\n", - "\n", - " # Denenecek varyantlar: farklı attention ve FFN varyantları\n", - " variant_list = [\n", - " {'attn': 'standard', 'ffn': 'standard'},\n", - " {'attn': 'rope', 'ffn': 'standard'},\n", - " {'attn': 'flash', 'ffn': 'standard'},\n", - " {'attn': 'multiquery', 'ffn': 'standard'},\n", - " {'attn': 'alibi', 'ffn': 'standard'},\n", - " {'attn': 'standard', 'ffn': 'moe'},\n", - " {'attn': 'rope', 'ffn': 'moe'},\n", - " {'attn': 'flash', 'ffn': 'moe'},\n", - " {'attn': 'multiquery', 'ffn': 'moe'},\n", - " {'attn': 'alibi', 'ffn': 'moe'},\n", - " ]\n", - "\n", - " print(\"=== Detaylı Varyant Testleri ===\")\n", - " run_detailed_tests(config, variant_list)\n", - "\n", - " print(\"\\n=== Eğitim Döngüsü Testi ===\")\n", - " # Bir varyant seçelim (örneğin, gelişmiş varyant: RoPE + MoE FFN)\n", - " model = TransformerModel(config, attn_variant='rope', ffn_variant='moe').to(torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"))\n", - " train_model(model, config, epochs=3)\n", - "\n", - " print(\"\\n=== Greedy Decoding Testi ===\")\n", - " # Greedy decoding örneği: İlk tokenı dummy inputtan alıp 20 token üretelim\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " prompt_token = torch.randint(0, config.vocab_size, (1,)).item()\n", - " generated_tokens = greedy_decode(model, prompt_token, max_length=20, device=device)\n", - " print(\"Üretilen Tokenlar:\", generated_tokens)" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:04:05.352069Z", - "iopub.execute_input": "2025-02-12T22:04:05.352443Z", - "iopub.status.idle": "2025-02-12T22:06:42.132452Z", - "shell.execute_reply.started": "2025-02-12T22:04:05.352413Z", - "shell.execute_reply": "2025-02-12T22:06:42.131334Z" - }, - "id": "4Q_jYSvEw_I3" - }, - "outputs": [], - "execution_count": null + "id": "DEJyddCuw_I3" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import time\n", + "import math\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import matplotlib.pyplot as plt\n", + "import tiktoken\n", + "from datasets import load_dataset # Hugging Face datasets library\n", + "import re\n", + "\n", + "#####################################\n", + "# Rotary Positional Embeddings (ROPE) Implementation\n", + "#####################################\n", + "def apply_rotary_pos_emb(x):\n", + " \"\"\"\n", + " Apply Rotary Positional Embeddings (ROPE) to the input tensor.\n", + "\n", + " Args:\n", + " x (torch.Tensor): Input tensor of shape (batch, num_heads, seq_len, head_dim).\n", + "\n", + " Returns:\n", + " torch.Tensor: Tensor with ROPE applied.\n", + " \"\"\"\n", + " batch, n_heads, seq_len, head_dim = x.shape\n", + " assert head_dim % 2 == 0, \"head_dim must be even for ROPE\"\n", + "\n", + " # Calculate inverse frequencies and positions\n", + " inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", + " positions = torch.arange(seq_len, device=x.device).float()\n", + " sinusoid_inp = torch.einsum(\"i,j->ij\", positions, inv_freq) # (seq_len, head_dim/2)\n", + " 
sin = torch.sin(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", + " cos = torch.cos(sinusoid_inp)[None, None, :, :] # (1, 1, seq_len, head_dim/2)\n", + "\n", + " # Split the input tensor into two halves and apply ROPE\n", + " x1, x2 = x[..., :head_dim//2], x[..., head_dim//2:]\n", + " x_rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", + " return x_rotated\n", + "\n", + "#####################################\n", + "# Dataset and DataLoader: Wikitext (Hugging Face)\n", + "#####################################\n", + "class GPTDatasetV1(Dataset):\n", + " def __init__(self, text, tokenizer, max_length, stride):\n", + " self.input_ids = []\n", + " self.target_ids = []\n", + "\n", + " # Tokenize the text\n", + " token_ids = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", + "\n", + " # Create input-target pairs\n", + " for i in range(0, len(token_ids) - max_length, stride):\n", + " input_chunk = token_ids[i:i + max_length]\n", + " target_chunk = token_ids[i + 1: i + max_length + 1]\n", + " self.input_ids.append(torch.tensor(input_chunk))\n", + " self.target_ids.append(torch.tensor(target_chunk))\n", + "\n", + " def __len__(self):\n", + " return len(self.input_ids)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.input_ids[idx], self.target_ids[idx]\n", + "\n", + "def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n", + " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + " dataset = GPTDatasetV1(text, tokenizer, max_length, stride)\n", + " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", + " return dataloader\n", + "\n", + "def load_wikitext_data(num_lines=10000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\"):\n", + " \"\"\"\n", + " Load Wikitext data from Hugging Face and concatenate the first `num_lines` lines into a single text.\n", + "\n", + " Args:\n", + " num_lines (int): Number of lines to load.\n", + " dataset_name (str): Name of the dataset.\n", + " subset (str): Subset of the dataset.\n", + "\n", + " Returns:\n", + " str: Concatenated text.\n", + " \"\"\"\n", + " ds = load_dataset(dataset_name, subset)\n", + " text_lines = ds[\"train\"][\"text\"][:num_lines]\n", + " text = \"\\n\".join(text_lines)\n", + " return text\n", + "\n", + "def preprocess_text(text):\n", + " \"\"\"\n", + " Preprocess the text data by removing unwanted characters and normalizing whitespace.\n", + "\n", + " Args:\n", + " text (str): Input text.\n", + "\n", + " Returns:\n", + " str: Preprocessed text.\n", + " \"\"\"\n", + " # Remove special characters and digits, and normalize whitespace\n", + " text = re.sub(r'[^A-Za-z\\s]', '', text)\n", + " text = re.sub(r'\\s+', ' ', text).strip()\n", + " return text\n", + "\n", + "#####################################\n", + "# Advanced Model Components (GPTModel)\n", + "#####################################\n", + "\n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, use_rope=False):\n", + " super().__init__()\n", + " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n", + " self.d_out = d_out\n", + " self.num_heads = num_heads\n", + " self.head_dim = d_out // num_heads\n", + "\n", + " self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)\n", + " self.W_value = nn.Linear(d_in, d_out, 
bias=qkv_bias)\n", + " self.out_proj = nn.Linear(d_out, d_out)\n", + " self.dropout = nn.Dropout(dropout)\n", + " self.use_rope = use_rope\n", + " self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))\n", + "\n", + " def forward(self, x):\n", + " b, num_tokens, _ = x.shape\n", + "\n", + " keys = self.W_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", + " queries = self.W_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", + " values = self.W_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)\n", + "\n", + " if self.use_rope:\n", + " queries = apply_rotary_pos_emb(queries)\n", + " keys = apply_rotary_pos_emb(keys)\n", + "\n", + " attn_scores = queries @ keys.transpose(2, 3)\n", + " mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n", + " attn_scores.masked_fill_(mask_bool, float(\"-inf\"))\n", + "\n", + " attn_weights = torch.softmax(attn_scores / math.sqrt(self.head_dim), dim=-1)\n", + " attn_weights = self.dropout(attn_weights)\n", + "\n", + " context_vec = (attn_weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)\n", + " context_vec = self.out_proj(context_vec)\n", + " return context_vec\n", + "\n", + "class LayerNorm(nn.Module):\n", + " def __init__(self, emb_dim):\n", + " super().__init__()\n", + " self.eps = 1e-5\n", + " self.scale = nn.Parameter(torch.ones(emb_dim))\n", + " self.shift = nn.Parameter(torch.zeros(emb_dim))\n", + "\n", + " def forward(self, x):\n", + " mean = x.mean(dim=-1, keepdim=True)\n", + " var = x.var(dim=-1, keepdim=True, unbiased=False)\n", + " norm_x = (x - mean) / torch.sqrt(var + self.eps)\n", + " return self.scale * norm_x + self.shift\n", + "\n", + "class GELU(nn.Module):\n", + " def forward(self, x):\n", + " return 0.5 * x * (1 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))\n", + "\n", + "class FeedForward(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.layers = nn.Sequential(\n", + " nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n", + " GELU(),\n", + " nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.layers(x)\n", + "\n", + "class TransformerBlock(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.att = MultiHeadAttention(\n", + " d_in=cfg[\"emb_dim\"],\n", + " d_out=cfg[\"emb_dim\"],\n", + " context_length=cfg[\"context_length\"],\n", + " num_heads=cfg[\"n_heads\"],\n", + " dropout=cfg[\"drop_rate\"],\n", + " qkv_bias=cfg[\"qkv_bias\"],\n", + " use_rope=cfg.get(\"use_rope\", False)\n", + " )\n", + " self.ff = FeedForward(cfg)\n", + " self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n", + " self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " def forward(self, x):\n", + " shortcut = x\n", + " x = self.norm1(x)\n", + " x = self.att(x)\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut\n", + "\n", + " shortcut = x\n", + " x = self.norm2(x)\n", + " x = self.ff(x)\n", + " x = self.drop_shortcut(x)\n", + " x = x + shortcut\n", + " return x\n", + "\n", + "class GPTModel(nn.Module):\n", + " def __init__(self, cfg):\n", + " super().__init__()\n", + " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n", + " self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n", + " self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n", + "\n", + " self.trf_blocks = 
nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n",
+ "\n",
+ " self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
+ " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n",
+ "\n",
+ " def forward(self, in_idx):\n",
+ " batch_size, seq_len = in_idx.shape\n",
+ " tok_embeds = self.tok_emb(in_idx)\n",
+ " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
+ " x = tok_embeds + pos_embeds\n",
+ " x = self.drop_emb(x)\n",
+ " x = self.trf_blocks(x)\n",
+ " x = self.final_norm(x)\n",
+ " logits = self.out_head(x)\n",
+ " return logits\n",
+ "\n",
+ "#####################################\n",
+ "# Training and Evaluation Functions\n",
+ "#####################################\n",
+ "def calc_loss_batch(input_batch, target_batch, model, device):\n",
+ " input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n",
+ " logits = model(input_batch)\n",
+ " loss = nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n",
+ " return loss\n",
+ "\n",
+ "def calc_loss_loader(data_loader, model, device, num_batches=None):\n",
+ " total_loss = 0.0\n",
+ " if len(data_loader) == 0:\n",
+ " return float(\"nan\")\n",
+ " num_batches = num_batches if num_batches is not None else len(data_loader)\n",
+ " num_batches = min(num_batches, len(data_loader))\n",
+ " for i, (input_batch, target_batch) in enumerate(data_loader):\n",
+ " if i < num_batches:\n",
+ " loss = calc_loss_batch(input_batch, target_batch, model, device)\n",
+ " total_loss += loss.item()\n",
+ " else:\n",
+ " break\n",
+ " return total_loss / num_batches\n",
+ "\n",
+ "def evaluate_model(model, train_loader, val_loader, device, eval_iter):\n",
+ " model.eval()\n",
+ " with torch.no_grad():\n",
+ " train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)\n",
+ " val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)\n",
+ " model.train()\n",
+ " return train_loss, val_loss\n",
+ "\n",
+ "def generate_text_simple(model, idx, max_new_tokens, context_size):\n",
+ " for _ in range(max_new_tokens):\n",
+ " idx_cond = idx[:, -context_size:]\n",
+ " with torch.no_grad():\n",
+ " logits = model(idx_cond)\n",
+ " logits = logits[:, -1, :]\n",
+ " idx_next = torch.argmax(logits, dim=-1, keepdim=True)\n",
+ " idx = torch.cat((idx, idx_next), dim=1)\n",
+ " return idx\n",
+ "\n",
+ "def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):\n",
+ " model.eval()\n",
+ " context_size = model.pos_emb.weight.shape[0]\n",
+ " encoded = text_to_token_ids(start_context, tokenizer).to(device)\n",
+ " with torch.no_grad():\n",
+ " token_ids = generate_text_simple(model=model, idx=encoded, max_new_tokens=max_new_tokens, context_size=context_size)\n",
+ " decoded_text = token_ids_to_text(token_ids, tokenizer)\n",
+ " print(decoded_text.replace(\"\\n\", \" \"))\n",
+ " model.train()\n",
+ "\n",
+ "def text_to_token_ids(text, tokenizer):\n",
+ " encoded = tokenizer.encode(text)\n",
+ " return torch.tensor(encoded).unsqueeze(0)\n",
+ "\n",
+ "def token_ids_to_text(token_ids, tokenizer):\n",
+ " flat = token_ids.squeeze(0)\n",
+ " return tokenizer.decode(flat.tolist())\n",
+ "\n",
+ "def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):\n",
+ " fig, ax1 = plt.subplots(figsize=(5, 3))\n",
+ " ax1.plot(epochs_seen, train_losses, label=\"Training loss\")\n",
+ " ax1.plot(epochs_seen, val_losses, linestyle=\"-.\", label=\"Validation loss\")\n",
+ " ax1.set_xlabel(\"Epochs\")\n",
+ " 
ax1.set_ylabel(\"Loss\")\n", + " ax1.legend(loc=\"upper right\")\n", + " ax2 = ax1.twiny()\n", + " ax2.plot(tokens_seen, train_losses, alpha=0)\n", + " ax2.set_xlabel(\"Tokens seen\")\n", + " fig.tight_layout()\n", + " plt.show()\n", + "\n", + "#####################################\n", + "# Model Training\n", + "#####################################\n", + "def train_model(model, train_loader, val_loader, device, epochs=30, eval_iter=20, lr=1e-4):\n", + " optimizer = optim.Adam(model.parameters(), lr=lr)\n", + " model.to(device)\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " epoch_loss = 0.0\n", + " start_time = time.time()\n", + "\n", + " for batch_idx, (input_batch, target_batch) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " loss = calc_loss_batch(input_batch, target_batch, model, device)\n", + " loss.backward()\n", + " optimizer.step()\n", + " epoch_loss += loss.item()\n", + " if (batch_idx + 1) % 10 == 0:\n", + " print(f\"Epoch {epoch+1} Batch {batch_idx+1}, Loss: {loss.item():.4f}\")\n", + "\n", + " avg_loss = epoch_loss / len(train_loader)\n", + " elapsed = time.time() - start_time\n", + " print(f\"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. loss: {avg_loss:.4f}\")\n", + " train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)\n", + " print(f\"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\\n\")\n", + "\n", + "#####################################\n", + "# Text Generation\n", + "#####################################\n", + "def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):\n", + " print(\"Generated text sample:\\n\")\n", + " generate_and_print_sample(model, tokenizer, device, prompt)\n", + "\n", + "#####################################\n", + "# Main Function\n", + "#####################################\n", + "def main():\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " print(f\"Using device: {device}\")\n", + "\n", + " # Load Wikitext data from Hugging Face and use the first 50k lines\n", + " text = load_wikitext_data(num_lines=50000, dataset_name=\"wikitext\", subset=\"wikitext-103-raw-v1\")\n", + "\n", + " # Preprocess the text data\n", + " text = preprocess_text(text)\n", + "\n", + " # Split data into training and validation sets (e.g., 90% train, 10% validation)\n", + " split_idx = int(0.9 * len(text))\n", + " train_text = text[:split_idx]\n", + " val_text = text[split_idx:]\n", + " train_loader = create_dataloader_v1(train_text, batch_size=8, max_length=256, stride=128)\n", + " val_loader = create_dataloader_v1(val_text, batch_size=8, max_length=256, stride=128)\n", + "\n", + " # Tokenizer and advanced model configuration\n", + " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + " cfg = {\n", + " \"vocab_size\": tokenizer.n_vocab,\n", + " \"emb_dim\": 256,\n", + " \"context_length\": 256,\n", + " \"drop_rate\": 0.1,\n", + " \"n_layers\": 6,\n", + " \"n_heads\": 8,\n", + " \"qkv_bias\": True,\n", + " \"use_rope\": True,\n", + " }\n", + "\n", + " model = GPTModel(cfg)\n", + " train_model(model, train_loader, val_loader, device, epochs=1, eval_iter=25, lr=1e-5)\n", + "\n", + " # Generate text after training with a given prompt\n", + " prompt = \"Valkyria Chronicles III \"\n", + " generate_sample(model, tokenizer, device, prompt, max_new_tokens=100)\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n" + ], + "metadata": { + "trusted": true, + "execution": { + "iopub.status.busy": 
"2025-02-12T20:33:49.198352Z", + "iopub.execute_input": "2025-02-12T20:33:49.198767Z", + "iopub.status.idle": "2025-02-12T20:38:21.836553Z", + "shell.execute_reply.started": "2025-02-12T20:33:49.198738Z", + "shell.execute_reply": "2025-02-12T20:38:21.83572Z" }, - { - "cell_type": "code", - "source": [ - "#!pip install evaluate reportlab" - ], - "metadata": { - "trusted": true, - "execution": { - "iopub.status.busy": "2025-02-12T22:06:55.854229Z", - "iopub.execute_input": "2025-02-12T22:06:55.854996Z", - "iopub.status.idle": "2025-02-12T22:06:55.861061Z", - "shell.execute_reply.started": "2025-02-12T22:06:55.854884Z", - "shell.execute_reply": "2025-02-12T22:06:55.859524Z" - }, - "id": "6ZXigQy4w_I4" - }, - "outputs": [], - "execution_count": null + "id": "2E8o3nAXw_I3" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import math\n", + "import time\n", + "\n", + "#############################################\n", + "# 1. Alternatif Normalizasyon: RMSNorm\n", + "#############################################\n", + "class RMSNorm(nn.Module):\n", + " def __init__(self, emb_dim, eps=1e-8):\n", + " super().__init__()\n", + " self.eps = eps\n", + " self.scale = nn.Parameter(torch.ones(emb_dim))\n", + " def forward(self, x):\n", + " # x shape: (..., emb_dim)\n", + " norm_x = x / torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)\n", + " return self.scale * norm_x\n", + "\n", + "def get_norm(norm_type, emb_dim):\n", + " if norm_type == 'layernorm':\n", + " return nn.LayerNorm(emb_dim)\n", + " elif norm_type == 'rmsnorm':\n", + " return RMSNorm(emb_dim)\n", + " else:\n", + " raise ValueError(\"Unknown normalization type\")\n", + "\n", + "#############################################\n", + "# 2. Shared configuration\n", + "#############################################\n", + "class Config:\n", + " def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,\n", + " dropout=0.1, norm_type='layernorm'):\n", + " self.vocab_size = vocab_size\n", + " self.emb_dim = emb_dim\n", + " self.max_length = max_length\n", + " self.n_layers = n_layers\n", + " self.n_heads = n_heads\n", + " self.dropout = dropout\n", + " self.norm_type = norm_type # 'layernorm' veya 'rmsnorm'\n", + " # Additional parameters for advanced variants:\n", + " self.latent_dim = emb_dim // 2 # For RoPE and latent projection\n", + " self.num_experts = 4 # Number of experts used in the MoE FFN\n", + "\n", + "#############################################\n", + "# --- Attention modules ---\n", + "#############################################\n", + "# 1. 
Standard Dot-Product Attention\n", + "class StandardAttention(nn.Module):\n", + " def __init__(self, emb_dim, n_heads, dropout):\n", + " super().__init__()\n", + " assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n", + " self.n_heads = n_heads\n", + " self.head_dim = emb_dim // n_heads\n", + " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x):\n", + " batch, seq_len, emb_dim = x.size()\n", + " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", + " attn = torch.softmax(scores, dim=-1)\n", + " attn = self.dropout(attn)\n", + " context = torch.matmul(attn, V)\n", + " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", + " return self.out_proj(context)\n", + "\n", + "# 2. RoPE Attention\n", + "def apply_rope(x, base=10000):\n", + " # x: (batch, n_heads, seq_len, head_dim)\n", + " batch, n_heads, seq_len, head_dim = x.shape\n", + " inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))\n", + " pos = torch.arange(seq_len, device=x.device).float()\n", + " sinusoid_inp = torch.einsum(\"i,j->ij\", pos, inv_freq) # (seq_len, head_dim/2)\n", + " sin = torch.sin(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", + " cos = torch.cos(sinusoid_inp).unsqueeze(0).unsqueeze(0)\n", + " x1 = x[..., :head_dim//2]\n", + " x2 = x[..., head_dim//2:]\n", + " return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)\n", + "\n", + "class RoPEAttention(nn.Module):\n", + " def __init__(self, emb_dim, n_heads, dropout):\n", + " super().__init__()\n", + " assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n", + " self.n_heads = n_heads\n", + " self.head_dim = emb_dim // n_heads\n", + " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x):\n", + " batch, seq_len, emb_dim = x.size()\n", + " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " Q = apply_rope(Q)\n", + " K = apply_rope(K)\n", + " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", + " attn = torch.softmax(scores, dim=-1)\n", + " attn = self.dropout(attn)\n", + " context = torch.matmul(attn, V)\n", + " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", + " return self.out_proj(context)\n", + "\n", + "# 3. 
FlashAttention benzeri Attention (placeholder)\n", + "def flash_attention(Q, K, V):\n", + " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))\n", + " attn = torch.softmax(scores, dim=-1)\n", + " return torch.matmul(attn, V)\n", + "\n", + "class FlashAttentionModule(nn.Module):\n", + " def __init__(self, emb_dim, n_heads, dropout):\n", + " super().__init__()\n", + " assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n", + " self.n_heads = n_heads\n", + " self.head_dim = emb_dim // n_heads\n", + " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.k_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.v_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x):\n", + " batch, seq_len, emb_dim = x.size()\n", + " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " context = flash_attention(Q, K, V)\n", + " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", + " return self.out_proj(context)\n", + "\n", + "# 4. Multi-Query Attention: Keys & Values tek projeksiyon\n", + "class MultiQueryAttention(nn.Module):\n", + " def __init__(self, emb_dim, n_heads, dropout):\n", + " super().__init__()\n", + " self.n_heads = n_heads\n", + " self.head_dim = emb_dim // n_heads\n", + " self.q_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.k_proj = nn.Linear(emb_dim, self.head_dim)\n", + " self.v_proj = nn.Linear(emb_dim, self.head_dim)\n", + " self.out_proj = nn.Linear(emb_dim, emb_dim)\n", + " self.dropout = nn.Dropout(dropout)\n", + "\n", + " def forward(self, x):\n", + " batch, seq_len, emb_dim = x.size()\n", + " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n", + " K = self.k_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", + " V = self.v_proj(x).unsqueeze(1).expand(batch, self.n_heads, seq_len, self.head_dim)\n", + " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n", + " attn = torch.softmax(scores, dim=-1)\n", + " attn = self.dropout(attn)\n", + " context = torch.matmul(attn, V)\n", + " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n", + " return self.out_proj(context)\n", + "\n", + "# 5. 
ALiBi Attention: integrates relative position information by adding linear bias (Press et al., 2021)\n",
+ "class ALiBiAttention(nn.Module):\n",
+ " def __init__(self, emb_dim, n_heads, dropout, alibi_scaling=-1.0):\n",
+ " super().__init__()\n",
+ " assert emb_dim % n_heads == 0, \"Embedding dimension must be divisible by the number of heads.\"\n",
+ " self.n_heads = n_heads\n",
+ " self.head_dim = emb_dim // n_heads\n",
+ " self.q_proj = nn.Linear(emb_dim, emb_dim)\n",
+ " self.k_proj = nn.Linear(emb_dim, emb_dim)\n",
+ " self.v_proj = nn.Linear(emb_dim, emb_dim)\n",
+ " self.out_proj = nn.Linear(emb_dim, emb_dim)\n",
+ " self.dropout = nn.Dropout(dropout)\n",
+ " self.alibi_scaling = alibi_scaling\n",
+ "\n",
+ " def forward(self, x):\n",
+ " batch, seq_len, emb_dim = x.size()\n",
+ " Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ " K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ " V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1,2)\n",
+ " scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)\n",
+ " # ALiBi bias: B[i,j] = (j - i) * scale\n",
+ " bias = torch.arange(seq_len, device=x.device).unsqueeze(0) - torch.arange(seq_len, device=x.device).unsqueeze(1)\n",
+ " bias = self.alibi_scaling * bias.float()\n",
+ " scores = scores + bias.unsqueeze(0).unsqueeze(0)\n",
+ " attn = torch.softmax(scores, dim=-1)\n",
+ " attn = self.dropout(attn)\n",
+ " context = torch.matmul(attn, V)\n",
+ " context = context.transpose(1,2).contiguous().view(batch, seq_len, emb_dim)\n",
+ " return self.out_proj(context)\n",
+ "\n",
+ "#############################################\n",
+ "# --- FFN variants ---\n",
+ "#############################################\n",
+ "# 1. Standard FFN\n",
+ "class StandardFFN(nn.Module):\n",
+ " def __init__(self, emb_dim, expansion=4, dropout=0.1):\n",
+ " super().__init__()\n",
+ " self.net = nn.Sequential(\n",
+ " nn.Linear(emb_dim, expansion * emb_dim),\n",
+ " nn.GELU(),\n",
+ " nn.Dropout(dropout),\n",
+ " nn.Linear(expansion * emb_dim, emb_dim)\n",
+ " )\n",
+ " def forward(self, x):\n",
+ " return self.net(x)\n",
+ "\n",
+ "# 2. 
MoE FFN\n", + "class MoEFFN(nn.Module):\n", + " def __init__(self, emb_dim, num_experts, expansion=4, dropout=0.1):\n", + " super().__init__()\n", + " self.num_experts = num_experts\n", + " self.experts = nn.ModuleList([\n", + " nn.Sequential(\n", + " nn.Linear(emb_dim, expansion * emb_dim),\n", + " nn.GELU(),\n", + " nn.Dropout(dropout),\n", + " nn.Linear(expansion * emb_dim, emb_dim)\n", + " ) for _ in range(num_experts)\n", + " ])\n", + " self.gate = nn.Linear(emb_dim, num_experts)\n", + "\n", + " def forward(self, x):\n", + " gate_scores = torch.softmax(self.gate(x), dim=-1) # (batch, seq_len, num_experts)\n", + " expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1) # (batch, seq_len, emb_dim, num_experts)\n", + " gate_scores = gate_scores.unsqueeze(2) # (batch, seq_len, 1, num_experts)\n", + " return (expert_outputs * gate_scores).sum(dim=-1)\n", + "\n", + "#############################################\n", + "# --- Transformer block: selectable attention and FFN variants, dropout, pre-norm ---\n", + "#############################################\n", + "class TransformerBlock(nn.Module):\n", + " def __init__(self, emb_dim, n_heads, attn_module, ffn_module, dropout, norm_type):\n", + " super().__init__()\n", + " self.norm1 = get_norm(norm_type, emb_dim)\n", + " self.attn = attn_module(emb_dim, n_heads, dropout)\n", + " self.norm2 = get_norm(norm_type, emb_dim)\n", + " self.ffn = ffn_module(emb_dim, dropout=dropout) # ffn_module: StandardFFN or MoEFFN (for MoE, lambda is used)\n", + "\n", + " def forward(self, x):\n", + " x = x + self.attn(self.norm1(x))\n", + " x = x + self.ffn(self.norm2(x))\n", + " return x\n", + "\n", + "#############################################\n", + "# --- Transformer model: structure supporting different variants ---\n", + "#############################################\n", + "class TransformerModel(nn.Module):\n", + " def __init__(self, config, attn_variant='standard', ffn_variant='standard'):\n", + " super().__init__()\n", + " self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)\n", + " self.pos_embed = nn.Embedding(config.max_length, config.emb_dim)\n", + "\n", + " attn_dict = {\n", + " 'standard': StandardAttention,\n", + " 'rope': RoPEAttention,\n", + " 'flash': FlashAttentionModule,\n", + " 'multiquery': MultiQueryAttention,\n", + " 'alibi': ALiBiAttention\n", + " }\n", + " ffn_dict = {\n", + " 'standard': StandardFFN,\n", + " 'moe': lambda emb_dim, dropout: MoEFFN(emb_dim, config.num_experts, dropout=dropout)\n", + " }\n", + " self.layers = nn.ModuleList([\n", + " TransformerBlock(config.emb_dim, config.n_heads, attn_dict[attn_variant], ffn_dict[ffn_variant], config.dropout, config.norm_type)\n", + " for _ in range(config.n_layers)\n", + " ])\n", + " self.norm = get_norm(config.norm_type, config.emb_dim)\n", + " self.output_proj = nn.Linear(config.emb_dim, config.vocab_size, bias=False)\n", + "\n", + " def forward(self, x):\n", + " seq_len = x.size(1)\n", + " x = self.token_embed(x) + self.pos_embed(torch.arange(seq_len, device=x.device))\n", + " for layer in self.layers:\n", + " x = layer(x)\n", + " x = self.norm(x)\n", + " return self.output_proj(x)\n", + "\n", + "#############################################\n", + "# --- Extra: model summary and parameter count function ---\n", + "#############################################\n", + "def model_summary(model):\n", + " total_params = sum(p.numel() for p in model.parameters())\n", + " trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n", + " 
print(f\"Toplam Parametre: {total_params:,}\")\n", + " print(f\"Trainable parameters: {trainable:,}\")\n", + "\n", + "#############################################\n", + "# --- Ek: Greedy Decoding Fonksiyonu ---\n", + "#############################################\n", + "def greedy_decode(model, start_token, max_length, device):\n", + " model.eval()\n", + " generated = [start_token]\n", + " input_seq = torch.tensor([generated], device=device)\n", + " with torch.no_grad():\n", + " for _ in range(max_length - 1):\n", + " logits = model(input_seq) # (batch, seq_len, vocab_size)\n", + " next_token = torch.argmax(logits[0, -1, :]).item()\n", + " generated.append(next_token)\n", + " input_seq = torch.tensor([generated], device=device)\n", + " model.train()\n", + " return generated\n", + "\n", + "#############################################\n", + "# --- Extra: simple training loop ---\n", + "#############################################\n", + "def train_model(model, config, epochs=3):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " model.to(device)\n", + " optimizer = optim.AdamW(model.parameters(), lr=1e-4)\n", + " loss_fn = nn.CrossEntropyLoss()\n", + " # Dummy dataset: rastgele token dizileri\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " dummy_input = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", + " dummy_target = torch.randint(0, config.vocab_size, (8, config.max_length), device=device)\n", + " optimizer.zero_grad()\n", + " logits = model(dummy_input) # (batch, seq_len, vocab_size)\n", + " loss = loss_fn(logits.view(-1, config.vocab_size), dummy_target.view(-1))\n", + " loss.backward()\n", + " optimizer.step()\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}\")\n", + "\n", + "#############################################\n", + "# --- Detailed test functions (improved version of the previous one) ---\n", + "#############################################\n", + "def run_detailed_tests(config, variant_list):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " for variant in variant_list:\n", + " attn_var = variant['attn']\n", + " ffn_var = variant['ffn']\n", + " print(f\"\\nTest: Attention = {attn_var}, FFN = {ffn_var}\")\n", + " model = TransformerModel(config, attn_variant=attn_var, ffn_variant=ffn_var).to(device)\n", + " model_summary(model)\n", + " model.train()\n", + " dummy_input = torch.randint(0, config.vocab_size, (4, config.max_length), device=device)\n", + " logits = model(dummy_input)\n", + " loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size),\n", + " torch.randint(0, config.vocab_size, (4 * config.max_length,), device=device))\n", + " loss.backward()\n", + " print(f\"Loss: {loss.item():.4f}, Output shape: {logits.shape}\")\n", + "\n", + " torch.cuda.synchronize() if device.type == 'cuda' else None\n", + " start_time = time.time()\n", + " for _ in range(10):\n", + " _ = model(dummy_input)\n", + " torch.cuda.synchronize() if device.type == 'cuda' else None\n", + " avg_time = (time.time() - start_time) / 10.0\n", + " print(f\"Average forward pass time: {avg_time:.6f} s\")\n", + "\n", + " # Greedy decoding test (generates the first 10 tokens)\n", + " start_token = dummy_input[0, 0].item()\n", + " generated = greedy_decode(model, start_token, max_length=10, device=device)\n", + " print(f\"Greedy Decode Output: {generated}\")\n", + "\n", + "#############################################\n", + "# --- Main Working Section: try 
different variants ---\n", + "#############################################\n", + "if __name__ == \"__main__\":\n", + " # The configuration includes the norm type and dropout.\n", + " config = Config(vocab_size=30522, emb_dim=768, max_length=128, n_layers=4, n_heads=12, dropout=0.1, norm_type='rmsnorm')\n", + "\n", + " # Variants to try: different attention and FFN variants\n", + " variant_list = [\n", + " {'attn': 'standard', 'ffn': 'standard'},\n", + " {'attn': 'rope', 'ffn': 'standard'},\n", + " {'attn': 'flash', 'ffn': 'standard'},\n", + " {'attn': 'multiquery', 'ffn': 'standard'},\n", + " {'attn': 'alibi', 'ffn': 'standard'},\n", + " {'attn': 'standard', 'ffn': 'moe'},\n", + " {'attn': 'rope', 'ffn': 'moe'},\n", + " {'attn': 'flash', 'ffn': 'moe'},\n", + " {'attn': 'multiquery', 'ffn': 'moe'},\n", + " {'attn': 'alibi', 'ffn': 'moe'},\n", + " ]\n", + "\n", + " print(\"=== Detailed Variant Tests ===\")\n", + " run_detailed_tests(config, variant_list)\n", + "\n", + " print(\"\\n=== Training Loop Test ===\")\n", + " # Choose a variant (e.g., advanced variant: RoPE + MoE FFN)\n", + " model = TransformerModel(config, attn_variant='rope', ffn_variant='moe').to(torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"))\n", + " train_model(model, config, epochs=3)\n", + "\n", + " print(\"\\n=== Greedy Decoding Testi ===\")\n", + " # Greedy decoding example: take the first token from dummy input and generate 20 tokens\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " prompt_token = torch.randint(0, config.vocab_size, (1,)).item()\n", + " generated_tokens = greedy_decode(model, prompt_token, max_length=20, device=device)\n", + " print(\"Generated Tokens:\", generated_tokens)" + ], + "metadata": { + "trusted": true, + "execution": { + "iopub.status.busy": "2025-02-12T22:04:05.352069Z", + "iopub.execute_input": "2025-02-12T22:04:05.352443Z", + "iopub.status.idle": "2025-02-12T22:06:42.132452Z", + "shell.execute_reply.started": "2025-02-12T22:04:05.352413Z", + "shell.execute_reply": "2025-02-12T22:06:42.131334Z" }, - { - "cell_type": "code", - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "import math\n", - "from datasets import load_dataset\n", - "from collections import defaultdict\n", - "from reportlab.lib.pagesizes import A4\n", - "from reportlab.pdfgen import canvas\n", - "\n", - "#############################################\n", - "# Turkish-Alpaca Veri Seti ve Tokenizer\n", - "#############################################\n", - "class TurkishAlpacaDataset:\n", - " def __init__(self, config):\n", - " # Hugging Face'den veri setini yükle\n", - " dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n", - " self.instructions = dataset['train']['instruction'][:100] # Limit to 100 samples\n", - " self.outputs = dataset['train']['output'][:100] # Limit to 100 samples\n", - "\n", - " # Tokenizer oluştur\n", - " self.vocab = defaultdict(lambda: len(self.vocab))\n", - " self.vocab[''] = 0 # Padding token'i ekle\n", - "\n", - " # Tüm veriyi tokenize et\n", - " self.tokenize_data()\n", - "\n", - " # Inverse vocab oluştur\n", - " self.inverse_vocab = {v: k for k, v in self.vocab.items()}\n", - "\n", - " # Dynamically update vocab_size in config\n", - " config.vocab_size = len(self.vocab)\n", - " self.config = config\n", - "\n", - " def tokenize_data(self):\n", - " # Instruction ve Output'u tokenize et\n", - " self.tokenized_instructions = []\n", - " self.tokenized_outputs = []\n", - "\n", - " for inst, out in 
zip(self.instructions, self.outputs):\n", - " inst_tokens = [self.vocab[word] for word in inst.split()]\n", - " out_tokens = [self.vocab[word] for word in out.split()]\n", - " self.tokenized_instructions.append(inst_tokens)\n", - " self.tokenized_outputs.append(out_tokens)\n", - "\n", - " def get_batch(self, batch_size=4):\n", - " # Rastgele bir batch oluştur\n", - " indices = torch.randint(0, len(self.tokenized_instructions), (batch_size,))\n", - " inputs, targets = [], []\n", - "\n", - " for i in indices:\n", - " input_tokens = self.tokenized_instructions[i][:-1]\n", - " target_tokens = self.tokenized_outputs[i][1:]\n", - " inputs.append(torch.tensor(input_tokens, dtype=torch.long))\n", - " targets.append(torch.tensor(target_tokens, dtype=torch.long))\n", - "\n", - " # Padding işlemi\n", - " inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)\n", - " targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)\n", - " return inputs, targets\n", - "\n", - "\n", - "#############################################\n", - "# Greedy Decode Function\n", - "#############################################\n", - "def greedy_decode(model, start_token, max_length, device, temperature=1.0):\n", - " \"\"\"\n", - " Greedy decoding to generate sequences from a language model.\n", - "\n", - " Args:\n", - " model: The language model to use for generation.\n", - " start_token: The token ID to start decoding from.\n", - " max_length: Maximum length of the generated sequence.\n", - " device: The device (CPU/GPU) where the model resides.\n", - " temperature: Sampling temperature (optional, default=1.0).\n", - "\n", - " Returns:\n", - " List of generated token IDs.\n", - " \"\"\"\n", - " model.eval()\n", - " with torch.no_grad():\n", - " input_token = torch.tensor([[start_token]], dtype=torch.long).to(device)\n", - " generated_tokens = [start_token]\n", - "\n", - " for _ in range(max_length - 1):\n", - " logits = model(input_token)\n", - " next_token_logits = logits[:, -1, :] / temperature\n", - " next_token = torch.argmax(next_token_logits, dim=-1).item()\n", - "\n", - " if next_token == 0: # Stop if token is generated\n", - " break\n", - "\n", - " generated_tokens.append(next_token)\n", - " input_token = torch.cat([input_token, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)\n", - "\n", - " return generated_tokens\n", - "\n", - "\n", - "#############################################\n", - "# Geliştirilmiş Eğitim ve Değerlendirme\n", - "#############################################\n", - "def train_and_evaluate(model, config, epochs=10):\n", - " device = next(model.parameters()).device\n", - " dataset = TurkishAlpacaDataset(config)\n", - " optimizer = optim.AdamW(model.parameters(), lr=1e-4) # Learning rate artırıldı\n", - " loss_fn = nn.CrossEntropyLoss(ignore_index=0)\n", - "\n", - " print(f\"\\n{'='*40}\")\n", - " print(f\"🏁 {model.name} Eğitime Başlıyor...\")\n", - " print(f\"🔢 Toplam Token Sayısı: {len(dataset.vocab)}\")\n", - " print(f\"⚙️ Kullanılan Donanım: {'GPU' if device.type=='cuda' else 'CPU'}\")\n", - " print(f\"{'='*40}\\n\")\n", - "\n", - " for epoch in range(epochs):\n", - " model.train()\n", - " inputs, targets = dataset.get_batch(batch_size=8) # Batch size artırıldı\n", - " inputs, targets = inputs.to(device), targets.to(device)\n", - "\n", - " optimizer.zero_grad()\n", - " logits = model(inputs)\n", - " logits = logits.view(-1, config.vocab_size) # Reshape logits\n", - " targets = targets.view(-1) # Reshape targets\n", - 
" loss = loss_fn(logits, targets)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # Eğitim Metrikleri\n", - " preds = torch.argmax(logits, dim=-1)\n", - " mask = targets != 0\n", - " correct = (preds[mask] == targets[mask]).sum().item()\n", - " total = mask.sum().item()\n", - " acc = correct / total if total > 0 else 0\n", - " ppl = math.exp(loss.item())\n", - "\n", - " print(f\"Epok {epoch+1}/{epochs} | \"\n", - " f\"Kayıp: {loss.item():.3f} | \"\n", - " f\"Doğruluk: {acc:.1%} | \"\n", - " f\"Perplexity: {ppl:.2f}\")\n", - "\n", - " # Son Değerlendirme\n", - " model.eval()\n", - " with torch.no_grad():\n", - " inputs, targets = dataset.get_batch(batch_size=8)\n", - " inputs, targets = inputs.to(device), targets.to(device)\n", - " logits = model(inputs)\n", - " logits = logits.view(-1, config.vocab_size) # Reshape logits\n", - " targets = targets.view(-1) # Reshape targets\n", - " loss = loss_fn(logits, targets)\n", - "\n", - " # Metrik Hesaplama\n", - " preds = torch.argmax(logits, dim=-1)\n", - " mask = targets != 0\n", - " correct = (preds[mask] == targets[mask]).sum().item()\n", - " total = mask.sum().item()\n", - " final_acc = correct / total if total > 0 else 0\n", - " final_ppl = math.exp(loss.item())\n", - "\n", - " # Örnek Üretim\n", - " start_word = dataset.instructions[0].split()[0]\n", - " input_token = dataset.vocab[start_word]\n", - " generated = greedy_decode(model, input_token, max_length=config.max_length, device=device)\n", - " generated_sentence = ' '.join([dataset.inverse_vocab.get(t, \"?\") for t in generated])\n", - "\n", - " print(f\"\\n⭐ Final Performans ⭐\")\n", - " print(f\"|{'Metric':<15}|{'Değer':<15}|\")\n", - " print(f\"|{'-'*15}|{'-'*15}|\")\n", - " print(f\"|{'Kayıp':<15}|{loss.item():.3f}|\")\n", - " print(f\"|{'Doğruluk':<15}|{final_acc:.1%}|\")\n", - " print(f\"|{'Perplexity':<15}|{final_ppl:.2f}|\")\n", - " print(f\"\\n🔮 Örnek Çıktı: {generated_sentence}\")\n", - "\n", - " # Metrikleri döndür\n", - " return {\n", - " 'parameters': sum(p.numel() for p in model.parameters()),\n", - " 'trainable_parameters': sum(p.numel() for p in model.parameters() if p.requires_grad),\n", - " 'loss': loss.item(),\n", - " 'accuracy': final_acc,\n", - " 'perplexity': final_ppl,\n", - " 'sample_outputs': [generated_sentence]\n", - " }\n", - "\n", - "\n", - "#############################################\n", - "# PDF Oluşturma Fonksiyonu (reportlab ile)\n", - "#############################################\n", - "def save_results_to_pdf(metrics, model_name):\n", - " # PDF dosyasını oluştur\n", - " pdf_path = f\"{model_name}_degerlendirme.pdf\"\n", - " c = canvas.Canvas(pdf_path, pagesize=A4)\n", - " width, height = A4\n", - "\n", - " # Başlık\n", - " c.setFont(\"Helvetica-Bold\", 16)\n", - " c.drawString(50, height - 50, f\"Model Değerlendirme Raporu: {model_name}\")\n", - "\n", - " # Metrikler\n", - " c.setFont(\"Helvetica\", 12)\n", - " y = height - 80\n", - " c.drawString(50, y, \"📊 Performans Metrikleri\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Toplam Parametre Sayısı: {metrics['parameters']:,}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Eğitilebilir Parametre Sayısı: {metrics['trainable_parameters']:,}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Kayıp: {metrics['loss']:.3f}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Doğruluk: {metrics['accuracy']:.1%}\")\n", - " y -= 20\n", - " c.drawString(50, y, f\"Perplexity: {metrics['perplexity']:.2f}\")\n", - "\n", - " # Örnek Çıktılar\n", - " y -= 30\n", - " c.drawString(50, y, \"🔮 Örnek 
Çıktılar\")\n", - " y -= 20\n", - " for i, output in enumerate(metrics['sample_outputs']):\n", - " c.drawString(50, y, f\"Örnek {i+1}: {output}\")\n", - " y -= 20\n", - "\n", - " # PDF'i kaydet\n", - " c.save()\n", - " print(f\"📄 {model_name} için rapor PDF olarak kaydedildi: {pdf_path}\")\n", - "\n", - "\n", - "#############################################\n", - "# Dummy Transformer Model for Testing\n", - "#############################################\n", - "class TransformerModel(nn.Module):\n", - " def __init__(self, config, attn_type, ffn_type):\n", - " super().__init__()\n", - " self.name = f\"{attn_type}-{ffn_type}\"\n", - " self.embedding = nn.Embedding(config.vocab_size, config.emb_dim)\n", - " self.transformer = nn.Transformer(\n", - " d_model=config.emb_dim,\n", - " nhead=config.n_heads,\n", - " num_encoder_layers=config.n_layers,\n", - " num_decoder_layers=config.n_layers,\n", - " dim_feedforward=config.emb_dim * 4,\n", - " dropout=config.dropout\n", - " )\n", - " self.fc_out = nn.Linear(config.emb_dim, config.vocab_size)\n", - "\n", - " def forward(self, x):\n", - " x = self.embedding(x)\n", - " x = self.transformer(x, x)\n", - " x = self.fc_out(x)\n", - " return x\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Konfigürasyon\n", - " class Config:\n", - " def __init__(self):\n", - " self.vocab_size = 100 # Bu değer dinamik olarak güncellenecek\n", - " self.emb_dim = 256 # Embedding boyutu artırıldı\n", - " self.max_length = 32 # Maksimum uzunluk artırıldı\n", - "\n", - " # Veri setinden maksimum uzunluğu hesapla\n", - " dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n", - " instructions = dataset['train']['instruction'][:100] # Limit to 100 samples\n", - " outputs = dataset['train']['output'][:100] # Limit to 100 samples\n", - "\n", - " instruction_lengths = [len(inst.split()) for inst in instructions]\n", - " output_lengths = [len(out.split()) for out in outputs]\n", - "\n", - " max_instruction_length = max(instruction_lengths)\n", - " max_output_length = max(output_lengths)\n", - "\n", - " # max_length'ı instruction ve output'un maksimum uzunluğuna göre ayarla\n", - " self.max_length = max(max_instruction_length, max_output_length) + 10 # Ekstra pay bırak\n", - "\n", - " self.n_layers = 4 # Katman sayısı artırıldı\n", - " self.n_heads = 8 # Head sayısı artırıldı\n", - " self.dropout = 0.1\n", - " self.norm_type = 'rmsnorm'\n", - " self.num_experts = 2\n", - "\n", - " config = Config()\n", - " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "\n", - " # Test Edilecek Modeller\n", - " experiments = [\n", - " {'attn': 'standard', 'ffn': 'standard', 'name': 'Standart Model'},\n", - " {'attn': 'rope', 'ffn': 'standard', 'name': 'RoPE Dikkat'},\n", - " {'attn': 'alibi', 'ffn': 'moe', 'name': 'ALiBi + MoE'},\n", - " {'attn': 'multiquery', 'ffn': 'moe', 'name': 'Multi-Query MoE'}\n", - " ]\n", - "\n", - " # Deneyleri Çalıştır\n", - " results = []\n", - " for exp in experiments:\n", - " print(f\"\\n{'='*40}\")\n", - " print(f\"🧪 {exp['name']} Değerlendiriliyor...\")\n", - " print(f\"{'='*40}\")\n", - "\n", - " model = TransformerModel(config, exp['attn'], exp['ffn']).to(device)\n", - " model.name = exp['name']\n", - "\n", - " # Eğitim ve Değerlendirme\n", - " metrics = train_and_evaluate(model, config, epochs=20) # Epoch sayısı artırıldı\n", - " results.append((exp['name'], metrics))\n", - "\n", - " # PDF Raporu Oluştur\n", - " save_results_to_pdf(metrics, exp['name'])\n", - "\n", - " # Tüm Sonuçları Karşılaştır\n", - " print(\"\\n📊 Tüm 
Modellerin Karşılaştırması:\")\n", -    "    print(f\"|{'Model':<20}|{'Parametre':<10}|{'Doğruluk':<10}|{'Perplexity':<12}|\")\n", -    "    print(f\"|{'-'*20}|{'-'*10}|{'-'*10}|{'-'*12}|\")\n", -    "    for name, metrics in results:\n", -    "        print(f\"|{name:<20}|{metrics['parameters']:<10,}|{metrics['accuracy']:<10.1%}|{metrics['perplexity']:<12.2f}|\")" -   ], -   "metadata": { -    "trusted": true, -    "execution": { -     "iopub.status.busy": "2025-02-12T22:11:53.299779Z", -     "iopub.execute_input": "2025-02-12T22:11:53.300209Z", -     "iopub.status.idle": "2025-02-12T22:11:55.659114Z", -     "shell.execute_reply.started": "2025-02-12T22:11:53.300168Z", -     "shell.execute_reply": "2025-02-12T22:11:55.657535Z" -    }, -    "id": "Gdew4V4fw_I4" -   }, -   "outputs": [], -   "execution_count": null +    "id": "4Q_jYSvEw_I3" +   }, +   "outputs": [], +   "execution_count": null +  }, +  { +   "cell_type": "code", +   "source": [ +    "#!pip install evaluate reportlab" +   ], +   "metadata": { +    "trusted": true, +    "execution": { +     "iopub.status.busy": "2025-02-12T22:06:55.854229Z", +     "iopub.execute_input": "2025-02-12T22:06:55.854996Z", +     "iopub.status.idle": "2025-02-12T22:06:55.861061Z", +     "shell.execute_reply.started": "2025-02-12T22:06:55.854884Z", +     "shell.execute_reply": "2025-02-12T22:06:55.859524Z" }, +    "id": "6ZXigQy4w_I4" +   }, +   "outputs": [], +   "execution_count": null +  }, +  { +   "cell_type": "code", +   "source": [ +    "import torch\n", +    "import torch.nn as nn\n", +    "import torch.optim as optim\n", +    "import math\n", +    "from datasets import load_dataset\n", +    "from collections import defaultdict\n", +    "from reportlab.lib.pagesizes import A4\n", +    "from reportlab.pdfgen import canvas\n", +    "\n", +    "#############################################\n", +    "# Turkish-Alpaca Dataset and Tokenizer\n", +    "#############################################\n", +    "class TurkishAlpacaDataset:\n", +    "    def __init__(self, config):\n", +    "        # Load the dataset from Hugging Face\n", +    "        dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n", +    "        self.instructions = dataset['train']['instruction'][:100] # Limit to 100 samples\n", +    "        self.outputs = dataset['train']['output'][:100] # Limit to 100 samples\n", +    "\n", +    "        # Create tokenizer\n", +    "        self.vocab = defaultdict(lambda: len(self.vocab))\n", +    "        self.vocab[''] = 0 # Add the padding token\n", +    "\n", +    "        # Tokenize all the data\n", +    "        self.tokenize_data()\n", +    "\n", +    "        # Build the inverse vocabulary\n", +    "        self.inverse_vocab = {v: k for k, v in self.vocab.items()}\n", +    "\n", +    "        # Dynamically update vocab_size in config\n", +    "        config.vocab_size = len(self.vocab)\n", +    "        self.config = config\n", +    "\n", +    "    def tokenize_data(self):\n", +    "        # Tokenize the instructions and outputs\n", +    "        self.tokenized_instructions = []\n", +    "        self.tokenized_outputs = []\n", +    "\n", +    "        for inst, out in zip(self.instructions, self.outputs):\n", +    "            inst_tokens = [self.vocab[word] for word in inst.split()]\n", +    "            out_tokens = [self.vocab[word] for word in out.split()]\n", +    "            self.tokenized_instructions.append(inst_tokens)\n", +    "            self.tokenized_outputs.append(out_tokens)\n", +    "\n",
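+    "    # Editor's note (illustrative, not part of the original notebook): the\n", +    "    # defaultdict vocabulary above hands out a fresh id the first time a word\n", +    "    # is looked up, e.g. v = defaultdict(lambda: len(v)) makes\n", +    "    # (v['a'], v['b'], v['a']) evaluate to (0, 1, 0).\n", +    "\n",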
+    "    def get_batch(self, batch_size=4):\n", +    "        # Create a random batch\n", +    "        indices = torch.randint(0, len(self.tokenized_instructions), (batch_size,))\n", +    "        inputs, targets = [], []\n", +    "\n", +    "        for i in indices:\n", +    "            input_tokens = self.tokenized_instructions[i][:-1]\n", +    "            target_tokens = self.tokenized_outputs[i][1:]\n", +    "            inputs.append(torch.tensor(input_tokens, dtype=torch.long))\n", +    "            targets.append(torch.tensor(target_tokens, dtype=torch.long))\n", +    "\n", +    "        # Padding operation\n", +    "        inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)\n", +    "        targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)\n", +    "        return inputs, targets\n", +    "\n", +    "\n", +    "#############################################\n", +    "# Greedy Decode Function\n", +    "#############################################\n", +    "def greedy_decode(model, start_token, max_length, device, temperature=1.0):\n", +    "    \"\"\"\n", +    "    Greedy decoding to generate sequences from a language model.\n", +    "\n", +    "    Args:\n", +    "        model: The language model to use for generation.\n", +    "        start_token: The token ID to start decoding from.\n", +    "        max_length: Maximum length of the generated sequence.\n", +    "        device: The device (CPU/GPU) where the model resides.\n", +    "        temperature: Sampling temperature (optional, default=1.0).\n", +    "\n", +    "    Returns:\n", +    "        List of generated token IDs.\n", +    "    \"\"\"\n", +    "    model.eval()\n", +    "    with torch.no_grad():\n", +    "        input_token = torch.tensor([[start_token]], dtype=torch.long).to(device)\n", +    "        generated_tokens = [start_token]\n", +    "\n", +    "        for _ in range(max_length - 1):\n", +    "            logits = model(input_token)\n", +    "            next_token_logits = logits[:, -1, :] / temperature\n", +    "            next_token = torch.argmax(next_token_logits, dim=-1).item()\n", +    "\n", +    "            if next_token == 0: # Stop if the padding token (id 0) is generated\n", +    "                break\n", +    "\n", +    "            generated_tokens.append(next_token)\n", +    "            input_token = torch.cat([input_token, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)\n", +    "\n", +    "    return generated_tokens\n", +    "\n", +    "\n", +    "#############################################\n", +    "# Improved training and evaluation\n", +    "#############################################\n", +    "def train_and_evaluate(model, config, epochs=10):\n", +    "    device = next(model.parameters()).device\n", +    "    dataset = TurkishAlpacaDataset(config)\n", +    "    optimizer = optim.AdamW(model.parameters(), lr=1e-4) # Increased learning rate\n", +    "    loss_fn = nn.CrossEntropyLoss(ignore_index=0)\n", +    "\n", +    "    print(f\"\\n{'='*40}\")\n", +    "    print(f\"🏁 {model.name} is starting training...\")\n", +    "    print(f\"🔢 Total Token Count: {len(dataset.vocab)}\")\n", +    "    print(f\"⚙️ Hardware in use: {'GPU' if device.type=='cuda' else 'CPU'}\")\n", +    "    print(f\"{'='*40}\\n\")\n", +    "\n", +    "    for epoch in range(epochs):\n", +    "        model.train()\n", +    "        inputs, targets = dataset.get_batch(batch_size=8) # Increased batch size\n", +    "        inputs, targets = inputs.to(device), targets.to(device)\n", +    "\n", +    "        optimizer.zero_grad()\n", +    "        logits = model(inputs)\n", +    "        logits = logits.view(-1, config.vocab_size) # Reshape logits\n", +    "        targets = targets.view(-1) # Reshape targets\n", +    "        loss = loss_fn(logits, targets)\n", +    "        loss.backward()\n", +    "        optimizer.step()\n", +    "\n", +    "        # Training metrics\n", +    "        preds = torch.argmax(logits, dim=-1)\n", +    "        mask = targets != 0\n", +    "        correct = (preds[mask] == targets[mask]).sum().item()\n", +    "        total = mask.sum().item()\n", +    "        acc = correct / total if total > 0 else 0\n", +    "        ppl = math.exp(loss.item())\n", +    "\n", +    "        print(f\"Epoch {epoch+1}/{epochs} | \"\n", +    "              f\"Loss: {loss.item():.3f} | \"\n", +    "              f\"Accuracy: {acc:.1%} | \"\n", +    "              f\"Perplexity: {ppl:.2f}\")\n", +    "\n",
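+    "    # Editor's aside (not in the original): the perplexity printed above is\n", +    "    # exp(mean cross-entropy), so a loss of 4.6 gives math.exp(4.6), about 99.5,\n", +    "    # i.e. roughly the uncertainty of a uniform choice over ~100 tokens.\n", +    "\n",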
+    "    # Final evaluation\n", +    "    model.eval()\n", +    "    with torch.no_grad():\n", +    "        inputs, targets = dataset.get_batch(batch_size=8)\n", +    "        inputs, targets = inputs.to(device), targets.to(device)\n", +    "        logits = model(inputs)\n", +    "        logits = logits.view(-1, config.vocab_size) # Reshape logits\n", +    "        targets = targets.view(-1) # Reshape targets\n", +    "        loss = loss_fn(logits, targets)\n", +    "\n", +    "        # Metric computation\n", +    "        preds = torch.argmax(logits, dim=-1)\n", +    "        mask = targets != 0\n", +    "        correct = (preds[mask] == targets[mask]).sum().item()\n", +    "        total = mask.sum().item()\n", +    "        final_acc = correct / total if total > 0 else 0\n", +    "        final_ppl = math.exp(loss.item())\n", +    "\n", +    "    # Example generation\n", +    "    start_word = dataset.instructions[0].split()[0]\n", +    "    input_token = dataset.vocab[start_word]\n", +    "    generated = greedy_decode(model, input_token, max_length=config.max_length, device=device)\n", +    "    generated_sentence = ' '.join([dataset.inverse_vocab.get(t, \"?\") for t in generated])\n", +    "\n", +    "    print(f\"\\n⭐ Final Performance ⭐\")\n", +    "    print(f\"|{'Metric':<15}|{'Value':<15}|\")\n", +    "    print(f\"|{'-'*15}|{'-'*15}|\")\n", +    "    print(f\"|{'Loss':<15}|{loss.item():.3f}|\")\n", +    "    print(f\"|{'Accuracy':<15}|{final_acc:.1%}|\")\n", +    "    print(f\"|{'Perplexity':<15}|{final_ppl:.2f}|\")\n", +    "    print(f\"\\n🔮 Example Output: {generated_sentence}\")\n", +    "\n", +    "    # Return the metrics\n", +    "    return {\n", +    "        'parameters': sum(p.numel() for p in model.parameters()),\n", +    "        'trainable_parameters': sum(p.numel() for p in model.parameters() if p.requires_grad),\n", +    "        'loss': loss.item(),\n", +    "        'accuracy': final_acc,\n", +    "        'perplexity': final_ppl,\n", +    "        'sample_outputs': [generated_sentence]\n", +    "    }\n", +    "\n", +    "\n", +    "#############################################\n", +    "# PDF generation function (with reportlab)\n", +    "#############################################\n", +    "def save_results_to_pdf(metrics, model_name):\n", +    "    # Create the PDF file\n", +    "    pdf_path = f\"{model_name}_degerlendirme.pdf\"\n", +    "    c = canvas.Canvas(pdf_path, pagesize=A4)\n", +    "    width, height = A4\n", +    "\n", +    "    # Title\n", +    "    c.setFont(\"Helvetica-Bold\", 16)\n", +    "    c.drawString(50, height - 50, f\"Model Evaluation Report: {model_name}\")\n", +    "\n", +    "    # Metrics\n", +    "    c.setFont(\"Helvetica\", 12)\n", +    "    y = height - 80\n", +    "    c.drawString(50, y, \"📊 Performance Metrics\")\n", +    "    y -= 20\n", +    "    c.drawString(50, y, f\"Total Number of Parameters: {metrics['parameters']:,}\")\n", +    "    y -= 20\n", +    "    c.drawString(50, y, f\"Number of Trainable Parameters: {metrics['trainable_parameters']:,}\")\n", +    "    y -= 20\n", +    "    c.drawString(50, y, f\"Loss: {metrics['loss']:.3f}\")\n", +    "    y -= 20\n", +    "    c.drawString(50, y, f\"Accuracy: {metrics['accuracy']:.1%}\")\n", +    "    y -= 20\n", +    "    c.drawString(50, y, f\"Perplexity: {metrics['perplexity']:.2f}\")\n", +    "\n", +    "    # Example outputs\n", +    "    y -= 30\n", +    "    c.drawString(50, y, \"🔮 Example Outputs\")\n", +    "    y -= 20\n", +    "    for i, output in enumerate(metrics['sample_outputs']):\n", +    "        c.drawString(50, y, f\"Example {i+1}: {output}\")\n", +    "        y -= 20\n", +    "\n", +    "    # Save the PDF\n", +    "    c.save()\n", +    "    print(f\"📄 Saved the report for {model_name} as a PDF: {pdf_path}\")\n", +    "\n", +    "\n",
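+    "# Editor's note (not in the original): nn.Transformer defaults to\n", +    "# batch_first=False, i.e. it expects (seq, batch, embed) inputs, while the\n", +    "# forward below feeds it (batch, seq, embed). The shapes still line up, but\n", +    "# the batch and sequence axes swap roles; passing batch_first=True to\n", +    "# nn.Transformer would make the intent explicit.\n", +    "\n",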
+    "#############################################\n", +    "# Dummy Transformer Model for Testing\n", +    "#############################################\n", +    "class TransformerModel(nn.Module):\n", +    "    def __init__(self, config, attn_type, ffn_type):\n", +    "        super().__init__()\n", +    "        self.name = f\"{attn_type}-{ffn_type}\"\n", +    "        self.embedding = nn.Embedding(config.vocab_size, config.emb_dim)\n", +    "        self.transformer = nn.Transformer(\n", +    "            d_model=config.emb_dim,\n", +    "            nhead=config.n_heads,\n", +    "            num_encoder_layers=config.n_layers,\n", +    "            num_decoder_layers=config.n_layers,\n", +    "            dim_feedforward=config.emb_dim * 4,\n", +    "            dropout=config.dropout\n", +    "        )\n", +    "        self.fc_out = nn.Linear(config.emb_dim, config.vocab_size)\n", +    "\n", +    "    def forward(self, x):\n", +    "        x = self.embedding(x)\n", +    "        x = self.transformer(x, x)\n", +    "        x = self.fc_out(x)\n", +    "        return x\n", +    "\n", +    "\n", +    "if __name__ == \"__main__\":\n", +    "    # Configuration\n", +    "    class Config:\n", +    "        def __init__(self):\n", +    "            self.vocab_size = 100 # This value will be updated dynamically\n", +    "            self.emb_dim = 256 # Embedding dimension increased\n", +    "            self.max_length = 32 # Maximum length increased\n", +    "\n", +    "            # Compute the maximum length from the dataset\n", +    "            dataset = load_dataset(\"TFLai/Turkish-Alpaca\")\n", +    "            instructions = dataset['train']['instruction'][:100] # Limit to 100 samples\n", +    "            outputs = dataset['train']['output'][:100] # Limit to 100 samples\n", +    "\n", +    "            instruction_lengths = [len(inst.split()) for inst in instructions]\n", +    "            output_lengths = [len(out.split()) for out in outputs]\n", +    "\n", +    "            max_instruction_length = max(instruction_lengths)\n", +    "            max_output_length = max(output_lengths)\n", +    "\n", +    "            # Set max_length based on the maximum instruction and output length\n", +    "            self.max_length = max(max_instruction_length, max_output_length) + 10 # Leave additional buffer\n", +    "\n", +    "            self.n_layers = 4 # Number of layers increased\n", +    "            self.n_heads = 8 # Number of heads increased\n", +    "            self.dropout = 0.1\n", +    "            self.norm_type = 'rmsnorm'\n", +    "            self.num_experts = 2\n", +    "\n", +    "    config = Config()\n", +    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", +    "\n", +    "    # Models to be tested\n", +    "    experiments = [\n", +    "        {'attn': 'standard', 'ffn': 'standard', 'name': 'Standard Model'},\n", +    "        {'attn': 'rope', 'ffn': 'standard', 'name': 'RoPE Attention'},\n", +    "        {'attn': 'alibi', 'ffn': 'moe', 'name': 'ALiBi + MoE'},\n", +    "        {'attn': 'multiquery', 'ffn': 'moe', 'name': 'Multi-Query MoE'}\n", +    "    ]\n", +    "\n", +    "    # Run the experiments\n", +    "    results = []\n", +    "    for exp in experiments:\n", +    "        print(f\"\\n{'='*40}\")\n", +    "        print(f\"🧪 Evaluating {exp['name']}...\")\n", +    "        print(f\"{'='*40}\")\n", +    "\n", +    "        model = TransformerModel(config, exp['attn'], exp['ffn']).to(device)\n", +    "        model.name = exp['name']\n", +    "\n", +    "        # Training and evaluation\n", +    "        metrics = train_and_evaluate(model, config, epochs=20) # Increased number of epochs\n", +    "        results.append((exp['name'], metrics))\n", +    "\n", +    "        # Generate the PDF report\n", +    "        save_results_to_pdf(metrics, exp['name'])\n", +    "\n", +    "    # Compare all results\n", +    "    print(\"\\n📊 Comparison of All Models:\")\n", +    "    print(f\"|{'Model':<20}|{'Parameters':<10}|{'Accuracy':<10}|{'Perplexity':<12}|\")\n", +    "    print(f\"|{'-'*20}|{'-'*10}|{'-'*10}|{'-'*12}|\")\n", +    "    for name, metrics in results:\n", +    "        print(f\"|{name:<20}|{metrics['parameters']:<10,}|{metrics['accuracy']:<10.1%}|{metrics['perplexity']:<12.2f}|\")" +   ], +   "metadata": { +    "trusted": true, +    "execution": { +     "iopub.status.busy": "2025-02-12T22:11:53.299779Z", +     "iopub.execute_input": "2025-02-12T22:11:53.300209Z", +     "iopub.status.idle":
"2025-02-12T22:11:55.659114Z", + "shell.execute_reply.started": "2025-02-12T22:11:53.300168Z", + "shell.execute_reply": "2025-02-12T22:11:55.657535Z" }, - { - "cell_type": "code", - "source": [], - "metadata": { - "trusted": true, - "id": "HhxJ5m_Dw_I5" - }, - "outputs": [], - "execution_count": null - } - ] + "id": "Gdew4V4fw_I4" + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "This warning appears because both the CPU and GPU are insufficient for the operation" + ], + "metadata": { + "id": "6f-KvTbfw_I4" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "trusted": true, + "id": "HhxJ5m_Dw_I5" + }, + "outputs": [], + "execution_count": null + } + ] } \ No newline at end of file diff --git a/Genel-5/llada.ipynb b/Genel-5/llada.ipynb index cd17eb2..0f003e1 100644 --- a/Genel-5/llada.ipynb +++ b/Genel-5/llada.ipynb @@ -5,8 +5,8 @@ "id": "9e726e7e", "metadata": {}, "source": [ - "# Difüzyon Temelli Metin Üretimi (SE Data Set ile)\n", - "Bu notebook, HuggingFace'den alınan `salihturkoglu/se_data_set` veri setinin `instruction` ve `response` sütunlarını kullanarak difüzyon temelli metin üretimini gösterir." + "# Diffusion-based text generation (with the SE data set)\n", + "This notebook demonstrates diffusion-based text generation using the `instruction` and `response` columns of the `salihturkoglu/se_data_set` dataset from HuggingFace." ] }, { @@ -14,13 +14,13 @@ "id": "fed42a70", "metadata": {}, "source": [ - "## 1. Veri Setini İndir ve Hazırla\n", - "Veri seti HuggingFace Datasets ile yüklenir. Her bir örnek için `instruction` giriş, `response` ise hedef metindir." + "## 1. Download and prepare the dataset\n", + "Load the dataset with HuggingFace Datasets. For each sample, `instruction` is the input and `response` is the target text." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d67b069a", "metadata": {}, "outputs": [], @@ -32,10 +32,10 @@ "from torch.utils.data import DataLoader, Dataset\n", "import numpy as np\n", "\n", - "# HuggingFace veri setini yükle\n", + "# Load the HuggingFace dataset\n", "dataset = load_dataset('salihturkoglu/se_data_set', split='train')\n", "\n", - "# Tüm örnekleri kullan (877 satır var)\n", + "# Use all examples (877 rows)\n", "instructions = [ex['instruction'] for ex in dataset]\n", "responses = [ex['response'] for ex in dataset]" ] @@ -45,13 +45,13 @@ "id": "5f0ef5f0", "metadata": {}, "source": [ - "## 2. Tokenizer ve Sözlük Oluşturma\n", - "Tüm metinlerden bir kelime sözlüğü oluşturulur ve metinler tokenlara çevrilir." + "## 2. Build the tokenizer and vocabulary\n", + "Build a vocabulary from all text and convert the sentences into tokens." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "447747d9", "metadata": {}, "outputs": [], @@ -61,7 +61,7 @@ "def tokenize(text):\n", " return text.lower().strip().split()\n", "\n", - "# Sözlük oluştur\n", + "# Build the vocabulary\n", "PAD_TOKEN = ''\n", "UNK_TOKEN = ''\n", "all_texts = instructions + responses\n", @@ -86,12 +86,12 @@ "metadata": {}, "source": [ "## 3. PyTorch Dataset ve DataLoader\n", - "Instruction ve response çiftlerini uygun şekilde tensor haline getirir." + "Convert instruction and response pairs into tensors." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bb7245eb", "metadata": {}, "outputs": [], @@ -123,12 +123,12 @@ "id": "651c71c8", "metadata": {}, "source": [ - "## 4. 
Difüzyon Süreci: Gürültü Ekleme ve Çıkarma" + "## 4. Diffusion process: add and remove noise" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d4b3c522", "metadata": {}, "outputs": [], @@ -146,12 +146,12 @@ "id": "5997f6c3", "metadata": {}, "source": [ - "## 5. Model Tanımı" + "## 5. Model definition" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "69d48719", "metadata": {}, "outputs": [], @@ -178,37 +178,15 @@ "id": "e7b7055b", "metadata": {}, "source": [ - "## 6. Eğitim Süreci" + "## 6. Training process" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "4bc574d4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1, Loss: 7.2675\n", - "Epoch 2, Loss: 6.5326\n", - "Epoch 3, Loss: 5.9995\n", - "Epoch 4, Loss: 5.6588\n", - "Epoch 5, Loss: 5.4110\n", - "Epoch 6, Loss: 5.2345\n", - "Epoch 7, Loss: 5.0627\n", - "Epoch 8, Loss: 4.9098\n", - "Epoch 9, Loss: 4.7601\n", - "Epoch 10, Loss: 4.6605\n", - "Epoch 11, Loss: 4.5139\n", - "Epoch 12, Loss: 4.4244\n", - "Epoch 13, Loss: 4.3298\n", - "Epoch 14, Loss: 4.2664\n", - "Epoch 15, Loss: 4.1858\n" - ] - } - ], + "outputs": [], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model = DiffusionTextModel(len(vocab)).to(device)\n", @@ -240,40 +218,22 @@ "id": "6edb3735", "metadata": {}, "source": [ - "## 7. Metin Üretimi (Difüzyon ile Response Oluşturma)" + "## 7. Text generation (producing a response with diffusion)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "a29fd606", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Instruction: Yazılım Mühendisliği bölümünün ders programını görebilir miyim?\n", - "Gerçek Response: Güncel ders programını bölüm web sitesindeki duyurular bölümündeki en güncel ders programı duyurusuna ulaşarak görebilirsiniz.\n", - "Model Response: öğrenci ve eğitimi ve fazla i̇şyeri i̇şyeri öğrenci eğitimi i̇şyeri bilgi i̇şyeri ise, ile ders ile ders en ve en eğitimi ile ders bölümün ile eğitimi öğrenci ders için i̇şyeri ve eğitimi\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\emreq\\Desktop\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\transformer.py:508: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. We recommend specifying layout=torch.jagged when constructing a nested tensor, as this layout receives active development, has better operator coverage, and works with torch.compile. 
(Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\aten\\src\\ATen\\NestedTensorImpl.cpp:182.)\n", - " output = torch._nested_tensor_from_mask(\n" - ] - } - ], + "outputs": [], "source": [ "def generate_response(model, instruction, steps=8, max_len=32):\n", " model.eval()\n", " inp = encode(instruction)[:max_len]\n", " inp = inp + [vocab['']] * (max_len - len(inp))\n", " inp_tensor = torch.tensor([inp], dtype=torch.long, device=device)\n", - " # Başlangıçta tamamen rastgele bir dizi\n", + " # Initially a completely random sequence\n", " generated = torch.randint(2, len(vocab), (1, max_len), device=device)\n", " for step in range(steps):\n", " mask = (generated == vocab[''])\n", @@ -286,10 +246,10 @@ " tokens = generated[0].tolist()\n", " return decode(tokens)\n", "\n", - "# Örnek bir instruction ile response üret\n", + "# Generate a response with a sample instruction\n", "test_instruction = instructions[0]\n", "print('Instruction:', test_instruction)\n", - "print('Gerçek Response:', responses[0])\n", + "print('Ground-truth response:', responses[0])\n", "print('Model Response:', generate_response(model, test_instruction))" ] }, @@ -299,31 +259,21 @@ "metadata": {}, "source": [ "## 9. Test: Herhangi Bir Soru ile Modeli Deneyin\n", - "Aşağıdaki hücrede, istediğiniz bir soruyu `test_instruction` değişkenine yazarak modelin cevabını görebilirsiniz." + "In the cell below, set the `test_instruction` variable to any question to view the model's answer." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "a9e1eacd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Instruction: Ders kaydı yaparken üst sınıftan ders alabilir miyim?\n", - "Gerçek Response: Yok\n", - "Model Response: akademik veya yapılır. öğrenci öğrenci ve ders ders ve ders olan üzerinden ve öğrenci akademik veya öğrenci teknoloji ve veya yönetimi için ders öğrenci veya ve ve bölümün bilgi öğrenci veya ders en olan resmi öğrenci akademik akademik akademik sistemi öğrenci en öğrenci ders öğrenci ile bir öğrenci öğrenci ve bir istediği öğrenci ve ve genellikle sayfasından tüm sistemi öğrenci olmak ilgili değişim öğrenci ders ders öğrenci ve sistemi onaylandıktan öğrenci ders genellikle akademik ve veya ders ders üniversite öğrenci hizmetlerinden fakültesi ve içindeki ve ile bilgi öğrenci ders ve olanakları bir öğrenci akademik veya ve ders bilgi ders öğrenci (katkı ve 4. için ve için da bilgi istediği bilgiye ve fazla ve bir sağlamak ve akademik olan ve ve akademik ve ve sağlanır. öğrenci bir web öğrenci\n" - ] - } - ], + "outputs": [], "source": [ - "# Test etmek istediğiniz soruyu buradan değiştirebilirsiniz.\n", - "test_instruction = \"Ders kaydı yaparken üst sınıftan ders alabilir miyim?\"\n", + "# You can change the question to test here.\n", + "test_instruction = \"Can I take a course from an upper grade during course registration?\"\n", "\n", "print('Instruction:', test_instruction)\n", - "print('Gerçek Response:', responses[instructions.index(test_instruction)] if test_instruction in instructions else \"Yok\")\n", + "print('Ground-truth response:', responses[instructions.index(test_instruction)] if test_instruction in instructions else \"None\")\n", "print('Model Response:', generate_response(model, test_instruction, max_len=max_len))" ] }, @@ -333,23 +283,15 @@ "metadata": {}, "source": [ "## 10. Modelin Test Edilmesi\n", - "Aşağıdaki hücrede modelin test verisi üzerinde ne kadar doğru response üretebildiği ölçülür. 
Basit bir doğruluk metriği olarak, modelin response üretiminde orijinal response ile token bazında ne kadar örtüştüğü hesaplanır." + "The cell below measures how accurately the model can produce responses on the test data. As a simple accuracy metric, it computes how closely the generated response matches the original response token by token." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "83d0357a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test doğruluğu: 1.71% (60/3509)\n" - ] - } - ], + "outputs": [], "source": [ "def evaluate_diffusion_model(model, dataset, n_samples=100, steps=8):\n", " model.eval()\n", @@ -372,7 +314,7 @@ " total += mask.sum().item()\n", " correct += ((generated == tgt) & mask).sum().item()\n", " accuracy = correct / total if total > 0 else 0.0\n", - " print(f\"Test doğruluğu: {accuracy:.2%} ({correct}/{total})\")\n", + " print(f\"Test accuracy: {accuracy:.2%} ({correct}/{total})\")\n", "\n", "# Test et\n", "evaluate_diffusion_model(model, dataset, n_samples=100, steps=8)" @@ -400,4 +342,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/Image - Patch/pixel-unshuffle.ipynb b/Image - Patch/pixel-unshuffle.ipynb index 16e7ce6..5e16e64 100644 --- a/Image - Patch/pixel-unshuffle.ipynb +++ b/Image - Patch/pixel-unshuffle.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e42bd322", "metadata": {}, "outputs": [], @@ -18,20 +18,20 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "488f300e", "metadata": {}, "outputs": [], "source": [ "def apply_pixel_unshuffle(image, scale_factor=4):\n", " \"\"\"\n", - " Pixel unshuffle işlemi uygular\n", - " scale_factor: Ölçek küçültme faktörü (genellikle 2, 4, 8 gibi)\n", + " Applies a pixel unshuffle operation\n", + " scale_factor: Downscaling factor (commonly 2, 4, or 8)\n", " \"\"\"\n", - " # Girdi boyutları\n", + " # Input dimensions\n", " B, C, H, W = image.shape\n", " \n", - " # Yeni boyutları hesapla\n", + " # Calculate the new dimensions\n", " new_H = H // scale_factor\n", " new_W = W // scale_factor\n", " new_C = C * (scale_factor ** 2)\n", @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "005e90fe", "metadata": {}, "outputs": [], @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "49f900c0", "metadata": {}, "outputs": [], @@ -76,7 +76,7 @@ " self.token_dim = token_dim\n", " self.hidden_dim = hidden_dim\n", " \n", - " # MLP bağlayıcı\n", + " # MLP connector\n", " self.mlp = TwoLayerMLPConnector(\n", " input_dim=(3 * unshuffle_factor * unshuffle_factor),\n", " hidden_dim=hidden_dim,\n", @@ -87,7 +87,7 @@ " # 1. Pixel unshuffle uygula\n", " x_unshuffled, shape_info = apply_pixel_unshuffle(x, self.unshuffle_factor)\n", " \n", - " # 2. Token'lara yeniden şekil ver (Batch, Tokens, Channels)\n", + " # 2. 
Reshape into tokens (Batch, Tokens, Channels)\n",     "        B, C, H, W = x_unshuffled.shape\n",     "        tokens = x_unshuffled.reshape(B, C, H * W).permute(0, 2, 1)\n",     "        \n", @@ -99,14 +99,14 @@    },    {     "cell_type": "code", -   "execution_count": 5, +   "execution_count": null,    "id": "d11ffb7c",    "metadata": {},    "outputs": [],    "source": [     "def load_and_process_image(image_path, img_size=(256, 384)):\n", -    "    \"\"\"Görseli yükle ve işle\"\"\"\n", -    "    # Görseli yükle\n", +    "    \"\"\"Load and process the image\"\"\"\n", +    "    # Load the image\n",     "    image = Image.open(image_path).convert('RGB')\n",     "    \n",     "    # Transformations\n",     "    transform = transforms.Compose([\n",     "        transforms.Resize(img_size),\n",     "        transforms.ToTensor(),\n",     "        transforms.Normalize(mean=[0.485, 0.456, 0.406],\n",     "                             std=[0.229, 0.224, 0.225])\n",     "    ])\n",     "    \n", -    "    # Görseli işle\n", +    "    # Process the image\n",     "    processed_image = transform(image).unsqueeze(0) # Batch dimension ekle\n",     "    return processed_image, image\n", "\n", "def visualize_results(original_img, processed_tensor, token_info):\n", -    "    \"\"\"Sonuçları görselleştir\"\"\"\n", +    "    \"\"\"Visualize the results\"\"\"\n",     "    fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",     "    \n", -    "    # Orijinal görsel\n", +    "    # Original image\n",     "    axes[0].imshow(original_img)\n", -    "    axes[0].set_title(f'Orijinal Görsel\nBoyut: {original_img.size}')\n", +    "    axes[0].set_title(f'Original Image\nSize: {original_img.size}')\n",     "    axes[0].axis('off')\n",     "    \n", -    "    # İşlenmiş tensor\n", +    "    # Processed tensor\n",     "    axes[1].imshow(processed_tensor[0].permute(1, 2, 0))\n", -    "    axes[1].set_title(f'İşlenmiş Tensor\nShape: {processed_tensor.shape}')\n", +    "    axes[1].set_title(f'Processed Tensor\nShape: {processed_tensor.shape}')\n",     "    axes[1].axis('off')\n",     "    \n",     "    # Token bilgisi\n",     "    axes[2].text(0.1, 0.5, \n", -    "                 f'Token Sayısı: {token_info[\"num_tokens\"]}\\n'\n", +    "                 f'Token Count: {token_info[\"num_tokens\"]}\\n'\n",     "                 f'Token Boyutu: {token_info[\"token_dim\"]}\\n'\n", -    "                 f'Unshuffle Faktörü: {token_info[\"unshuffle_factor\"]}\\n'\n", -    "                 f'Orijinal Piksel Sayısı: {token_info[\"original_pixels\"]}\\n'\n", -    "                 f'İndirgeme Oranı: {token_info[\"reduction_ratio\"]:.1f}x',\n", +    "                 f'Unshuffle Factor: {token_info[\"unshuffle_factor\"]}\\n'\n", +    "                 f'Original Pixel Count: {token_info[\"original_pixels\"]}\\n'\n", +    "                 f'Reduction Ratio: {token_info[\"reduction_ratio\"]:.1f}x',\n",     "                 fontsize=12, va='center')\n",     "    axes[2].set_title('Token Bilgisi')\n",     "    axes[2].axis('off')\n", @@ -152,41 +152,22 @@    },    {     "cell_type": "code", -   "execution_count": 6, +   "execution_count": null,    "id": "4d9fdf10",    "metadata": {}, -   "outputs": [ -    { -     "name": "stdout", -     "output_type": "stream", -     "text": [ -      "Kullanılan cihaz: cuda\n", -      "Orijinal görsel boyutu: 256x384 = 98304 piksel\n", -      "Pixel unshuffle sonrası: (1, 48, 64, 96)\n", -      "Token sayısı: 6144\n", -      "Her tokenın boyutu: 768\n", -      "Toplam özellik sayısı: 4718592\n", -      "İndirgeme oranı: 16.0x\n", -      "\n", -      "Gerçek bir görselle test etmek için:\n", -      "1. Yukarıdaki load_and_process_image fonksiyonunu kullanın\n", -      "2. dummy_image yerine gerçek görsel tensorünü verin\n", -      "3. 
visualize_results fonksiyonunu kullanın\n" -     ] -    } -   ], +   "outputs": [],    "source": [     "def main():\n", -    "    # Cihaz ayarı\n", +    "    # Device setup\n",     "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", -    "    print(f\"Kullanılan cihaz: {device}\")\n", +    "    print(f\"Device in use: {device}\")\n",     "    \n", -    "    # Görsel yükle (kendi görsel yolunu verebilirsiniz)\n", -    "    # Örnek olarak rastgele bir tensor kullanıyoruz\n", +    "    # Load an image (you can provide your own image path)\n", +    "    # We use a random tensor as an example\n",     "    original_height, original_width = 256, 384\n",     "    dummy_image = torch.randn(1, 3, original_height, original_width).to(device)\n",     "    \n", -    "    # Modeli oluştur\n", +    "    # Build the model\n",     "    unshuffle_factor = 4\n",     "    token_dim = 768\n",     "    model = EfficientTokenMapper(\n",     "        unshuffle_factor=unshuffle_factor,\n",     "        token_dim=token_dim\n",     "    ).to(device)\n",     "    \n", -    "    # İşlemi uygula\n", +    "    # Apply the operation\n",     "    with torch.no_grad():\n",     "        mapped_tokens, spatial_dims, shape_info = model(dummy_image)\n",     "    \n", -    "    # Sonuçları hesapla\n", +    "    # Compute the results\n",     "    H, W = spatial_dims\n",     "    num_tokens = H * W\n",     "    original_pixels = original_height * original_width\n", @@ -211,19 +192,19 @@     "        'reduction_ratio': original_pixels / num_tokens\n",     "    }\n",     "    \n", -    "    # Sonuçları yazdır\n", -    "    print(f\"Orijinal görsel boyutu: {original_height}x{original_width} = {original_pixels} piksel\")\n", -    "    print(f\"Pixel unshuffle sonrası: {shape_info}\")\n", -    "    print(f\"Token sayısı: {num_tokens}\")\n", -    "    print(f\"Her tokenın boyutu: {token_dim}\")\n", -    "    print(f\"Toplam özellik sayısı: {num_tokens * token_dim}\")\n", -    "    print(f\"İndirgeme oranı: {token_info['reduction_ratio']:.1f}x\")\n", +    "    # Print the results\n", +    "    print(f\"Original image size: {original_height}x{original_width} = {original_pixels} pixels\")\n", +    "    print(f\"After pixel unshuffle: {shape_info}\")\n", +    "    print(f\"Token count: {num_tokens}\")\n", +    "    print(f\"Token dimension: {token_dim}\")\n", +    "    print(f\"Total number of features: {num_tokens * token_dim}\")\n", +    "    print(f\"Reduction ratio: {token_info['reduction_ratio']:.1f}x\")\n",     "    \n", -    "    # Görselleştirme için örnek (gerçek görselle çalışmak isterseniz)\n", -    "    print(\"\\nGerçek bir görselle test etmek için:\")\n", -    "    print(\"1. Yukarıdaki load_and_process_image fonksiyonunu kullanın\")\n", -    "    print(\"2. dummy_image yerine gerçek görsel tensorünü verin\")\n", -    "    print(\"3. visualize_results fonksiyonunu kullanın\")\n", +    "    # Visualization example (use a real image if desired)\n", +    "    print(\"\\nTo test with a real image:\")\n", +    "    print(\"1. Use the load_and_process_image function above\")\n", +    "    print(\"2. Provide a real image tensor instead of dummy_image\")\n", +    "    print(\"3. 
Use the visualize_results function\")\n", "\n", "if __name__ == \"__main__\":\n", "    main()" ] }, { "cell_type": "code", -   "execution_count": 7, +   "execution_count": null,    "id": "0e9cad3e",    "metadata": {}, -   "outputs": [ -    { -     "name": "stdout", -     "output_type": "stream", -     "text": [ -      "Orijinal görsel: 256x384\n", -      "Token sayısı: 6144\n", -      "İndirgeme oranı: 16.0x\n", -      "6144 token oluşturuldu!\n" -     ] -    } -   ], +   "outputs": [],    "source": [     "def test_with_real_image(image_path):\n", -    "    \"\"\"Gerçek bir görselle test edelim\"\"\"\n", +    "    \"\"\"Test with a real image\"\"\"\n",     "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",     "    \n", -    "    # Görseli yükle ve işle\n", +    "    # Load and process the image\n",     "    processed_tensor, original_img = load_and_process_image(image_path)\n",     "    processed_tensor = processed_tensor.to(device)\n",     "    \n", -    "    # Modeli oluştur\n", +    "    # Build the model\n",     "    unshuffle_factor = 4\n",     "    token_dim = 768\n",     "    model = EfficientTokenMapper(\n",     "        unshuffle_factor=unshuffle_factor,\n",     "        token_dim=token_dim\n",     "    ).to(device)\n",     "    \n", -    "    # İşlemi uygula\n", +    "    # Apply the operation\n",     "    with torch.no_grad():\n",     "        mapped_tokens, spatial_dims, _ = model(processed_tensor)\n",     "    \n",     "    H, W = spatial_dims\n",     "    num_tokens = H * W\n",     "    original_pixels = processed_tensor.shape[2] * processed_tensor.shape[3]\n",     "    \n", -    "    print(f\"Orijinal görsel: {processed_tensor.shape[2]}x{processed_tensor.shape[3]}\")\n", -    "    print(f\"Token sayısı: {num_tokens}\")\n", -    "    print(f\"İndirgeme oranı: {original_pixels/num_tokens:.1f}x\")\n", +    "    print(f\"Original image: {processed_tensor.shape[2]}x{processed_tensor.shape[3]}\")\n", +    "    print(f\"Token count: {num_tokens}\")\n", +    "    print(f\"Reduction ratio: {original_pixels/num_tokens:.1f}x\")\n",     "    \n",     "    return mapped_tokens, num_tokens\n", "\n", -    "# Örnek kullanım:\n", +    "# Example usage:\n", "tokens, count = test_with_real_image('YouTube-QA-Agent-08-22-2025_01_46_PM.png')\n", -    "print(f\"{count} token oluşturuldu!\")" +    "print(f\"{count} tokens generated!\")" ] }, {    "cell_type": "markdown",    "id": "83b8db32",    "metadata": {},    "source": [ -    "# 🚀 Geliştirebileceklerimiz\n", +    "# 🚀 Potential improvements\n", "\n", "## 1. Adaptif Token Mapping\n", -    "- **Dinamik Unshuffle Factor**: Görüntü karmaşıklığına göre otomatik ayarlama\n", -    "- **Attention-based Token Selection**: Önemli bölgelere daha fazla token\n", -    "- **Multi-scale Processing**: Farklı resolution'larda token'lar\n", +    "- **Dynamic Unshuffle Factor**: Automatically adjust based on image complexity\n", +    "- **Attention-based Token Selection**: Allocate more tokens to important regions\n", +    "- **Multi-scale Processing**: Tokens at different resolutions\n", "\n", "## 2. Advanced Architecture Improvements\n", "- **Learnable Position Embedding**: Spatial bilgiyi koruma\n", "- **Cross-attention Mechanisms**: Token'lar arası etkileşim\n", "- **Efficient Attention Patterns**: Linear attention, sparse attention\n", "\n", "## 3. Optimization Techniques\n", -    "- **Knowledge Distillation**: Büyük modelden küçük modele bilgi transferi\n", -    "- **Quantization**: Model boyutunu küçültme\n", -    "- **Pruning**: Gereksiz parametreleri kaldırma\n", +    "- **Knowledge Distillation**: Transfer knowledge from a large model to a smaller one\n", +    "- **Quantization**: Reduce model size\n", +    "- **Pruning**: Remove unnecessary parameters\n", "\n", "## 4. 
Multi-Modal Extensions\n", - "- **Text-Image Fusion**: CLIP tarzı joint embeddings\n", + "- **Text-Image Fusion**: Joint embeddings similar to CLIP\n", "- **Video Processing**: Temporal dimension ekleme\n", - "- **Audio-Visual**: Çoklu modalite desteği\n", + "- **Audio-Visual**: Multi-modal support\n", "\n", "## 5. Real-world Applications\n", - "- **Fine-tuning Pipelines**: Specific task'lar için adaptasyon\n", - "- **Deployment Optimization**: Edge device'lar için optimizasyon\n", - "- **Benchmarking**: Standart dataset'lerde performance ölçümü" + "- **Fine-tuning Pipelines**: Adaptation for specific tasks\n", + "- **Deployment Optimization**: Optimize for edge devices\n", + "- **Benchmarking**: Measure performance on standard datasets" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "2fa685ee", "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "was expecting embedding dimension of 48, but got 12", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 79\u001b[39m\n\u001b[32m 76\u001b[39m tokens2, dims2, attn2, factor2 = adaptive_model(complex_image)\n\u001b[32m 77\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mKarmaşık görüntü - Unshuffle factor: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfactor2\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Token sayısı: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdims2[\u001b[32m0\u001b[39m]*dims2[\u001b[32m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m79\u001b[39m \u001b[43mtest_adaptive_mapping\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 72\u001b[39m, in \u001b[36mtest_adaptive_mapping\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 68\u001b[39m adaptive_model = AdaptiveTokenMapper().to(device)\n\u001b[32m 70\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m 71\u001b[39m \u001b[38;5;66;03m# Simple image test\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m72\u001b[39m tokens1, dims1, attn1, factor1 = \u001b[43madaptive_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimple_image\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 73\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mBasit görüntü - Unshuffle factor: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfactor1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, Token sayısı: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdims1[\u001b[32m0\u001b[39m]*dims1[\u001b[32m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 75\u001b[39m \u001b[38;5;66;03m# Complex image test \u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> 
\u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 52\u001b[39m, in \u001b[36mAdaptiveTokenMapper.forward\u001b[39m\u001b[34m(self, x)\u001b[39m\n\u001b[32m 49\u001b[39m tokens = x_unshuffled.reshape(B, C, H * W).permute(\u001b[32m0\u001b[39m, \u001b[32m2\u001b[39m, \u001b[32m1\u001b[39m)\n\u001b[32m 51\u001b[39m \u001b[38;5;66;03m# Attention mechanism ile önemli token'ları belirle\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m52\u001b[39m attended_tokens, attention_weights = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 54\u001b[39m \u001b[38;5;66;03m# Adaptive MLP seçimi\u001b[39;00m\n\u001b[32m 55\u001b[39m mlp_key = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00munshuffle_factor\u001b[38;5;132;01m}\u001b[39;00m\u001b[33mx\u001b[39m\u001b[33m\"\u001b[39m\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1751\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1749\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 
1750\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1751\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1762\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1757\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1758\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1759\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1760\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1761\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1762\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1764\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1765\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\modules\\activation.py:1373\u001b[39m, in \u001b[36mMultiheadAttention.forward\u001b[39m\u001b[34m(self, query, key, value, key_padding_mask, need_weights, attn_mask, average_attn_weights, is_causal)\u001b[39m\n\u001b[32m 1347\u001b[39m attn_output, attn_output_weights = F.multi_head_attention_forward(\n\u001b[32m 1348\u001b[39m query,\n\u001b[32m 1349\u001b[39m key,\n\u001b[32m (...)\u001b[39m\u001b[32m 1370\u001b[39m is_causal=is_causal,\n\u001b[32m 1371\u001b[39m )\n\u001b[32m 1372\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1373\u001b[39m attn_output, attn_output_weights = \u001b[43mF\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmulti_head_attention_forward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1374\u001b[39m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1375\u001b[39m \u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1376\u001b[39m \u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1377\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43membed_dim\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1378\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnum_heads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1379\u001b[39m 
\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43min_proj_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1380\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43min_proj_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1381\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbias_k\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1382\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbias_v\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1383\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43madd_zero_attn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1384\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mdropout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1385\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mout_proj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1386\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mout_proj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1387\u001b[39m \u001b[43m \u001b[49m\u001b[43mtraining\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtraining\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1388\u001b[39m \u001b[43m \u001b[49m\u001b[43mkey_padding_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkey_padding_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1389\u001b[39m \u001b[43m \u001b[49m\u001b[43mneed_weights\u001b[49m\u001b[43m=\u001b[49m\u001b[43mneed_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1390\u001b[39m \u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1391\u001b[39m \u001b[43m \u001b[49m\u001b[43maverage_attn_weights\u001b[49m\u001b[43m=\u001b[49m\u001b[43maverage_attn_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1392\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_causal\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_causal\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1393\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1394\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.batch_first \u001b[38;5;129;01mand\u001b[39;00m is_batched:\n\u001b[32m 1395\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m attn_output.transpose(\u001b[32m1\u001b[39m, \u001b[32m0\u001b[39m), attn_output_weights\n", - "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\emreq\\Desktop\\Projeler\\Transformers\\.venv\\Lib\\site-packages\\torch\\nn\\functional.py:6203\u001b[39m, in \u001b[36mmulti_head_attention_forward\u001b[39m\u001b[34m(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)\u001b[39m\n\u001b[32m 6196\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key_padding_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 6197\u001b[39m \u001b[38;5;66;03m# We have the attn_mask, and use that to merge kpm into it.\u001b[39;00m\n\u001b[32m 
6198\u001b[39m \u001b[38;5;66;03m# Turn off use of is_causal hint, as the merged mask is no\u001b[39;00m\n\u001b[32m 6199\u001b[39m \u001b[38;5;66;03m# longer causal.\u001b[39;00m\n\u001b[32m 6200\u001b[39m is_causal = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 6202\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m (\n\u001b[32m-> \u001b[39m\u001b[32m6203\u001b[39m embed_dim == embed_dim_to_check\n\u001b[32m 6204\u001b[39m ), \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mwas expecting embedding dimension of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00membed_dim_to_check\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00membed_dim\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 6205\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(embed_dim, torch.Tensor):\n\u001b[32m 6206\u001b[39m \u001b[38;5;66;03m# embed_dim can be a tensor when JIT tracing\u001b[39;00m\n\u001b[32m 6207\u001b[39m head_dim = embed_dim.div(num_heads, rounding_mode=\u001b[33m\"\u001b[39m\u001b[33mtrunc\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[31mAssertionError\u001b[39m: was expecting embedding dimension of 48, but got 12" - ] - } - ], + "outputs": [], "source": [ "# 1. Adaptif Token Mapping Implementasyonu\n", "class AdaptiveTokenMapper(nn.Module):\n", @@ -369,17 +319,17 @@ " })\n", " \n", " def analyze_complexity(self, x):\n", - " \"\"\"Görüntü karmaşıklığını analiz et\"\"\"\n", + " \"\"\"Analyze image complexity\"\"\"\n", " complexity = self.complexity_conv(x)\n", " complexity = self.complexity_pool(complexity).squeeze()\n", " \n", - " # Complexity score'a göre unshuffle factor belirle\n", + " # Determine the unshuffle factor based on the complexity score\n", " if complexity < 0.3:\n", - " return 2 # Basit görüntüler için\n", + " return 2 # For simple images\n", " elif complexity < 0.7:\n", - " return 4 # Orta karmaşıklık\n", + " return 4 # Medium complexity\n", " else:\n", - " return 8 # Karmaşık görüntüler için\n", + " return 8 # For complex images\n", " \n", " def forward(self, x):\n", " # Adaptive unshuffle factor\n", @@ -388,14 +338,14 @@ " # Pixel unshuffle uygula\n", " x_unshuffled, shape_info = apply_pixel_unshuffle(x, unshuffle_factor)\n", " \n", - " # Token'lara dönüştür\n", + " # Convert to tokens\n", " B, C, H, W = x_unshuffled.shape\n", " tokens = x_unshuffled.reshape(B, C, H * W).permute(0, 2, 1)\n", " \n", - " # Attention mechanism ile önemli token'ları belirle\n", + " # Use the attention mechanism to highlight important tokens\n", " attended_tokens, attention_weights = self.attention(tokens, tokens, tokens)\n", " \n", - " # Adaptive MLP seçimi\n", + " # Adaptive MLP selection\n", " mlp_key = f\"{unshuffle_factor}x\"\n", " mapped_tokens = self.adaptive_mlp[mlp_key](attended_tokens)\n", " \n", @@ -414,11 +364,11 @@ " with torch.no_grad():\n", " # Simple image test\n", " tokens1, dims1, attn1, factor1 = adaptive_model(simple_image)\n", - " print(f\"Basit görüntü - Unshuffle factor: {factor1}, Token sayısı: {dims1[0]*dims1[1]}\")\n", + " print(f\"Simple image - Unshuffle factor: {factor1}, Token count: {dims1[0]*dims1[1]}\")\n", " \n", " # Complex image test \n", " tokens2, dims2, attn2, factor2 = adaptive_model(complex_image)\n", - " print(f\"Karmaşık görüntü - Unshuffle factor: {factor2}, Token sayısı: {dims2[0]*dims2[1]}\")\n", + " print(f\"Complex image - Unshuffle factor: {factor2}, Token count: {dims2[0]*dims2[1]}\")\n", "\n", "test_adaptive_mapping()" ] @@ -437,7 +387,7 @@ " self.scales = 
@@ -437,7 +387,7 @@
     "        self.scales = scales\n",
     "        self.token_dim = token_dim\n",
     "        \n",
-    "        # Her scale için ayrı mapper\n",
+    "        # Separate mapper for each scale\n",
     "        self.scale_mappers = nn.ModuleDict()\n",
     "        for scale in scales:\n",
     "            input_dim = 3 * scale * scale\n",
@@ -462,13 +412,13 @@
     "        scale_tokens = []\n",
     "        scale_dims = []\n",
     "        \n",
-    "        # Her scale için token'lar oluştur\n",
+    "        # Create tokens for each scale\n",
     "        for scale in self.scales:\n",
-    "            # Pixel unshuffle uygula\n",
+    "            # Apply pixel unshuffle\n",
     "            x_unshuffled, shape_info = apply_pixel_unshuffle(x, scale)\n",
     "            _, C, H, W = x_unshuffled.shape\n",
     "            \n",
-    "            # Token'lara dönüştür\n",
+    "            # Convert to tokens\n",
     "            tokens = x_unshuffled.reshape(B, C, H * W).permute(0, 2, 1)\n",
     "            \n",
     "            # Scale-specific mapping\n",
@@ -476,20 +426,20 @@
     "            scale_tokens.append(mapped)\n",
     "            scale_dims.append((H, W))\n",
     "        \n",
-    "        # En küçük scale'i referans al (en fazla token sayısı)\n",
+    "        # Use the smallest scale as a reference (maximum number of tokens)\n",
     "        max_tokens = max([t.shape[1] for t in scale_tokens])\n",
     "        \n",
-    "        # Tüm scale'leri aynı token sayısına getir (interpolation)\n",
+    "        # Bring all scales to the same number of tokens (interpolation)\n",
     "        aligned_tokens = []\n",
     "        for i, tokens in enumerate(scale_tokens):\n",
     "            if tokens.shape[1] != max_tokens:\n",
-    "                # Adaptive pooling ile token sayısını eşitle\n",
+    "                # Match the token count with adaptive pooling\n",
     "                tokens_reshaped = tokens.permute(0, 2, 1)  # (B, token_dim, num_tokens)\n",
     "                tokens_pooled = F.adaptive_avg_pool1d(tokens_reshaped, max_tokens)\n",
     "                tokens = tokens_pooled.permute(0, 2, 1)  # (B, num_tokens, token_dim)\n",
     "            aligned_tokens.append(tokens)\n",
     "        \n",
-    "        # Scale'leri birleştir\n",
+    "        # Combine the scales\n",
     "        stacked_tokens = torch.stack(aligned_tokens, dim=2)  # (B, num_tokens, num_scales, token_dim)\n",
     "        B, num_tokens, num_scales, token_dim = stacked_tokens.shape\n",
     "        \n",
@@ -497,7 +447,7 @@
     "        fused_tokens = stacked_tokens.view(B * num_tokens, num_scales, token_dim)\n",
     "        fused_output, _ = self.scale_fusion(fused_tokens, fused_tokens, fused_tokens)\n",
     "        \n",
-    "        # Scale dimension'ı birleştir (ortalama al)\n",
+    "        # Merge the scale dimension (take the mean)\n",
     "        final_tokens = fused_output.mean(dim=1)  # (B * num_tokens, token_dim)\n",
     "        final_tokens = final_tokens.view(B, num_tokens, token_dim)\n",
     "        \n",
@@ -546,7 +496,7 @@
-    "        # Vision token mapper (mevcut pixel unshuffle kullanarak)\n",
+    "        # Vision token mapper (using the existing pixel unshuffle)\n",
     "        self.vision_mapper = EfficientTokenMapper()\n",
     "        \n",
-    "        # Text embedding (basit bir örnek)\n",
-    "        self.text_embedding = nn.Embedding(vocab_size=50000, embedding_dim=text_dim)\n",
+    "        # Text embedding (simple example)\n",
+    "        self.text_embedding = nn.Embedding(num_embeddings=50000, embedding_dim=text_dim)\n",
     "        self.text_pos_embedding = nn.Parameter(torch.randn(1, 512, text_dim))\n",
     "        \n",
@@ -828,22 +778,10 @@
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "57960b55",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "🚀 Real-World Application Demo\n",
-      "==================================================\n",
-      "1. Basic Deployment Test:\n",
-      "❌ Demo failed: name 'OptimizedEfficientTokenMapper' is not defined\n",
-      "💡 Make sure you have the test image file available\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# 5. Real-World Deployment Examples\n",
     "class DeploymentPipeline:\n",
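
> The token-alignment step translated above is easy to sanity-check in isolation. A self-contained sketch with illustrative shapes — three scales and 64-dim tokens; none of these numbers come from the notebook:

```python
import torch
import torch.nn.functional as F

scale_tokens = [torch.randn(2, n, 64) for n in (256, 64, 16)]  # token sequences per scale
max_tokens = max(t.shape[1] for t in scale_tokens)  # the smallest scale yields the most tokens

aligned = []
for tokens in scale_tokens:
    if tokens.shape[1] != max_tokens:
        # adaptive_avg_pool1d expects (B, C, L), so swap the token and channel dims around it
        tokens = F.adaptive_avg_pool1d(tokens.permute(0, 2, 1), max_tokens).permute(0, 2, 1)
    aligned.append(tokens)

stacked = torch.stack(aligned, dim=2)
print(stacked.shape)  # torch.Size([2, 256, 3, 64]) = (B, num_tokens, num_scales, token_dim)
```
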
@@ -1056,19 +994,19 @@
    "id": "4daf5861",
    "metadata": {},
    "source": [
-    "# 🎯 Sonuç ve Gelecek Adımlar\n",
+    "# 🎯 Conclusion and next steps\n",
     "\n",
-    "## 📈 Geliştirdiğimiz İyileştirmeler\n",
+    "## 📈 Improvements we introduced\n",
     "\n",
-    "### 1. **Adaptif Token Mapping**\n",
-    "- ✅ Görüntü karmaşıklığına göre otomatik unshuffle factor ayarı\n",
-    "- ✅ Attention mechanism ile önemli token'ları belirleme\n",
-    "- ✅ Dinamik model kapasitesi\n",
+    "### 1. **Adaptive Token Mapping**\n",
+    "- ✅ Automatic unshuffle factor adjustment based on image complexity\n",
+    "- ✅ Identify important tokens with an attention mechanism\n",
+    "- ✅ Dynamic model capacity\n",
     "\n",
     "### 2. **Multi-Scale Processing**\n",
-    "- ✅ Farklı resolution'larda eş zamanlı işleme\n",
+    "- ✅ Concurrent processing at different resolutions\n",
     "- ✅ Scale fusion attention mechanism\n",
-    "- ✅ Daha zengin özellik çıkarımı\n",
+    "- ✅ Richer feature extraction\n",
     "\n",
     "### 3. **Multimodal Capabilities**\n",
     "- ✅ Vision-Language joint processing\n",
@@ -1081,16 +1019,16 @@
     "- ✅ Grouped convolutions\n",
     "- ✅ Efficient deployment pipeline\n",
     "\n",
-    "## 🚀 Önerilen Gelecek Adımlar\n",
+    "## 🚀 Suggested next steps\n",
     "\n",
-    "### Kısa Vadeli (1-2 hafta)\n",
-    "1. **Benchmark Testing**: Standart dataset'lerde performance testi\n",
-    "2. **Fine-tuning Pipeline**: Specific task'lar için adaptasyon\n",
+    "### Short term (1-2 weeks)\n",
+    "1. **Benchmark Testing**: Performance tests on standard datasets\n",
+    "2. **Fine-tuning Pipeline**: Adapt for specific tasks\n",
     "3. **Memory Optimization**: Gradient checkpointing, mixed precision\n",
-    "4. **Validation**: Gerçek görüntü dataset'leriyle test\n",
+    "4. **Validation**: Test with real image datasets\n",
     "\n",
-    "### Orta Vadeli (1-2 ay)\n",
-    "1. **Knowledge Distillation**: Büyük modelden bilgi transferi\n",
+    "### Medium term (1-2 months)\n",
+    "1. **Knowledge Distillation**: Transfer knowledge from a large model\n",
     "2. **Pruning Techniques**: Model compression\n",
     "3. **Advanced Attention**: Sparse attention, sliding window\n",
-    "4. **Video Extension**: Temporal dimension ekleme\n",
+    "4. **Video Extension**: Add a temporal dimension\n",
@@ -1103,8 +1041,8 @@
     "\n",
-    "## 💡 Pratik Uygulamalar\n",
+    "## 💡 Practical Applications\n",
     "\n",
-    "### Hemen Başlayabileceğiniz Projeler:\n",
-    "1. **Image Search Engine**: Token similarity ile görüntü arama\n",
+    "### Projects you can start right away:\n",
+    "1. **Image Search Engine**: Search images using token similarity\n",
     "2. **Content Moderation**: Inappropriate content detection\n",
     "3. **Medical Imaging**: X-ray, MRI analysis\n",
     "4. **Satellite Imagery**: Geographic feature detection\n",
@@ -1116,8 +1054,8 @@
     "- **Evaluation**: Standard metrics, human evaluation\n",
     "- **Deployment**: Cloud services, edge deployment\n",
     "\n",
-    "## 🎉 Özet\n",
-    "Bu notebook ile başlangıçtaki basit pixel unshuffle implementasyonundan, production-ready multimodal vision system'e kadar kapsamlı bir gelişim yolculuğu oluşturduk. Her adım real-world applications'a odaklanarak pratik çözümler sundu."
+    "## 🎉 Summary\n",
+    "With this notebook we moved from a simple pixel unshuffle implementation to a production-ready multimodal vision system. Each step focused on real-world applications and practical solutions."
    ]
   }
  ],
@@ -1142,4 +1080,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
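
> Most of this diff is mechanical: `execution_count` reset to `null` and `outputs` emptied. If that cleanup recurs, it can be scripted with `nbformat` instead of edited by hand — a sketch, with the path filled in for this file:

```python
import nbformat

path = "Genel-1/moe.ipynb"
nb = nbformat.read(path, as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.execution_count = None  # same effect as the execution_count hunks above
        cell.outputs = []            # same effect as the cleared output blocks
nbformat.write(nb, path)             # also ensures the file ends with a newline
```

> `jupyter nbconvert --clear-output --inplace <notebook>` does the same from the command line, which would also avoid the `\ No newline at end of file` markers these hunks introduce.
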