1990two committed on
Commit
3a61d42
·
verified ·
1 Parent(s): 32b65b6

Upload 2 files

Files changed (2)
  1. evolutionary_turing.py +371 -0
  2. evolutionary_turing_docs.py +955 -0
evolutionary_turing.py ADDED
@@ -0,0 +1,371 @@
############################################################################################################################################
#|| - - - |8.19.2025| - - - || Evolutionary Turing Machine || - - - | 1990two | - - -||#
############################################################################################################################################

from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy


@dataclass
class NTMConfig:
    input_dim: int
    output_dim: int
    controller_dim: int = 128
    controller_layers: int = 1  # reserved; the controller below is a single LSTMCell
    memory_slots: int = 128
    memory_dim: int = 32
    heads_read: int = 1
    heads_write: int = 1
    init_std: float = 0.1

############################################################################################################################################
#################################################### - - - Neural Turing Machine - - - ####################################################

class NeuralTuringMachine(nn.Module):
    def __init__(self, cfg: NTMConfig):
        super().__init__()
        self.cfg = cfg
        R, W, Dm = cfg.heads_read, cfg.heads_write, cfg.memory_dim

        ctrl_in = cfg.input_dim + R * Dm
        self.controller = nn.LSTMCell(ctrl_in, cfg.controller_dim)

        iface_read = R * (Dm + 1)             # key + strength
        iface_write = W * (Dm + 1 + Dm + Dm)  # key + strength + erase + add
        self.interface = nn.Linear(cfg.controller_dim, iface_read + iface_write)
        self.output = nn.Linear(cfg.controller_dim + R * Dm, cfg.output_dim)

        self.reset_parameters()

    def reset_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
            if isinstance(m, nn.LSTMCell):
                nn.init.xavier_uniform_(m.weight_ih)
                nn.init.orthogonal_(m.weight_hh)
                nn.init.zeros_(m.bias_ih)
                nn.init.zeros_(m.bias_hh)
                hs = m.bias_ih.shape[0] // 4
                m.bias_ih.data[hs:2*hs].fill_(1.0)  # forget gate
                m.bias_hh.data[hs:2*hs].fill_(1.0)

    def initial_state(self, batch_size: int, device=None):
        cfg = self.cfg
        device = device or next(self.parameters()).device

        M = torch.zeros(batch_size, cfg.memory_slots, cfg.memory_dim, device=device)
        if cfg.init_std > 0:
            M.normal_(0.0, cfg.init_std)

        w_r = torch.ones(batch_size, cfg.heads_read, cfg.memory_slots, device=device) / cfg.memory_slots
        w_w = torch.ones(batch_size, cfg.heads_write, cfg.memory_slots, device=device) / cfg.memory_slots
        r = torch.zeros(batch_size, cfg.heads_read, cfg.memory_dim, device=device)
        h = torch.zeros(batch_size, cfg.controller_dim, device=device)
        c = torch.zeros(batch_size, cfg.controller_dim, device=device)

        return {'M': M, 'w_r': w_r, 'w_w': w_w, 'r': r, 'h': h, 'c': c}

    def step(self, x: torch.Tensor, state: Dict[str, torch.Tensor]):
        cfg = self.cfg
        B = x.shape[0]

        ctrl_in = torch.cat([x, state['r'].view(B, -1)], dim=-1)
        h, c = self.controller(ctrl_in, (state['h'], state['c']))
        iface = self.interface(h)
        R, W, Dm = cfg.heads_read, cfg.heads_write, cfg.memory_dim

        offset = 0
        k_r = iface[:, offset:offset + R * Dm].view(B, R, Dm)
        offset += R * Dm
        beta_r = F.softplus(iface[:, offset:offset + R])
        offset += R

        k_w = iface[:, offset:offset + W * Dm].view(B, W, Dm)
        offset += W * Dm
        beta_w = F.softplus(iface[:, offset:offset + W])
        offset += W
        erase = torch.sigmoid(iface[:, offset:offset + W * Dm]).view(B, W, Dm)
        offset += W * Dm
        add = torch.tanh(iface[:, offset:offset + W * Dm]).view(B, W, Dm)

        def address(M, k, beta, prev_weight=None):
            M_norm = torch.norm(M, dim=-1, keepdim=True).clamp_min(1e-8)
            k_norm = torch.norm(k, dim=-1, keepdim=True).clamp_min(1e-8)
            cos_sim = torch.sum(M.unsqueeze(1) * k.unsqueeze(2), dim=-1) / (
                M_norm.squeeze(-1).unsqueeze(1) * k_norm.squeeze(-1).unsqueeze(-1)
            )
            content_logits = beta.unsqueeze(-1) * cos_sim
            if prev_weight is not None:
                content_logits = content_logits + 0.02 * prev_weight
            return F.softmax(content_logits, dim=-1)

        w_r = address(state['M'], k_r, beta_r, prev_weight=state.get('w_r'))
        w_w = address(state['M'], k_w, beta_w, prev_weight=state.get('w_w'))
        r = torch.sum(w_r.unsqueeze(-1) * state['M'].unsqueeze(1), dim=2)

        M = state['M']
        if W > 0:
            erase_term = torch.prod(1 - w_w.unsqueeze(-1) * erase.unsqueeze(2), dim=1)
            M = M * erase_term
            add_term = torch.sum(w_w.unsqueeze(-1) * add.unsqueeze(2), dim=1)
            M = M + add_term

        y = self.output(torch.cat([h, r.view(B, -1)], dim=-1))

        new_state = {'M': M, 'w_r': w_r, 'w_w': w_w, 'r': r, 'h': h, 'c': c}
        return y, new_state

    def forward(self, x: torch.Tensor, state=None):
        if x.dim() == 2:
            if state is None:
                state = self.initial_state(x.shape[0], x.device)
            return self.step(x, state)

        B, T, _ = x.shape
        if state is None:
            state = self.initial_state(B, x.device)

        outputs = []
        for t in range(T):
            y, state = self.step(x[:, t], state)
            outputs.append(y)

        return torch.stack(outputs, dim=1), state
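
# Minimal smoke-test sketch of the NTM forward pass. The helper below is
# illustrative only (not part of the evolutionary API); it assumes the default
# config fields above and only checks tensor shapes, not task performance.
def _demo_ntm_forward():
    cfg = NTMConfig(input_dim=8, output_dim=8, memory_slots=32, memory_dim=16)
    ntm = NeuralTuringMachine(cfg)
    x = torch.randn(4, 10, cfg.input_dim)  # (batch, time, input_dim)
    y, state = ntm(x)                      # sequence mode
    assert y.shape == (4, 10, cfg.output_dim)
    assert state['M'].shape == (4, cfg.memory_slots, cfg.memory_dim)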

@dataclass
class EvolutionaryTuringConfig:
    population_size: int = 100
    mutation_rate: float = 0.1
    architecture_mutation_rate: float = 0.05
    elite_ratio: float = 0.2
    max_generations: int = 200
    input_dim: int = 8
    output_dim: int = 8
    device: str = 'cpu'
    seed: Optional[int] = None

############################################################################################################################################
################################################### - - - Fitness Evaluation - - - ########################################################

class FitnessEvaluator:
    def __init__(self, device: str = 'cpu'):
        self.device = device

    def copy_task(self, ntm: NeuralTuringMachine, seq_len: int = 8, batch_size: int = 16) -> float:
        with torch.no_grad():
            x = torch.randint(0, 2, (batch_size, seq_len, ntm.cfg.input_dim),
                              device=self.device, dtype=torch.float32)

            delimiter = torch.zeros(batch_size, 1, ntm.cfg.input_dim, device=self.device)
            delimiter[:, :, -1] = 1

            input_seq = torch.cat([x, delimiter], dim=1)
            try:
                output, _ = ntm(input_seq)
                # score the last T outputs against the input sequence -- a soft
                # proxy for copying that omits a separate replay phase
                T = seq_len
                D = ntm.cfg.output_dim
                pred = output[:, -T:, :D]
                d = min(ntm.cfg.input_dim, D)
                loss = F.mse_loss(pred[..., :d], x[..., :d])
                accuracy = 1.0 / (1.0 + loss.item())
                return accuracy
            except Exception:
                return 0.0

    def associative_recall(self, ntm: NeuralTuringMachine, num_pairs: int = 4) -> float:
        with torch.no_grad():
            batch_size = 8
            dim = ntm.cfg.input_dim
            keys = torch.randn(batch_size, num_pairs, dim // 2, device=self.device)
            values = torch.randn(batch_size, num_pairs, dim // 2, device=self.device)
            pairs = torch.cat([keys, values], dim=-1)

            test_keys = torch.cat([keys, torch.zeros_like(values)], dim=-1)
            expected_values = torch.cat([torch.zeros_like(keys), values], dim=-1)

            input_seq = torch.cat([pairs, test_keys], dim=1)  # (B, 2P, dim)
            target_seq = torch.cat([torch.zeros_like(pairs), expected_values], dim=1)

            try:
                output, _ = ntm(input_seq)  # (B, 2P, out_dim)
                D = ntm.cfg.output_dim
                d = min(dim, D)
                loss = F.mse_loss(output[:, num_pairs:, :d], target_seq[:, num_pairs:, :d])
                accuracy = 1.0 / (1.0 + loss.item())
                return accuracy
            except Exception:
                return 0.0

    def evaluate_fitness(self, ntm: NeuralTuringMachine) -> Dict[str, float]:
        copy_score = self.copy_task(ntm)
        recall_score = self.associative_recall(ntm)

        param_count = sum(p.numel() for p in ntm.parameters())
        efficiency = 1.0 / (1.0 + param_count / 100000)

        composite_score = 0.5 * copy_score + 0.3 * recall_score + 0.2 * efficiency

        return {
            'copy': copy_score,
            'recall': recall_score,
            'efficiency': efficiency,
            'composite': composite_score
        }
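
# Illustrative sketch: score an untrained NTM. With random weights both task
# scores sit in the mid range (1/(1+loss) of a random predictor), not near 1.0.
# The helper name is ours; it is not referenced elsewhere in the module.
def _demo_fitness():
    evaluator = FitnessEvaluator(device='cpu')
    ntm = NeuralTuringMachine(NTMConfig(input_dim=8, output_dim=8))
    scores = evaluator.evaluate_fitness(ntm)
    print({k: round(v, 3) for k, v in scores.items()})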

############################################################################################################################################
################################################ - - - Evolutionary Turing Machine - - - ##################################################

class EvolutionaryTuringMachine:
    def __init__(self, cfg: EvolutionaryTuringConfig):
        self.cfg = cfg
        self.evaluator = FitnessEvaluator(cfg.device)
        self.generation = 0
        self.best_fitness = 0.0
        self.population = []

        if cfg.seed is not None:
            torch.manual_seed(cfg.seed)

    def create_random_config(self) -> NTMConfig:
        return NTMConfig(
            input_dim=self.cfg.input_dim,
            output_dim=self.cfg.output_dim,
            controller_dim=torch.randint(64, 256, (1,)).item(),
            controller_layers=torch.randint(1, 3, (1,)).item(),
            memory_slots=torch.randint(32, 256, (1,)).item(),
            memory_dim=torch.randint(16, 64, (1,)).item(),
            heads_read=torch.randint(1, 4, (1,)).item(),
            heads_write=torch.randint(1, 3, (1,)).item(),
            init_std=0.1
        )

    def mutate_architecture(self, cfg: NTMConfig) -> NTMConfig:
        new_cfg = deepcopy(cfg)

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.controller_dim = max(32, new_cfg.controller_dim + torch.randint(-32, 33, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.memory_slots = max(16, new_cfg.memory_slots + torch.randint(-16, 17, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.memory_dim = max(8, new_cfg.memory_dim + torch.randint(-8, 9, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.heads_read = max(1, min(4, new_cfg.heads_read + torch.randint(-1, 2, (1,)).item()))

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.heads_write = max(1, min(3, new_cfg.heads_write + torch.randint(-1, 2, (1,)).item()))

        return new_cfg

    def mutate_parameters(self, ntm: NeuralTuringMachine) -> NeuralTuringMachine:
        new_ntm = NeuralTuringMachine(ntm.cfg).to(self.cfg.device)
        new_ntm.load_state_dict(deepcopy(ntm.state_dict()))
        with torch.no_grad():
            for p in new_ntm.parameters():
                mask = (torch.rand_like(p) < self.cfg.mutation_rate)
                p.add_(torch.randn_like(p) * 0.01 * mask)
        return new_ntm

    def crossover(self, parent1: NeuralTuringMachine, parent2: NeuralTuringMachine) -> NeuralTuringMachine:
        cfg1, cfg2 = parent1.cfg, parent2.cfg

        new_cfg = NTMConfig(
            input_dim=self.cfg.input_dim,
            output_dim=self.cfg.output_dim,
            controller_dim=cfg1.controller_dim if torch.rand(1) < 0.5 else cfg2.controller_dim,
            memory_slots=cfg1.memory_slots if torch.rand(1) < 0.5 else cfg2.memory_slots,
            memory_dim=cfg1.memory_dim if torch.rand(1) < 0.5 else cfg2.memory_dim,
            heads_read=cfg1.heads_read if torch.rand(1) < 0.5 else cfg2.heads_read,
            heads_write=cfg1.heads_write if torch.rand(1) < 0.5 else cfg2.heads_write,
            init_std=0.1
        )

        child = NeuralTuringMachine(new_cfg).to(self.cfg.device)
        return child

    def initialize_population(self):
        self.population = []
        for _ in range(self.cfg.population_size):
            cfg = self.create_random_config()
            ntm = NeuralTuringMachine(cfg).to(self.cfg.device)
            self.population.append(ntm)

    def evolve_generation(self) -> Dict[str, float]:
        fitness_scores = []
        for ntm in self.population:
            fitness = self.evaluator.evaluate_fitness(ntm)
            fitness_scores.append(fitness['composite'])

        sorted_indices = sorted(range(len(fitness_scores)), key=lambda i: fitness_scores[i], reverse=True)

        # keep at least one elite so offspring always have a parent
        elite_count = max(1, int(self.cfg.elite_ratio * self.cfg.population_size))
        elites = [self.population[i] for i in sorted_indices[:elite_count]]

        new_population = elites.copy()

        while len(new_population) < self.cfg.population_size:
            if torch.rand(1) < 0.3 and len(elites) >= 2:
                i, j = torch.randperm(len(elites))[:2].tolist()
                child = self.crossover(elites[i], elites[j])
            else:
                parent_idx = torch.randint(0, elite_count, (1,)).item()
                parent = elites[parent_idx]

                if torch.rand(1) < 0.5:
                    child = self.mutate_parameters(parent)
                else:
                    new_cfg = self.mutate_architecture(parent.cfg)
                    child = NeuralTuringMachine(new_cfg).to(self.cfg.device)

            new_population.append(child)

        self.population = new_population[:self.cfg.population_size]
        self.generation += 1

        best_fitness = max(fitness_scores)
        avg_fitness = sum(fitness_scores) / len(fitness_scores)
        self.best_fitness = max(self.best_fitness, best_fitness)

        return {
            'generation': self.generation,
            'best_fitness': best_fitness,
            'avg_fitness': avg_fitness,
            'best_ever': self.best_fitness
        }

    def run_evolution(self) -> List[Dict[str, float]]:
        self.initialize_population()

        history = []
        for gen in range(self.cfg.max_generations):
            stats = self.evolve_generation()
            history.append(stats)

            if gen % 10 == 0:
                print(f"Gen {gen}: Best={stats['best_fitness']:.4f}, Avg={stats['avg_fitness']:.4f}")

        return history

    def get_best_model(self) -> NeuralTuringMachine:
        fitness_scores = []
        for ntm in self.population:
            fitness = self.evaluator.evaluate_fitness(ntm)
            fitness_scores.append(fitness['composite'])

        best_idx = max(range(len(fitness_scores)), key=lambda i: fitness_scores[i])
        return self.population[best_idx]
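
if __name__ == "__main__":
    # Minimal end-to-end usage sketch: a tiny population and a handful of
    # generations so the loop finishes quickly on CPU. The values here are
    # illustrative; the full demo lives in evolutionary_turing_docs.py.
    cfg = EvolutionaryTuringConfig(population_size=8, max_generations=5,
                                   elite_ratio=0.25, device='cpu', seed=0)
    system = EvolutionaryTuringMachine(cfg)
    history = system.run_evolution()
    best = system.get_best_model()
    print(f"best composite fitness: {history[-1]['best_ever']:.4f}, "
          f"params: {sum(p.numel() for p in best.parameters()):,}")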
evolutionary_turing_docs.py ADDED
@@ -0,0 +1,955 @@
############################################################################################################################################
#|| - - - |8.19.2025| - - - || Evolutionary Turing Machine || - - - | 1990two | - - -||#
############################################################################################################################################
"""
Mathematical Foundation & Conceptual Documentation
-------------------------------------------------

CORE PRINCIPLE:
Combines Neural Turing Machines (external memory architectures) with evolutionary
algorithms to create adaptive memory systems that evolve both their architecture
and parameters through natural selection, enabling discovery of optimal memory
access patterns and computational structures.

MATHEMATICAL FOUNDATION:
=======================

1. NEURAL TURING MACHINE DYNAMICS:
   Content-based addressing: w_t^c = softmax(β_t ⊙ K[M_t, k_t])
   Where:
   - w_t: attention weights over memory locations
   - β_t: key strength (focus parameter)
   - K[M,k]: cosine similarity between memory M and key k
   - M_t: memory matrix at time t
   - k_t: generated key vector

2. MEMORY OPERATIONS:
   Read:  r_t = Σ_i w_t^r[i] × M_t[i]
   Erase: M̃_t[i] = M_{t-1}[i] ⊙ (1 - w_t^w[i] ⊙ e_t)
   Add:   M_t[i] = M̃_t[i] + w_t^w[i] ⊙ a_t

   Where:
   - r_t: read vector
   - e_t: erase vector ∈ [0,1]^M
   - a_t: add vector ∈ ℝ^M
   - ⊙: element-wise product

3. EVOLUTIONARY FITNESS:
   F(individual) = α·task_performance + β·memory_efficiency + γ·stability

   Where:
   - task_performance: accuracy on computational tasks
   - memory_efficiency: 1/(1 + parameter_count/baseline)
   - stability: consistency across multiple runs

4. GENETIC OPERATIONS:
   Architecture Crossover: A_child = random_blend(A_parent1, A_parent2)
   Parameter Mutation: θ'_i = θ_i + ε, ε ~ N(0, σ²), with probability p_mut
   Selection: P(selection) ∝ exp(F(individual)/T)

   Where T is selection temperature.

5. POPULATION DYNAMICS:
   Elite Preservation: keep the top k% of individuals
   Parent Selection: uniform choice among elites in this implementation
   (tournament selection is a common alternative)
   Replacement Strategy: elitist generational replacement

CONCEPTUAL REASONING:
====================

WHY EVOLUTIONARY + TURING MACHINES?
- Fixed NTM architectures may be suboptimal for specific tasks
- Manual architecture design is time-intensive and domain-specific
- Evolution can discover novel memory access patterns
- Natural selection optimizes both structure and parameters simultaneously

KEY INNOVATIONS:
1. **Evolvable Architecture**: memory size, heads, and controller complexity are all mutable
2. **Task-Adaptive Evolution**: fitness functions guide toward task-specific solutions
3. **Multi-Objective Optimization**: balances performance, efficiency, and stability
4. **Hierarchical Mutation**: different rates for architecture vs. parameters
5. **Memory Access Pattern Evolution**: learns optimal attention strategies

APPLICATIONS:
- Algorithmic learning (sorting, copying, associative recall)
- Adaptive control systems with memory requirements
- Meta-learning for memory-augmented architectures
- Neural architecture search for sequence modeling
- Continual learning with evolving memory structures

COMPLEXITY ANALYSIS:
- Individual Evaluation: O(T·(D² + M·H)) where T = sequence length, D = hidden size, M = memory slots, H = heads
- Population Evolution: O(P·evaluations) where P = population size
- Architecture Mutation: O(1) for parameter changes, O(M) for structural changes
- Memory: O(P·(D² + M²)) for population storage

BIOLOGICAL INSPIRATION:
- Neural plasticity and synaptic evolution
- Natural selection of neural circuits
- Memory consolidation and forgetting mechanisms
- Adaptive brain architecture development
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy
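
# Worked example of the content-based addressing formula above,
# w = softmax(β · cosine_sim(M, k)), on a toy 4-slot memory. Illustrative
# helper only (our naming, not part of the original API): a larger β sharpens
# the attention toward the best-matching slot.
def _demo_content_addressing():
    M = torch.eye(4)                        # 4 slots, 4-dim memory rows
    k = torch.tensor([1.0, 0.0, 0.0, 0.0])  # key that matches slot 0 exactly
    cos = F.cosine_similarity(M, k.expand_as(M), dim=-1)  # [1, 0, 0, 0]
    for beta in (1.0, 5.0, 20.0):
        w = F.softmax(beta * cos, dim=-1)
        print(f"beta={beta:5.1f} -> w[0]={w[0].item():.3f}")  # -> 0.475, 0.980, ~1.0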


@dataclass
class NTMConfig:
    """Configuration for the Neural Turing Machine architecture.

    Defines the structure and hyperparameters for a single NTM individual
    in the evolutionary population. All parameters are evolvable.
    """
    input_dim: int
    output_dim: int
    controller_dim: int = 128
    controller_layers: int = 1  # reserved; the controller below is a single LSTMCell
    memory_slots: int = 128
    memory_dim: int = 32
    heads_read: int = 1
    heads_write: int = 1
    init_std: float = 0.1

############################################################################################################################################
#################################################### - - - Neural Turing Machine - - - ####################################################

class NeuralTuringMachine(nn.Module):
    """Neural Turing Machine with external memory and attention mechanisms.

    Implements the complete NTM architecture including:
    - LSTM controller for sequence processing
    - External memory matrix with read/write operations
    - Content-based addressing via cosine similarity
    - Differentiable memory operations (erase, add)

    Mathematical Details:
    - Controller processes input + read vectors: h_t = LSTM(x_t ⊕ r_{t-1}, h_{t-1})
    - Interface parameters: keys, strengths, erase/add vectors
    - Attention: w_t = softmax(β_t ⊙ cosine_sim(M_t, k_t))
    - Memory updates preserve differentiability for gradient-based learning
    """
    def __init__(self, cfg: NTMConfig):
        super().__init__()
        self.cfg = cfg
        R, W, Dm = cfg.heads_read, cfg.heads_write, cfg.memory_dim

        # Controller: processes input + read vectors
        ctrl_in = cfg.input_dim + R * Dm
        self.controller = nn.LSTMCell(ctrl_in, cfg.controller_dim)

        # Interface: generates read/write parameters
        iface_read = R * (Dm + 1)             # key + strength per read head
        iface_write = W * (Dm + 1 + Dm + Dm)  # key + strength + erase + add per write head
        self.interface = nn.Linear(cfg.controller_dim, iface_read + iface_write)

        # Output head: combines controller state + read vectors
        self.output = nn.Linear(cfg.controller_dim + R * Dm, cfg.output_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters with appropriate distributions.

        Uses Xavier initialization for linear layers and orthogonal
        initialization for LSTM recurrent weights to ensure stable training.
        The forget gate bias is initialized to 1.0 for better gradient flow.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
            if isinstance(m, nn.LSTMCell):
                nn.init.xavier_uniform_(m.weight_ih)
                nn.init.orthogonal_(m.weight_hh)
                nn.init.zeros_(m.bias_ih)
                nn.init.zeros_(m.bias_hh)
                # Forget gate bias = 1.0 for better gradient flow
                hs = m.bias_ih.shape[0] // 4
                m.bias_ih.data[hs:2*hs].fill_(1.0)
                m.bias_hh.data[hs:2*hs].fill_(1.0)

    def initial_state(self, batch_size: int, device=None):
        """Initialize NTM state including memory, attention weights, and controller state.

        Args:
            batch_size: Number of parallel sequences
            device: Target device for tensors

        Returns:
            Dictionary containing:
            - M: Memory matrix [batch_size, memory_slots, memory_dim]
            - w_r: Read attention weights [batch_size, heads_read, memory_slots]
            - w_w: Write attention weights [batch_size, heads_write, memory_slots]
            - r: Read vectors [batch_size, heads_read, memory_dim]
            - h, c: LSTM controller states
        """
        cfg = self.cfg
        device = device or next(self.parameters()).device

        # Initialize memory with small random values
        M = torch.zeros(batch_size, cfg.memory_slots, cfg.memory_dim, device=device)
        if cfg.init_std > 0:
            M.normal_(0.0, cfg.init_std)

        # Initialize attention weights uniformly (all locations equally attended)
        w_r = torch.ones(batch_size, cfg.heads_read, cfg.memory_slots, device=device) / cfg.memory_slots
        w_w = torch.ones(batch_size, cfg.heads_write, cfg.memory_slots, device=device) / cfg.memory_slots

        # Initialize read vectors and controller states
        r = torch.zeros(batch_size, cfg.heads_read, cfg.memory_dim, device=device)
        h = torch.zeros(batch_size, cfg.controller_dim, device=device)
        c = torch.zeros(batch_size, cfg.controller_dim, device=device)

        return {'M': M, 'w_r': w_r, 'w_w': w_w, 'r': r, 'h': h, 'c': c}

    def step(self, x: torch.Tensor, state: Dict[str, torch.Tensor]):
        """Execute one forward step of NTM computation.

        Complete NTM forward pass:
        1. Controller processes input + previous reads
        2. Interface generates memory operation parameters
        3. Content-based addressing computes attention weights
        4. Memory operations (read, erase, add)
        5. Output generation

        Args:
            x: Input tensor [batch_size, input_dim]
            state: Current NTM state dictionary

        Returns:
            y: Output tensor [batch_size, output_dim]
            new_state: Updated state dictionary
        """
        cfg = self.cfg
        B = x.shape[0]

        # Step 1: Controller forward pass
        ctrl_in = torch.cat([x, state['r'].view(B, -1)], dim=-1)
        h, c = self.controller(ctrl_in, (state['h'], state['c']))

        # Step 2: Generate interface parameters
        iface = self.interface(h)
        R, W, Dm = cfg.heads_read, cfg.heads_write, cfg.memory_dim

        # Parse interface outputs
        offset = 0
        # Read parameters: keys and strengths
        k_r = iface[:, offset:offset + R * Dm].view(B, R, Dm)
        offset += R * Dm
        beta_r = F.softplus(iface[:, offset:offset + R])
        offset += R

        # Write parameters: keys, strengths, erase vectors, add vectors
        k_w = iface[:, offset:offset + W * Dm].view(B, W, Dm)
        offset += W * Dm
        beta_w = F.softplus(iface[:, offset:offset + W])
        offset += W
        erase = torch.sigmoid(iface[:, offset:offset + W * Dm]).view(B, W, Dm)
        offset += W * Dm
        add = torch.tanh(iface[:, offset:offset + W * Dm]).view(B, W, Dm)

        def address(M, k, beta, prev_weight=None):
            """Content-based addressing mechanism.

            Computes attention weights using cosine similarity between
            memory contents and generated keys, focused by the strength parameter.

            Mathematical Details:
            - Cosine similarity: sim(M[i], k) = (M[i] · k) / (||M[i]|| ||k||)
            - Focused attention: w = softmax(β ⊙ sim)
            - Optional momentum: adds a small fraction of the previous weights

            Args:
                M: Memory matrix [batch_size, slots, memory_dim]
                k: Key vectors [batch_size, heads, memory_dim]
                beta: Strength parameters [batch_size, heads]
                prev_weight: Previous attention weights for momentum

            Returns:
                Attention weights [batch_size, heads, slots]
            """
            # Normalize for cosine similarity
            M_norm = torch.norm(M, dim=-1, keepdim=True).clamp_min(1e-8)
            k_norm = torch.norm(k, dim=-1, keepdim=True).clamp_min(1e-8)

            # Cosine similarity: M[i] · k / (||M[i]|| ||k||)
            cos_sim = torch.sum(M.unsqueeze(1) * k.unsqueeze(2), dim=-1) / (
                M_norm.squeeze(-1).unsqueeze(1) * k_norm.squeeze(-1).unsqueeze(-1)
            )

            # Apply strength and optional momentum
            content_logits = beta.unsqueeze(-1) * cos_sim
            if prev_weight is not None:
                content_logits = content_logits + 0.02 * prev_weight  # small momentum term

            return F.softmax(content_logits, dim=-1)

        # Step 3: Compute attention weights
        w_r = address(state['M'], k_r, beta_r, prev_weight=state.get('w_r'))
        w_w = address(state['M'], k_w, beta_w, prev_weight=state.get('w_w'))

        # Step 4: Memory operations
        # Read: weighted sum over memory locations
        r = torch.sum(w_r.unsqueeze(-1) * state['M'].unsqueeze(1), dim=2)

        # Write: erase then add
        M = state['M']
        if W > 0:
            # Erase: M[i] := M[i] ⊙ (1 - w[i] ⊙ e)
            erase_term = torch.prod(1 - w_w.unsqueeze(-1) * erase.unsqueeze(2), dim=1)
            M = M * erase_term

            # Add: M[i] := M[i] + w[i] ⊙ a
            add_term = torch.sum(w_w.unsqueeze(-1) * add.unsqueeze(2), dim=1)
            M = M + add_term

        # Step 5: Generate output
        y = self.output(torch.cat([h, r.view(B, -1)], dim=-1))

        new_state = {'M': M, 'w_r': w_r, 'w_w': w_w, 'r': r, 'h': h, 'c': c}
        return y, new_state

    def forward(self, x: torch.Tensor, state=None):
        """Forward pass for a single step or a sequence.

        Handles both single-step operation (for interactive use) and
        sequence processing (for training/evaluation).

        Args:
            x: Input tensor [batch_size, input_dim] or [batch_size, seq_len, input_dim]
            state: Optional initial state (created if None)

        Returns:
            For a single step: (output, new_state)
            For a sequence: (output_sequence, final_state)
        """
        if x.dim() == 2:  # Single step
            if state is None:
                state = self.initial_state(x.shape[0], x.device)
            return self.step(x, state)

        # Sequence processing
        B, T, _ = x.shape
        if state is None:
            state = self.initial_state(B, x.device)

        outputs = []
        for t in range(T):
            y, state = self.step(x[:, t], state)
            outputs.append(y)

        return torch.stack(outputs, dim=1), state

@dataclass
class EvolutionaryTuringConfig:
    """Configuration for evolutionary optimization of the NTM population.

    Defines hyperparameters for the evolutionary algorithm including
    population size, mutation rates, selection pressure, and fitness
    evaluation parameters.
    """
    population_size: int = 100
    mutation_rate: float = 0.1
    architecture_mutation_rate: float = 0.05
    elite_ratio: float = 0.2
    max_generations: int = 200
    input_dim: int = 8
    output_dim: int = 8
    device: str = 'cpu'
    seed: Optional[int] = None

############################################################################################################################################
################################################### - - - Fitness Evaluation - - - ########################################################

class FitnessEvaluator:
    """Comprehensive fitness evaluation for NTM individuals.

    Evaluates NTM performance on multiple algorithmic tasks to assess
    general computational capability. Includes efficiency penalties
    to encourage compact, effective architectures.

    Tasks:
    1. Copy Task: tests basic memory read/write capabilities
    2. Associative Recall: tests content-based memory access
    3. Efficiency: penalizes excessive parameters

    Mathematical Details:
    - The copy task measures sequence reproduction accuracy
    - Associative recall tests key-value pair memory
    - Composite fitness balances multiple objectives
    """
    def __init__(self, device: str = 'cpu'):
        self.device = device

    def copy_task(self, ntm: NeuralTuringMachine, seq_len: int = 8, batch_size: int = 16) -> float:
        """Evaluate an NTM on the sequence copying task.

        The copy task is fundamental for testing memory capabilities:
        1. Present the input sequence
        2. Present a delimiter (end-of-sequence marker)
        3. Evaluate reproduction accuracy

        Mathematical Details:
        - Input: x₁, x₂, ..., x_T, delimiter
        - Target: the last T outputs are compared against x₁, x₂, ..., x_T
          (a lightweight proxy that omits a separate replay phase)
        - Loss: MSE between predicted and target sequences
        - Accuracy: 1 / (1 + loss) for a bounded score ∈ (0,1]

        Args:
            ntm: NTM individual to evaluate
            seq_len: Length of sequences to copy
            batch_size: Number of parallel sequences

        Returns:
            Copy task accuracy score ∈ (0,1]
        """
        with torch.no_grad():
            # Generate random binary sequences
            x = torch.randint(0, 2, (batch_size, seq_len, ntm.cfg.input_dim),
                              device=self.device, dtype=torch.float32)

            # Add delimiter (end-of-sequence marker)
            delimiter = torch.zeros(batch_size, 1, ntm.cfg.input_dim, device=self.device)
            delimiter[:, :, -1] = 1  # Use the last dimension as the delimiter signal

            # Complete input: sequence + delimiter
            input_seq = torch.cat([x, delimiter], dim=1)

            try:
                output, _ = ntm(input_seq)

                # Compare output to target (original sequence)
                T = seq_len
                D = ntm.cfg.output_dim
                pred = output[:, -T:, :D]  # Last T outputs

                # Handle dimension mismatch by using the overlap
                d = min(ntm.cfg.input_dim, D)
                loss = F.mse_loss(pred[..., :d], x[..., :d])
                accuracy = 1.0 / (1.0 + loss.item())
                return accuracy
            except Exception:
                # Return zero for failed evaluations (architecture issues)
                return 0.0

    def associative_recall(self, ntm: NeuralTuringMachine, num_pairs: int = 4) -> float:
        """Evaluate an NTM on the associative memory recall task.

        Tests content-based memory access by storing key-value pairs
        and then querying with keys to retrieve the associated values.

        Task Structure:
        1. Store phase: present key-value pairs
        2. Query phase: present keys (with zero values)
        3. Evaluate: check whether the correct values are recalled

        Mathematical Details:
        - Keys: k₁, k₂, ..., kₙ (half of the input dimension)
        - Values: v₁, v₂, ..., vₙ (other half of the input dimension)
        - Query: present [k₁, 0], expect output [0, v₁]
        - Score based on MSE between recalled and target values

        Args:
            ntm: NTM individual to evaluate
            num_pairs: Number of key-value pairs to store/recall

        Returns:
            Associative recall accuracy score ∈ (0,1]
        """
        with torch.no_grad():
            batch_size = 8
            dim = ntm.cfg.input_dim

            # Generate key-value pairs
            keys = torch.randn(batch_size, num_pairs, dim // 2, device=self.device)
            values = torch.randn(batch_size, num_pairs, dim // 2, device=self.device)
            pairs = torch.cat([keys, values], dim=-1)

            # Query format: keys with zero values
            test_keys = torch.cat([keys, torch.zeros_like(values)], dim=-1)
            expected_values = torch.cat([torch.zeros_like(keys), values], dim=-1)

            # Complete sequence: store pairs, then query
            input_seq = torch.cat([pairs, test_keys], dim=1)
            target_seq = torch.cat([torch.zeros_like(pairs), expected_values], dim=1)

            try:
                output, _ = ntm(input_seq)

                # Evaluate the query phase (second half of the sequence)
                D = ntm.cfg.output_dim
                d = min(dim, D)
                loss = F.mse_loss(output[:, num_pairs:, :d], target_seq[:, num_pairs:, :d])
                accuracy = 1.0 / (1.0 + loss.item())
                return accuracy
            except Exception:
                return 0.0

    def evaluate_fitness(self, ntm: NeuralTuringMachine) -> Dict[str, float]:
        """Comprehensive fitness evaluation across multiple criteria.

        Evaluates an individual on multiple tasks and efficiency metrics
        to encourage both performance and architectural parsimony.

        Fitness Components:
        1. Copy Task (50%): basic memory functionality
        2. Associative Recall (30%): content-based memory access
        3. Efficiency (20%): parameter count penalty

        Mathematical Details:
        - Each component scored ∈ (0,1]
        - Efficiency = 1 / (1 + params/baseline)
        - Composite = weighted combination

        Args:
            ntm: NTM individual to evaluate

        Returns:
            Dictionary containing individual and composite fitness scores
        """
        copy_score = self.copy_task(ntm)
        recall_score = self.associative_recall(ntm)

        # Efficiency penalty based on parameter count
        param_count = sum(p.numel() for p in ntm.parameters())
        efficiency = 1.0 / (1.0 + param_count / 100000)  # Normalize to a reasonable range

        # Weighted composite fitness
        composite_score = 0.5 * copy_score + 0.3 * recall_score + 0.2 * efficiency

        return {
            'copy': copy_score,
            'recall': recall_score,
            'efficiency': efficiency,
            'composite': composite_score
        }
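
# Illustrative sketch (our helper, not part of the original API): builds the
# store/query sequence used by associative_recall to make the key/value packing
# concrete. Pairs are [key | value]; queries are [key | 0].
def _demo_recall_layout(num_pairs: int = 4, dim: int = 8):
    keys = torch.randn(2, num_pairs, dim // 2)
    values = torch.randn(2, num_pairs, dim // 2)
    pairs = torch.cat([keys, values], dim=-1)                      # store phase
    queries = torch.cat([keys, torch.zeros_like(values)], dim=-1)  # query phase
    seq = torch.cat([pairs, queries], dim=1)
    assert seq.shape == (2, 2 * num_pairs, dim)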

############################################################################################################################################
################################################ - - - Evolutionary Turing Machine - - - ##################################################

class EvolutionaryTuringMachine:
    """Evolutionary optimization system for Neural Turing Machine architectures.

    Implements a complete evolutionary algorithm for discovering optimal
    NTM architectures and parameters through natural selection. Uses
    both architectural mutations (structure) and parameter mutations.

    Evolutionary Operations:
    1. Selection: rank-based truncation (the elite fraction survives and parents offspring)
    2. Crossover: architectural blending of two elite parents
    3. Mutation: structure modification and parameter perturbation
    4. Replacement: elite preservation with new offspring

    The system evolves both the neural architecture (memory size, heads,
    controller complexity) and the connection weights simultaneously.
    """
    def __init__(self, cfg: EvolutionaryTuringConfig):
        self.cfg = cfg
        self.evaluator = FitnessEvaluator(cfg.device)
        self.generation = 0
        self.best_fitness = 0.0
        self.population = []

        if cfg.seed is not None:
            torch.manual_seed(cfg.seed)

    def create_random_config(self) -> NTMConfig:
        """Generate a random NTM architecture configuration.

        Creates a diverse initial population by randomizing all
        architectural hyperparameters within reasonable bounds.

        Architectural Parameters (half-open ranges, upper bound exclusive):
        - Controller dimension: [64, 256)
        - Memory slots: [32, 256)
        - Memory dimension: [16, 64)
        - Read/write heads: [1, 4) and [1, 3)

        Returns:
            Random NTM configuration
        """
        return NTMConfig(
            input_dim=self.cfg.input_dim,
            output_dim=self.cfg.output_dim,
            controller_dim=torch.randint(64, 256, (1,)).item(),
            controller_layers=torch.randint(1, 3, (1,)).item(),
            memory_slots=torch.randint(32, 256, (1,)).item(),
            memory_dim=torch.randint(16, 64, (1,)).item(),
            heads_read=torch.randint(1, 4, (1,)).item(),
            heads_write=torch.randint(1, 3, (1,)).item(),
            init_std=0.1
        )

    def mutate_architecture(self, cfg: NTMConfig) -> NTMConfig:
        """Apply architectural mutations to an NTM configuration.

        Modifies each structural parameter independently with probability
        architecture_mutation_rate, using small random perturbations.

        Mutation Operations:
        - Controller dimension: ±32 units
        - Memory slots: ±16 units
        - Memory dimension: ±8 units
        - Read/write heads: ±1 head (within bounds)

        Args:
            cfg: Original NTM configuration

        Returns:
            Mutated NTM configuration
        """
        new_cfg = deepcopy(cfg)

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.controller_dim = max(32, new_cfg.controller_dim + torch.randint(-32, 33, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.memory_slots = max(16, new_cfg.memory_slots + torch.randint(-16, 17, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.memory_dim = max(8, new_cfg.memory_dim + torch.randint(-8, 9, (1,)).item())

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.heads_read = max(1, min(4, new_cfg.heads_read + torch.randint(-1, 2, (1,)).item()))

        if torch.rand(1) < self.cfg.architecture_mutation_rate:
            new_cfg.heads_write = max(1, min(3, new_cfg.heads_write + torch.randint(-1, 2, (1,)).item()))

        return new_cfg

    def mutate_parameters(self, ntm: NeuralTuringMachine) -> NeuralTuringMachine:
        """Apply parameter mutations to NTM weights.

        Performs Gaussian perturbations on network parameters with
        probability mutation_rate per element. Creates a new NTM
        instance to avoid modifying the original.

        Mathematical Details:
        - Each weight element p is mutated with probability mutation_rate
        - Mutation: p' = p + ε where ε ~ N(0, 0.01²)
        - Preserves the network architecture; only modifies weights

        Args:
            ntm: Original NTM individual

        Returns:
            New NTM with mutated parameters
        """
        new_ntm = NeuralTuringMachine(ntm.cfg).to(self.cfg.device)
        new_ntm.load_state_dict(deepcopy(ntm.state_dict()))

        with torch.no_grad():
            for p in new_ntm.parameters():
                # Apply mutation mask (probability mutation_rate per element)
                mask = (torch.rand_like(p) < self.cfg.mutation_rate)
                p.add_(torch.randn_like(p) * 0.01 * mask)

        return new_ntm

    def crossover(self, parent1: NeuralTuringMachine, parent2: NeuralTuringMachine) -> NeuralTuringMachine:
        """Create offspring through architectural crossover.

        Combines architectural features from two parents by randomly
        selecting each architectural parameter from either parent.
        The resulting offspring gets a fresh random weight initialization.

        Crossover Strategy:
        - Each architectural parameter is chosen from parent1 or parent2 (50% each)
        - New weights are initialized randomly (architectural crossover only)
        - Alternative: parameter-level crossover could be implemented

        Args:
            parent1: First parent NTM
            parent2: Second parent NTM

        Returns:
            Offspring NTM with a hybrid architecture
        """
        cfg1, cfg2 = parent1.cfg, parent2.cfg

        # Create the hybrid configuration
        new_cfg = NTMConfig(
            input_dim=self.cfg.input_dim,
            output_dim=self.cfg.output_dim,
            controller_dim=cfg1.controller_dim if torch.rand(1) < 0.5 else cfg2.controller_dim,
            memory_slots=cfg1.memory_slots if torch.rand(1) < 0.5 else cfg2.memory_slots,
            memory_dim=cfg1.memory_dim if torch.rand(1) < 0.5 else cfg2.memory_dim,
            heads_read=cfg1.heads_read if torch.rand(1) < 0.5 else cfg2.heads_read,
            heads_write=cfg1.heads_write if torch.rand(1) < 0.5 else cfg2.heads_write,
            init_std=0.1
        )

        # Create a new individual with the hybrid architecture
        child = NeuralTuringMachine(new_cfg).to(self.cfg.device)
        return child

    def initialize_population(self):
        """Create the initial population with diverse random architectures.

        Generates population_size individuals with random architectural
        configurations to ensure diversity in the initial gene pool.
        Each individual is initialized with different structural parameters.
        """
        self.population = []
        for _ in range(self.cfg.population_size):
            cfg = self.create_random_config()
            ntm = NeuralTuringMachine(cfg).to(self.cfg.device)
            self.population.append(ntm)

    def evolve_generation(self) -> Dict[str, float]:
        """Execute one generation of evolutionary optimization.

        Complete generational evolution cycle:
        1. Evaluate all individuals in the population
        2. Select elite individuals for survival
        3. Generate offspring through crossover and mutation
        4. Replace non-elite individuals with offspring
        5. Update statistics and the generation counter

        Uses elitist generational replacement (truncation selection with
        elite preservation) so the best solutions are never lost.

        Returns:
            Dictionary containing generation statistics
        """
        # Step 1: Evaluate population fitness
        fitness_scores = []
        for ntm in self.population:
            fitness = self.evaluator.evaluate_fitness(ntm)
            fitness_scores.append(fitness['composite'])

        # Step 2: Selection - sort by fitness (descending)
        sorted_indices = sorted(range(len(fitness_scores)), key=lambda i: fitness_scores[i], reverse=True)

        # Step 3: Elite preservation (keep at least one elite)
        elite_count = max(1, int(self.cfg.elite_ratio * self.cfg.population_size))
        elites = [self.population[i] for i in sorted_indices[:elite_count]]

        # Step 4: Generate offspring to fill the remaining population
        new_population = elites.copy()

        while len(new_population) < self.cfg.population_size:
            if torch.rand(1) < 0.3 and len(elites) >= 2:
                # Crossover: select two distinct random elite parents
                i, j = torch.randperm(len(elites))[:2].tolist()
                child = self.crossover(elites[i], elites[j])
            else:
                # Mutation: select a random elite parent
                parent_idx = torch.randint(0, elite_count, (1,)).item()
                parent = elites[parent_idx]

                if torch.rand(1) < 0.5:
                    # Parameter mutation
                    child = self.mutate_parameters(parent)
                else:
                    # Architectural mutation
                    new_cfg = self.mutate_architecture(parent.cfg)
                    child = NeuralTuringMachine(new_cfg).to(self.cfg.device)

            new_population.append(child)

        # Step 5: Update population and statistics
        self.population = new_population[:self.cfg.population_size]
        self.generation += 1

        best_fitness = max(fitness_scores)
        avg_fitness = sum(fitness_scores) / len(fitness_scores)
        self.best_fitness = max(self.best_fitness, best_fitness)

        return {
            'generation': self.generation,
            'best_fitness': best_fitness,
            'avg_fitness': avg_fitness,
            'best_ever': self.best_fitness
        }

    def run_evolution(self) -> List[Dict[str, float]]:
        """Execute a complete evolutionary optimization run.

        Runs evolution for max_generations, tracking progress and
        printing periodic updates. Returns the complete optimization
        history for analysis and visualization.

        Returns:
            List of generation statistics dictionaries
        """
        self.initialize_population()

        history = []
        for gen in range(self.cfg.max_generations):
            stats = self.evolve_generation()
            history.append(stats)

            # Periodic progress reporting
            if gen % 10 == 0:
                print(f"Gen {gen}: Best={stats['best_fitness']:.4f}, Avg={stats['avg_fitness']:.4f}")

        return history

    def get_best_model(self) -> NeuralTuringMachine:
        """Retrieve the best individual from the current population.

        Re-evaluates all current individuals and returns the one
        with the highest composite fitness score.

        Returns:
            Best NTM individual in the population
        """
        fitness_scores = []
        for ntm in self.population:
            fitness = self.evaluator.evaluate_fitness(ntm)
            fitness_scores.append(fitness['composite'])

        best_idx = max(range(len(fitness_scores)), key=lambda i: fitness_scores[i])
        return self.population[best_idx]
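
# Sanity-check sketch (our helper, illustrative only): with a per-element
# mutation probability of mutation_rate, roughly that fraction of weights
# should move, so the measured fraction should land near the configured rate.
def _demo_mutation_fraction():
    cfg = EvolutionaryTuringConfig(population_size=2, mutation_rate=0.15, seed=0)
    system = EvolutionaryTuringMachine(cfg)
    parent = NeuralTuringMachine(system.create_random_config())
    child = system.mutate_parameters(parent)
    changed = total = 0
    for p0, p1 in zip(parent.parameters(), child.parameters()):
        changed += (p0 != p1).sum().item()
        total += p0.numel()
    print(f"fraction mutated: {changed / total:.3f} (target {cfg.mutation_rate})")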

############################################################################################################################################
################################################## - - - Demo and Testing - - - ###########################################################

def test_evolutionary_turing():
    """Comprehensive test of evolutionary NTM optimization."""
    print("Testing Evolutionary Turing Machine - Adaptive Memory Architecture Evolution")
    print("=" * 90)

    # Create the evolutionary system
    config = EvolutionaryTuringConfig(
        population_size=20,   # small for the demo
        max_generations=30,
        input_dim=8,
        output_dim=8,
        mutation_rate=0.15,
        architecture_mutation_rate=0.1,
        elite_ratio=0.3,
        device='cpu'
    )

    system = EvolutionaryTuringMachine(config)

    print("Created Evolutionary Turing System:")
    print(f"  - Population size: {config.population_size}")
    print(f"  - Max generations: {config.max_generations}")
    print(f"  - Architecture mutation rate: {config.architecture_mutation_rate}")
    print(f"  - Parameter mutation rate: {config.mutation_rate}")
    print(f"  - Elite preservation: {config.elite_ratio*100:.0f}%")

    # Test individual components first
    print("\nTesting individual NTM...")
    test_config = system.create_random_config()
    test_ntm = NeuralTuringMachine(test_config).to(config.device)

    print("Random NTM architecture:")
    print(f"  - Controller: {test_config.controller_dim}D")
    print(f"  - Memory: {test_config.memory_slots} × {test_config.memory_dim}")
    print(f"  - Heads: {test_config.heads_read}R/{test_config.heads_write}W")

    # Test fitness evaluation
    fitness = system.evaluator.evaluate_fitness(test_ntm)
    print("\nFitness evaluation:")
    for task, score in fitness.items():
        print(f"  - {task.capitalize()}: {score:.3f}")

    # Test evolutionary operations
    print("\nTesting evolutionary operations...")

    # Test parameter mutation
    mutated_ntm = system.mutate_parameters(test_ntm)
    print("✓ Parameter mutation successful")

    # Test architectural mutation
    mutated_config = system.mutate_architecture(test_config)
    print("✓ Architecture mutation successful")

    # Test crossover
    parent2_config = system.create_random_config()
    parent2 = NeuralTuringMachine(parent2_config).to(config.device)
    offspring = system.crossover(test_ntm, parent2)
    print("✓ Crossover operation successful")

    # Run a short evolutionary optimization
    print("\nRunning evolutionary optimization...")
    print("(This may take a few minutes)")

    history = system.run_evolution()

    print("\nEvolution completed!")
    print(f"  - Final generation: {system.generation}")
    print(f"  - Best fitness achieved: {system.best_fitness:.4f}")

    # Analyze evolution progress
    initial_fitness = history[0]['best_fitness']
    final_fitness = history[-1]['best_fitness']
    improvement = final_fitness - initial_fitness

    print("\nEvolution analysis:")
    print(f"  - Initial best fitness: {initial_fitness:.4f}")
    print(f"  - Final best fitness: {final_fitness:.4f}")
    print(f"  - Total improvement: {improvement:.4f}")
    print(f"  - Average per-generation improvement: {improvement/len(history):.4f}")

    # Get and analyze the best individual
    best_ntm = system.get_best_model()
    best_fitness = system.evaluator.evaluate_fitness(best_ntm)

    print("\nBest evolved architecture:")
    print(f"  - Controller: {best_ntm.cfg.controller_dim}D")
    print(f"  - Memory: {best_ntm.cfg.memory_slots} × {best_ntm.cfg.memory_dim}")
    print(f"  - Heads: {best_ntm.cfg.heads_read}R/{best_ntm.cfg.heads_write}W")
    print(f"  - Parameters: {sum(p.numel() for p in best_ntm.parameters()):,}")

    print("\nBest individual performance:")
    for task, score in best_fitness.items():
        print(f"  - {task.capitalize()}: {score:.4f}")

    print("\nEvolutionary Turing Machine test completed!")
    print("✓ Population initialization and diversity")
    print("✓ Fitness evaluation on algorithmic tasks")
    print("✓ Architectural and parameter mutations")
    print("✓ Crossover and offspring generation")
    print("✓ Elite preservation and selection")
    print("✓ Multi-generational evolution and improvement")

    return True

def architecture_evolution_demo():
    """Demonstrate architectural evolution patterns."""
    print("\n" + "=" * 70)
    print("ARCHITECTURE EVOLUTION DEMONSTRATION")
    print("=" * 70)

    config = EvolutionaryTuringConfig(population_size=10, max_generations=10)
    system = EvolutionaryTuringMachine(config)

    # Generate diverse initial architectures
    architectures = []
    for _ in range(5):
        cfg = system.create_random_config()
        architectures.append(cfg)

    print("Initial architecture diversity:")
    for i, cfg in enumerate(architectures):
        # rough size proxy: controller recurrent weights + memory matrix
        params = (cfg.controller_dim * cfg.controller_dim +
                  cfg.memory_slots * cfg.memory_dim)
        print(f"  Arch {i+1}: {cfg.controller_dim}D controller, {cfg.memory_slots}×{cfg.memory_dim} memory, ~{params:,} params (proxy)")

    # Show mutation effects
    print("\nMutation examples:")
    base_cfg = architectures[0]
    for i in range(3):
        mutated = system.mutate_architecture(base_cfg)
        print(f"  Mutation {i+1}: {mutated.controller_dim}D controller, {mutated.memory_slots}×{mutated.memory_dim} memory")

    print("\nEvolution discovers optimal architectures through natural selection!")
    print("Larger controllers and memories often emerge for more complex tasks")


if __name__ == "__main__":
    test_evolutionary_turing()
    architecture_evolution_demo()