Tips & notes from Karpathy's neural networks lectures¶


I've been following Andrej Karpathy's Neural Networks: Zero to Hero YouTube lecture series. Throughout the course, he shares insights, including common pitfalls and practical tips.

I'm noting these points in the hope that they'll be useful to me in the future.

1. Broadcast operations¶

  • broadcasting

Two tensors are “broadcastable” if the following rules hold:

  • Each tensor has at least one dimension.
  • When iterating over the dimension sizes, starting at the trailing dimension, the dimension sizes must either be equal, one of them is 1, or one of them does not exist.
In [1]:
import torch

torch.set_printoptions(precision=4, sci_mode=False)

Sample #1¶

In [2]:
a = torch.tensor([[1.1, 2.1, 3.1, 4.1]])
b = torch.tensor([5.6])
c = a + b
In [3]:
a.shape, b.shape, c.shape
Out[3]:
(torch.Size([1, 4]), torch.Size([1]), torch.Size([1, 4]))
In [4]:
c
Out[4]:
tensor([[6.7000, 7.7000, 8.7000, 9.7000]])
  • a shape - 1, 4
  • b shape - 1
  • Here, b is broadcast along all the columns.

Sample #2¶

In [12]:
x=torch.ones((2,2,4,1))
y=torch.ones(2,1,1)

# 2, 2, 4, 1
#    2, 1, 1

# x and y are broadcastable.
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
# 3rd trailing dimension: x size == y size
# 4th trailing dimension: y dimension doesn't exist
(x + y).shape
Out[12]:
torch.Size([2, 2, 4, 1])
In [13]:
x
Out[13]:
tensor([[[[1.],
          [1.],
          [1.],
          [1.]],

         [[1.],
          [1.],
          [1.],
          [1.]]],


        [[[1.],
          [1.],
          [1.],
          [1.]],

         [[1.],
          [1.],
          [1.],
          [1.]]]])
In [14]:
y
Out[14]:
tensor([[[1.]],

        [[1.]]])
In [15]:
x + y
Out[15]:
tensor([[[[2.],
          [2.],
          [2.],
          [2.]],

         [[2.],
          [2.],
          [2.],
          [2.]]],


        [[[2.],
          [2.],
          [2.],
          [2.]],

         [[2.],
          [2.],
          [2.],
          [2.]]]])

Sample #3¶

In [17]:
x=torch.ones((5,2,4,1))
y=torch.ones(3,1,1)

# 5, 2, 4, 1
#    3, 1, 1

# x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3
# Error: 
# RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1
# x + y
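
If you only want to check whether two shapes are broadcastable, without allocating any tensors, torch.broadcast_shapes performs the same rule check (a quick sketch reusing the shapes from Samples #2 and #3; available in recent PyTorch versions):

In [ ]:
# Compatible shapes (Sample #2): returns the broadcast result shape
torch.broadcast_shapes((2, 2, 4, 1), (2, 1, 1))   # torch.Size([2, 2, 4, 1])

# Incompatible shapes (Sample #3): raises a RuntimeError
# torch.broadcast_shapes((5, 2, 4, 1), (3, 1, 1))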

2. Multinomial¶

  • Returns a tensor where each row contains num_samples indices sampled from the multinomial probability distribution located in the corresponding row of tensor input.

  • torch.multinomial

Sample #1¶

In [18]:
# to make result deterministic
g = torch.Generator().manual_seed(2147483647)

# Returns a tensor filled with random numbers from a uniform 
# distribution on the interval [0,1]
p = torch.rand(3, generator=g)
p = p / p.sum()
print(p)
tensor([0.6064, 0.3033, 0.0903])
In [19]:
# There are 3 classes: 0, 1, 2
# the result will be 100 samples of these categories
l = torch.multinomial(p, num_samples=100, replacement=True, generator=g)
print(l)
tensor([1, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0,
        0, 1, 1, 1])
In [20]:
from collections import Counter
Counter(l.numpy())
Out[20]:
Counter({0: 61, 1: 33, 2: 6})

Sample #2¶

In [21]:
# Unnormalized weights for 4 classes: 0, 1, 2, 3
# (they don't need to sum to 1; class 0 has zero weight, so it is never sampled)
weights = torch.tensor([0, 10, 3, 1], dtype=torch.float)

# Sample 2 values using the probability distribution "weights"
torch.multinomial(weights, 2)
Out[21]:
tensor([2, 1])

Sample #3¶

In [24]:
# This will fail
# "RuntimeError: cannot sample n_sample > prob_dist.size(-1) samples without replacement"
# 
# torch.multinomial(weights, 5)
  • By default replacement=False, and we're asking to draw 5 samples from only 4 categories.

Sample #4¶

With replacement=True, the same class can be picked repeatedly to satisfy num_samples, so this works.

In [25]:
torch.multinomial(weights, 100, replacement=True)
Out[25]:
tensor([1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1, 1, 1, 2, 1, 2, 2,
        1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 2, 2, 3, 1, 1, 1, 1, 3, 3, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 2, 3, 2, 1, 1, 1,
        2, 1, 1, 1])
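
All the samples above use a 1-D input. As the documentation quoted at the top of this section says, a 2-D input is handled row by row: each row is its own distribution and contributes its own row of num_samples indices. A minimal sketch (the probabilities here are made up for illustration):

In [ ]:
probs_2d = torch.tensor([[0.9, 0.1, 0.0],
                         [0.0, 0.5, 0.5]])

# Output shape is (2, 4): row 0 is dominated by class 0,
# row 1 mixes classes 1 and 2 (class 0 never appears there)
torch.multinomial(probs_2d, num_samples=4, replacement=True)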

3. Sum¶

In [26]:
g = torch.Generator().manual_seed(2147483647)

x = torch.rand(3, 4, generator=g)
x
Out[26]:
tensor([[0.7081, 0.3542, 0.1054, 0.5996],
        [0.0904, 0.0899, 0.8822, 0.9887],
        [0.0080, 0.2908, 0.7408, 0.4012]])
In [27]:
# Sum across the columns (dim=1), producing one value per row
s = x.sum(dim=1)
s
Out[27]:
tensor([1.7674, 2.0513, 1.4409])
In [28]:
0.7081 + 0.3542 + 0.1054 + 0.5996
Out[28]:
1.7673
In [29]:
s.shape
Out[29]:
torch.Size([3])

If keepdim is True, the output tensor is of the same size as input except in the dimension(s) dim where it is of size 1. Otherwise, dim is squeezed, resulting in the output tensor having 1 (or len(dim)) fewer dimension(s).

In [30]:
s = x.sum(dim=1, keepdim=True)
s
Out[30]:
tensor([[1.7674],
        [2.0513],
        [1.4409]])
In [31]:
s.shape
Out[31]:
torch.Size([3, 1])
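
The dim argument can also be a tuple of dimensions, which is where the "len(dim) fewer dimension(s)" part of the quote comes in. A quick sketch using the same x (shape 3 x 4):

In [ ]:
x.sum(dim=0)                       # column sums, shape: torch.Size([4])
x.sum(dim=(0, 1))                  # sum over both dims, shape: torch.Size([])
x.sum(dim=(0, 1), keepdim=True)    # shape: torch.Size([1, 1])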

4. Broadcasting scenario¶

  • Got this scenario from The spelled-out intro to language modeling: building makemore.
  • Although broadcasting may appear to work, the result might not be what you expect.

Below is one such scenario, where we try to calculate a probability distribution.

Unexpected result¶

In [47]:
g = torch.Generator().manual_seed(2147483647)

logits = torch.rand(3, 3, generator=g)
logits
Out[47]:
tensor([[0.7081, 0.3542, 0.1054],
        [0.5996, 0.0904, 0.0899],
        [0.8822, 0.9887, 0.0080]])
In [52]:
s = logits.sum(dim=1)

print(s)
print(s.shape)
tensor([1.1678, 0.7800, 1.8790])
torch.Size([3])
In [55]:
probs = logits / s
probs
Out[55]:
tensor([[0.6064, 0.4542, 0.0561],
        [0.5135, 0.1160, 0.0478],
        [0.7555, 1.2677, 0.0043]])
In [62]:
probs[0].sum() == 1, probs[1].sum() == 1, probs[2].sum() == 1
Out[62]:
(tensor(False), tensor(False), tensor(False))
  • logits's shape is 3 x 3; s's shape is 3.

  • logits / s is possible due to the broadcasting rules:

    • 1st trailing dimension: both have size 3.
    • 2nd trailing dimension: s's dimension doesn't exist.
    • logits - 3, 3
    • s      -    3
    • s is treated as shape (1, 3) and replicated along the rows to match the shape of logits, so element [i, j] is divided by s[j] (the sum of row j) instead of by the sum of its own row i.
  • As a result, the probabilities are calculated incorrectly: the rows of probs don't sum to 1.

Expected result¶

  • Here, keepdim=True is used.
In [58]:
s = logits.sum(dim=1, keepdim=True)

print(s)
print(s.shape)
tensor([[1.1678],
        [0.7800],
        [1.8790]])
torch.Size([3, 1])
In [63]:
# 1st trailing dimension: s has size 1
# 2nd trailing dimension: both have size 3
# So this broadcasting operation is possible.
#
# During broadcasting, s (3, 1) is replicated along the columns
# to match the shape of logits (3, 3):
#
# logits - 3, 3
# s      - 3, 1
probs = logits / s
probs
Out[63]:
tensor([[0.6064, 0.3033, 0.0903],
        [0.7688, 0.1160, 0.1152],
        [0.4695, 0.5262, 0.0043]])
In [64]:
probs[0].sum() == 1, probs[1].sum() == 1, probs[2].sum() == 1
Out[64]:
(tensor(True), tensor(True), tensor(True))
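
An equivalent fix, if you have already reduced without keepdim, is to reinstate the missing dimension explicitly with unsqueeze (a small sketch; it produces the same probs as the cell above):

In [ ]:
s = logits.sum(dim=1)            # shape (3,)
probs = logits / s.unsqueeze(1)  # s.unsqueeze(1) has shape (3, 1), as above
probs.sum(dim=1)                 # each row sums to 1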

5. Concatenate¶

In [79]:
g = torch.Generator().manual_seed(2147483647)

a = torch.randn(1, 5, 4, generator=g)
b = torch.randn(1, 5, 4, generator=g)
c = torch.randn(1, 5, 4, generator=g)

# `-1`: Concatenate along the last dimension
r = torch.cat([a, b, c], dim=-1)  # (1, 5, 12)
r.shape
Out[79]:
torch.Size([1, 5, 12])
In [80]:
a
Out[80]:
tensor([[[ 1.5674, -0.2373, -0.0274, -1.1008],
         [ 0.9849, -0.1484, -1.4795,  0.4483],
         [-2.1921, -0.7814, -0.2808, -0.7389],
         [-1.2199,  0.3031, -1.0725,  0.7276],
         [ 2.2497, -0.4755,  0.6205,  1.1500]]])
In [82]:
b
Out[82]:
tensor([[[-1.8068,  1.2523, -1.2256,  1.2165],
         [-0.5030, -1.0660,  0.8480,  2.0275],
         [-0.1158, -1.2078, -0.7441, -0.5903],
         [-0.5132,  0.2961, -1.4904, -0.2838],
         [ 0.2569,  0.2130,  1.5514, -1.3410]]])
In [83]:
c
Out[83]:
tensor([[[ 0.2472, -0.3777, -1.9081, -0.3717],
         [ 0.0948, -1.1645,  1.8010,  0.4707],
         [-0.8746, -0.2977, -1.3707,  0.1150],
         [-0.1801,  1.3034, -1.1887,  0.8047],
         [-1.7149, -0.3379, -1.8263, -0.8390]]])
In [81]:
r
Out[81]:
tensor([[[ 1.5674, -0.2373, -0.0274, -1.1008, -1.8068,  1.2523, -1.2256,
           1.2165,  0.2472, -0.3777, -1.9081, -0.3717],
         [ 0.9849, -0.1484, -1.4795,  0.4483, -0.5030, -1.0660,  0.8480,
           2.0275,  0.0948, -1.1645,  1.8010,  0.4707],
         [-2.1921, -0.7814, -0.2808, -0.7389, -0.1158, -1.2078, -0.7441,
          -0.5903, -0.8746, -0.2977, -1.3707,  0.1150],
         [-1.2199,  0.3031, -1.0725,  0.7276, -0.5132,  0.2961, -1.4904,
          -0.2838, -0.1801,  1.3034, -1.1887,  0.8047],
         [ 2.2497, -0.4755,  0.6205,  1.1500,  0.2569,  0.2130,  1.5514,
          -1.3410, -1.7149, -0.3379, -1.8263, -0.8390]]])
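
For comparison, concatenating the same (1, 5, 4) tensors along the other dimensions only grows that particular dimension (a quick sketch):

In [ ]:
torch.cat([a, b, c], dim=0).shape   # torch.Size([3, 5, 4])
torch.cat([a, b, c], dim=1).shape   # torch.Size([1, 15, 4])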

6. View¶

In PyTorch, the view operation is extremely efficient. The reason is that every tensor has an underlying storage, which is just the numbers as a one-dimensional vector; that is how the tensor is represented in computer memory - always as a one-dimensional vector.

When we call view, we are only manipulating attributes of the tensor that dictate how this one-dimensional sequence is interpreted as an n-dimensional tensor. No memory is changed, copied, moved, or created: the storage stays identical, but some internal attributes of the tensor - in particular the storage offset, strides, and shape - are changed so that the same sequence of bytes is seen as a different n-dimensional array.

PyTorch internals - http://blog.ezyang.com/2019/05/pytorch-internals/
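
A small sketch (not from the lecture) that makes the storage/stride point concrete: a view shares the same memory and only reinterprets it.

In [ ]:
t = torch.arange(6)              # storage: 0, 1, 2, 3, 4, 5
v = t.view(2, 3)

t.data_ptr() == v.data_ptr()     # True: both point at the same memory
t.stride(), v.stride()           # (1,) vs (3, 1): only the interpretation differs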

In [84]:
m = torch.tensor([
    [1., 2, 3],
    [4, 5, 6]
])

m
Out[84]:
tensor([[1., 2., 3.],
        [4., 5., 6.]])
In [85]:
# View as 3 rows & 2 cols
m.view(3, 2)
Out[85]:
tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

When -1 is used for one of the dimensions, PyTorch infers that dimension's size from the other dimensions and the total number of elements.

In [87]:
# I want 2 columns; infer the number of rows
m.view(-1, 2)
Out[87]:
tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])
In [88]:
m.view(6, -1)
Out[88]:
tensor([[1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.]])
In [89]:
m = torch.randn(2, 5, 3)
m
Out[89]:
tensor([[[-0.7486, -1.3454, -1.1200],
         [-0.8051, -0.8451, -0.7295],
         [-0.6197, -0.1222,  0.7914],
         [ 0.4528, -2.6055,  0.3844],
         [-1.0877, -0.1612,  0.8568]],

        [[-0.3672,  0.3350,  2.7597],
         [-0.7933, -1.4860,  0.9841],
         [ 0.2437,  0.3617,  1.3867],
         [-0.0953,  0.0696, -1.4806],
         [-1.5924,  0.5686, -2.8422]]])
In [90]:
m.view(2, 3, 5)
Out[90]:
tensor([[[-0.7486, -1.3454, -1.1200, -0.8051, -0.8451],
         [-0.7295, -0.6197, -0.1222,  0.7914,  0.4528],
         [-2.6055,  0.3844, -1.0877, -0.1612,  0.8568]],

        [[-0.3672,  0.3350,  2.7597, -0.7933, -1.4860],
         [ 0.9841,  0.2437,  0.3617,  1.3867, -0.0953],
         [ 0.0696, -1.4806, -1.5924,  0.5686, -2.8422]]])
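
One consequence worth knowing (not covered in the cells above, so treat it as a side note): because view only rearranges strides, it can fail on tensors whose memory layout cannot be reinterpreted, for example after a transpose. reshape falls back to copying when it has to.

In [ ]:
a = torch.randn(3, 4)
b = a.t()                        # transpose: same storage, non-contiguous strides

# b.view(12)                     # RuntimeError: view size is not compatible with
#                                # input tensor's size and stride
b.contiguous().view(12).shape    # torch.Size([12])
b.reshape(12).shape              # torch.Size([12]) - copies if needed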

7. Running sum¶

In [91]:
torch.manual_seed(1337)
B, T, C = 2, 8, 2  # Batch, Time, Channels
x = torch.randn(B, T, C)
x
Out[91]:
tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]]])
In [92]:
x[0, :1]
Out[92]:
tensor([[ 0.1808, -0.0700]])
In [93]:
x[0, :2]
Out[93]:
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152]])
In [94]:
x[0, :3]
Out[94]:
tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255]])

Using Python¶

In [95]:
xbow = torch.zeros((B, T, C))

for b in range(B):  # loop over the batch dimension
    for t in range(T):  # loop over the time steps
        xprev = x[b, :t+1]  # all entries up to and including time step t
        xsum = torch.sum(xprev, dim=0)  # running sum
        xbow[b, t] = xsum

xbow
Out[95]:
tensor([[[ 0.1808, -0.0700],
         [-0.1789, -0.9852],
         [ 0.4469, -0.9597],
         [ 1.4014, -0.8953],
         [ 1.7626,  0.2725],
         [ 0.4127, -0.2376],
         [ 0.6486, -0.4774],
         [-0.2725,  1.0659]],

        [[ 1.3488, -0.1396],
         [ 1.6346,  0.8255],
         [-0.4025,  1.3186],
         [ 1.0845,  1.9096],
         [ 1.2105,  0.3470],
         [ 0.0504,  0.0121],
         [ 0.4982, -0.7895],
         [ 2.0218,  1.7191]]])

Using matrix multiplication¶

In [96]:
wei = torch.tril(torch.ones(T, T))
wei
Out[96]:
tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
In [97]:
xbow2 = wei @ x
torch.allclose(xbow, xbow2)
Out[97]:
True
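
A closely related trick (a sketch, not one of the cells above): normalizing each row of wei turns the same matrix multiplication into a running mean instead of a running sum.

In [ ]:
wei_mean = wei / wei.sum(dim=1, keepdim=True)   # each row now sums to 1
xbow_mean = wei_mean @ x                        # (T, T) @ (B, T, C) -> (B, T, C)

# e.g. position t holds the mean of entries 0..t
torch.allclose(xbow_mean[0, 2], x[0, :3].mean(dim=0))   # True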

8. Batched matrix multiplication¶

  • For higher-dimensional tensors, @ performs batched matrix multiplication.
  • The behavior depends on the specific shapes, but it generally applies matrix multiplication to the last two dimensions while broadcasting over any leading dimensions.
In [98]:
x = torch.randn(4, 80) @ torch.randn(80, 200)
x.shape
Out[98]:
torch.Size([4, 200])
In [99]:
x = torch.randn(5, 4, 80) @ torch.randn(80, 200)
x.shape
Out[99]:
torch.Size([5, 4, 200])
In [100]:
x = torch.randn(5, 2, 4, 80) @ torch.randn(80, 200)
x.shape
Out[100]:
torch.Size([5, 2, 4, 200])
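
The examples above only batch the left operand. Both operands can carry batch dimensions; the last two dimensions are matrix-multiplied and the leading dimensions are broadcast against each other (a quick sketch):

In [ ]:
(torch.randn(5, 4, 80) @ torch.randn(5, 80, 200)).shape      # torch.Size([5, 4, 200])
(torch.randn(5, 1, 4, 80) @ torch.randn(2, 80, 200)).shape   # torch.Size([5, 2, 4, 200])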

9. Indexing tensors¶

Assume this is a classification scenario:

  • No. of examples - 100
  • No. of classes - 27
In [101]:
no_examples = 100
no_classes = 27

Below are the probabilities for each class in the last layer.

In [102]:
p = torch.rand(no_examples, no_classes)
p = p / p.sum(dim=1, keepdim=True)
p.shape
Out[102]:
torch.Size([100, 27])
In [103]:
# True labels: 100 random ints between 0 (inclusive) and 27 (exclusive)
ys = torch.randint(0, 27, (no_examples,))
ys
Out[103]:
tensor([ 3,  6, 10,  3, 20, 16,  7, 22,  8, 10, 15,  7,  6, 21, 13,  6, 16, 26,
        22, 20, 24,  8, 13, 14,  4, 23, 13,  7, 26, 22, 17, 17,  8, 17, 13, 15,
         9, 15, 12, 23, 11, 23, 10, 22, 11, 10, 23, 15, 21, 10, 22,  2, 16,  3,
        11, 23,  5, 14, 19, 17, 13, 12,  7, 10, 13, 18, 10,  0,  3, 20, 13,  0,
        23,  4,  5, 18, 19,  1, 24, 15, 12,  5, 25, 26, 12, 10, 25,  4, 16, 15,
         6,  8, 14,  7, 17, 10,  1,  0, 14, 10])
In [104]:
torch.arange(no_examples)
Out[104]:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
        90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

Computing the loss:

In [105]:
loss = 0

for i in range(no_examples):
    loss += (p[i, ys[i]]).log()
    
loss = -loss / no_examples    
print(f"Loss: {loss}")
Loss: 3.4523065090179443

Implementing the same with vectorized indexing in PyTorch:

In [106]:
-p[torch.arange(no_examples), ys].log().mean()
Out[106]:
tensor(3.4523)
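
The same quantity is also available as a built-in (a sketch): F.nll_loss expects log-probabilities and class indices, which is exactly what we have here.

In [ ]:
import torch.nn.functional as F

F.nll_loss(p.log(), ys)   # same value as above: tensor(3.4523)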

10. Cross entropy loss - PyTorch¶

From Building makemore Part 2: MLP

Calculate cross entropy loss manually¶

In [107]:
# Calculating probabilities manually
logits = torch.tensor([-2, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[107]:
tensor([    0.0009,     0.0003,     0.0067,     0.9921])
  1. When you use F.cross_entropy(), PyTorch does not actually create all of these intermediate tensors, which would each be a new tensor in memory and fairly inefficient to evaluate one by one. Instead, PyTorch clusters these operations together and very often has fused kernels that evaluate the clustered mathematical expressions very efficiently (a small usage sketch follows this list).

  2. The backward pass can be made much more efficient - not only because of fused kernels, but also because analytically and mathematically the backward pass is often much simpler to implement.

  3. Things can be much more numerically well-behaved.
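
A minimal usage sketch (the logits/target here are a made-up single example, not from the lecture): F.cross_entropy takes the raw logits and the class index directly and matches the manual softmax + negative log computation.

In [ ]:
import torch.nn.functional as F

logits = torch.tensor([[-2., -3., 0., 5.]])   # one example, 4 classes
target = torch.tensor([3])                    # index of the true class

manual = -(logits.exp() / logits.exp().sum(dim=1, keepdim=True))[0, 3].log()
built_in = F.cross_entropy(logits, target)
manual, built_in                              # both are approximately 0.008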

Problems with logits having bigger values¶

In [108]:
# Sample
logits = torch.tensor([-2, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[108]:
tensor([    0.0009,     0.0003,     0.0067,     0.9921])
  • When logits take on these values, probs is calculated as above.
In [109]:
# Sample
logits = torch.tensor([-100, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[109]:
tensor([    0.0000,     0.0003,     0.0067,     0.9930])
  • Here, logits contains a large negative value, but probs is still well behaved.
In [110]:
# Sample
logits = torch.tensor([-100, -3, 0, 100])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[110]:
tensor([0., 0., 0., nan])
In [111]:
torch.tensor([-100]).exp(), torch.tensor([100]).exp()
Out[111]:
(tensor([    0.0000]), tensor([inf]))
  • For very negative values, exp() returns values close to 0.
  • For very positive values, exp() overflows and returns inf.
In [112]:
counts
Out[112]:
tensor([    0.0000,     0.0498,     1.0000,        inf])

The way PyTorch solves this¶

It turns out that, because of the normalization, you can offset the logits by any arbitrary constant and still get exactly the same probabilities.

What PyTorch does internally is calculate the maximum value that occurs in the logits and subtract it from them; the result is then always well-behaved.

In [113]:
# Sample
logits = torch.tensor([-2, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[113]:
tensor([    0.0009,     0.0003,     0.0067,     0.9921])
In [114]:
# Sample
logits = torch.tensor([-2, -3, 0, 5]) + 1
counts = logits.exp() 
probs = counts / counts.sum()
probs
Out[114]:
tensor([    0.0009,     0.0003,     0.0067,     0.9921])
  • Here, I've added 1 to all the logits, but the probabilities remain the same.
In [115]:
# Sample
logits = torch.tensor([-2, -3, 0, 5]) - 10
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[115]:
tensor([    0.0009,     0.0003,     0.0067,     0.9921])
  • Here, I've subtracted 10 from all the logits, but the probabilities remain the same.
In [116]:
# Sample
logits = torch.tensor([-2, -3, 0, 100])
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[116]:
tensor([0., 0., 0., nan])
In [117]:
# Sample
# Identify the maximum value in the logits & subtract it from all the logits
logits = torch.tensor([-2, -3, 0, 100]) - 100
counts = logits.exp()
probs = counts / counts.sum()
probs
Out[117]:
tensor([    0.0000,     0.0000,     0.0000,     1.0000])
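
For reference, the built-in softmax already applies this max-subtraction internally, so it stays well behaved even for the logits that produced nan above (a quick check of the forward values only):

In [ ]:
logits = torch.tensor([-2., -3., 0., 100.])
torch.softmax(logits, dim=0)   # essentially tensor([0., 0., 0., 1.]) - no inf/nan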