Word representation using the continuous bag-of-words (CBOW) model

May 5, 2026 · word-embeddings, cbow, nlp

Problem: Continuous Bag of Words (CBOW) model

import torch
import torch.nn.functional as F

# Toy training corpus, lower-cased so that "The" and "the" become the same token.
corpus = "The quick brown fox jumps over the lazy dog".lower()
# Whitespace-split tokens, kept in corpus order (duplicates included).
corpus_list = corpus.split()


def data_preparation(window_size=2, tokens=None):
    """Build the (context, target) training pairs used by CBOW.

    Every token that has ``window_size`` neighbours on both sides becomes a
    target; its context is the ``window_size`` tokens before it followed by
    the ``window_size`` tokens after it. Tokens too close to either end of
    the sequence are skipped, so a sequence shorter than
    ``2 * window_size + 1`` yields no pairs.

    Args:
        window_size: Number of context tokens taken from each side of the
            target. Must be at least 1.
        tokens: List of tokens to slide the window over. Defaults to the
            module-level ``corpus_list``.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a list
        of ``2 * window_size`` tokens and ``target`` is a single token.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if tokens is None:
        tokens = corpus_list

    context_target_pairs = []

    # Start and stop window_size away from the edges so both slices are full.
    for index in range(window_size, len(tokens) - window_size):
        context = (
            tokens[index - window_size : index]
            + tokens[index + 1 : index + window_size + 1]
        )
        target = tokens[index]
        print(f"Context: {context} --> Target: {target}")
        context_target_pairs.append((context, target))
    return context_target_pairs


def create_vocabulary(tokens=None):
    """Build the word <-> index lookup tables for the vocabulary.

    The unique tokens are sorted before indices are assigned. Iterating a
    plain ``set`` of strings can yield a different order on every interpreter
    run, which would silently change which index belongs to which word;
    sorting makes the mapping reproducible.

    Args:
        tokens: Iterable of tokens to build the vocabulary from. Defaults to
            the module-level ``corpus_list``.

    Returns:
        A ``(word_to_index, index_to_word)`` tuple of dictionaries that are
        inverses of each other.
    """
    if tokens is None:
        tokens = corpus_list

    vocab = sorted(set(tokens))
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    index_to_word = {idx: word for word, idx in word_to_index.items()}

    return word_to_index, index_to_word


def one_hot_encode(word, word_to_index):
    """Return the one-hot vector for ``word``.

    The result is a 1-D tensor with one entry per vocabulary word: all
    zeros except for a 1.0 at the position ``word_to_index[word]``.

    Raises:
        KeyError: If ``word`` is not a key of ``word_to_index``.
    """
    vocab_size = len(word_to_index)
    position = word_to_index[word]

    encoding = torch.zeros(vocab_size)
    encoding[position] = 1.0
    return encoding


class CBOW:
    """Minimal continuous bag-of-words model built from two weight matrices.

    ``W1`` has shape (vocab_size, embedding_dim) and maps the averaged
    context vector to a hidden vector; ``W2`` has shape
    (embedding_dim, vocab_size) and maps that hidden vector to one logit per
    vocabulary word. Both are randomly initialised and track gradients.
    """

    def __init__(self, vocab_size, embedding_dim):
        input_shape = (vocab_size, embedding_dim)
        output_shape = (embedding_dim, vocab_size)
        self.W1 = torch.randn(*input_shape, requires_grad=True)
        self.W2 = torch.randn(*output_shape, requires_grad=True)

    def forward(self, context_words):
        """Return the logits for a stack of context vectors.

        ``context_words`` is averaged over its first dimension, then
        multiplied by ``W1`` and by ``W2`` in turn.
        """
        averaged_context = torch.mean(context_words, dim=0)
        hidden = torch.matmul(averaged_context, self.W1)
        return torch.matmul(hidden, self.W2)


def train(model, context_target_pairs, word_to_index, epochs=100, lr=0.01):
    """Train ``model`` with one gradient-descent step per (context, target) pair.

    Args:
        model: Object exposing ``forward(context_tensor)`` that returns logits
            and gradient-tracking weight tensors ``W1`` and ``W2``.
        context_target_pairs: Iterable of ``(context, target)`` tuples, where
            ``context`` is a sequence of words and ``target`` is a word.
        word_to_index: Mapping from each word to its vocabulary index.
        epochs: Number of passes over all pairs.
        lr: Step size applied to the gradients.

    Returns:
        None. ``model.W1`` and ``model.W2`` are updated in place, and the
        summed loss is printed after every 10th epoch.
    """
    # The encoded inputs and targets are the same in every epoch, so build
    # them once up front instead of re-encoding each pair on every pass.
    examples = [
        (
            torch.stack([one_hot_encode(w, word_to_index) for w in context]),
            # cross_entropy expects targets of shape (batch,), hence the
            # one-element tensor.
            torch.tensor([word_to_index[target]]),
        )
        for context, target in context_target_pairs
    ]

    for epoch in range(epochs):
        total_loss = 0.0

        for context_words, target_idx in examples:
            logits = model.forward(context_words)

            # cross_entropy expects logits of shape (batch, vocab), so add a
            # batch dimension of size 1.
            loss = F.cross_entropy(logits.unsqueeze(0), target_idx)

            loss.backward()

            # Update the weights outside autograd, then clear the gradients
            # so they do not accumulate into the next step.
            with torch.no_grad():
                model.W1 -= lr * model.W1.grad
                model.W2 -= lr * model.W2.grad

                model.W1.grad.zero_()
                model.W2.grad.zero_()

            total_loss += loss.item()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1:3d} ===== Loss: {total_loss:.4f}")


def predict(model, context_words, word_to_index, index_to_word):
    """Predict the target word for a list of context words.

    Each context word is one-hot encoded, the stacked encodings are passed
    through ``model.forward``, and a softmax turns the logits into
    probabilities. No gradients are recorded.

    Returns:
        A ``(word, probabilities)`` tuple: the word whose index has the
        highest probability, and the probabilities as a plain list.
    """
    encoded = [one_hot_encode(word, word_to_index) for word in context_words]

    with torch.no_grad():
        logits = model.forward(torch.stack(encoded))
        probs = logits.softmax(dim=0)
        best_index = probs.argmax().item()

    return index_to_word[best_index], probs.tolist()


def main():
    """Run the demo: build pairs and vocabulary, train CBOW, print a prediction."""
    pairs = data_preparation()
    print(f"\nContext Target pairs: {pairs}")

    word_to_index, index_to_word = create_vocabulary()
    print(f"\nWord to Index Dictionary: {word_to_index}")

    # Show what the one-hot encoding looks like for two sample words.
    sample_words = ["the", "quick"]
    context_one_hot = [one_hot_encode(word, word_to_index) for word in sample_words]
    print(f"One-Hot Encoding: {context_one_hot}")

    model = CBOW(vocab_size=len(word_to_index), embedding_dim=5)
    train(model, pairs, word_to_index)

    print("-------------- Prediction ----------------")
    predicted_word, probabilities = predict(
        model, ["the", "quick", "fox", "jumps"], word_to_index, index_to_word
    )

    print(f"Target: {predicted_word} \nProbability List: {probabilities}")


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

CBOW predicts a target word from its surrounding context words. Example: for the sentence "the cat sits on the mat" with window_size = 2 (two words to the left, two words to the right), the context ["the", "cat", "on", "the"] is used to guess the word between "cat" and "on", i.e. "sits".

CBOW uses the average of embeddings of context words to predict the target word using a probability distribution.

Steps:

Data Preparation
Generate Context Target Pairs
One-Hot Encoding
Embedding Layer
Context Aggregation
Prediction
Loss Calculation and Optimization
Repeat for all Pairs.

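Embedding Layer: an embedding matrix W of size V×d, where V is the vocabulary size and d is the embedding dimension. Multiplying a word's one-hot vector by W selects that word's row of W, which is its embedding.

Context Aggregation: the embeddings of all context words are averaged to compute the context vector.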

Example of CBOW: Input: "I love machine learning".

target_word: machine
context_words: ["I", "love", "learning"]

One Hot Encoding:

Vocabulary = ["i", "love", "machine", "learning", "ai"]
One-hot vectors:
- "I" -> [1, 0, 0, 0, 0]
- "love" -> [0, 1, 0, 0, 0]
- "learning" -> [0, 0, 0, 1, 0]

Embedding Layer:

embedding_dim(d) = 3
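embedding_matrix (W), of size 5×3, with one row per vocabulary word in the order i, love, machine, learning, ai:

$$
W = \begin{bmatrix}
0.1 & 0.2 & 0.3 \\
0.4 & 0.5 & 0.6 \\
0.7 & 0.8 & 0.9 \\
0.2 & 0.3 & 0.4 \\
0.5 & 0.6 & 0.7
\end{bmatrix}
$$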

Embeddings:

"i" -> [0.1, 0.2, 0.3]
"love"-> [0.4, 0.5, 0.6]
"learning" -> [0.2, 0.3, 0.4]

Aggregation:

Average the embeddings:
- h = ([0.1, 0.2, 0.3] + [0.4, 0.5, 0.6] + [0.2, 0.3, 0.4]) / 3 = [0.233, 0.333, 0.433]

Output Layer:

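Compute the logits from the context vector h, apply softmax to turn them into a probability distribution over the vocabulary, and predict the word with the highest probability as the target.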