Position-aware, cross-lingually aligned word embeddings built on FastText.
- Cross-Lingual Alignment: Procrustes alignment for multilingual compatibility
- Position-Aware Embeddings: Optional positional encoding (RoPE, sinusoidal, decay)
- FastText Foundation: Handles OOV words through subword information
```bash
pip install babelvec
```

For visualization support:

```bash
pip install babelvec[viz]
```

```python
from babelvec import BabelVec
# Load a model
model = BabelVec.load('path/to/model.bin')
# Get word vector
vec = model.get_word_vector("hello")
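# Thanks to FastText subword information, out-of-vocabulary tokens also get vectors
# (the misspelled token below is only an illustration)
oov_vec = model.get_word_vector("helllo")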
# Position-aware sentence embedding
vec1 = model.get_sentence_vector("The dog bites the man", method='rope')
vec2 = model.get_sentence_vector("The man bites the dog", method='rope')
# vec1 != vec2 because word order is encoded
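# A quick way to check (a sketch; it assumes the returned vectors are numpy arrays)
import numpy as np
similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print(f"cosine(vec1, vec2) = {similarity:.3f}")  # below 1.0 because word order changed the encoding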
# Simple averaging (no position encoding)
vec = model.get_sentence_vector("Hello world", method='average')from babelvec.training import train_monolingual
model = train_monolingual(
lang='en',
corpus_path='corpus.txt',
dim=300,
epochs=5,
threads=8 # Optional: specify number of threads
)
model.save('en_300d.bin')
```

Train multiple languages simultaneously for faster training on multi-core servers:

```python
from babelvec.training import train_multiple_languages, get_cpu_count
# Auto-detects CPU cores
print(f"Using {get_cpu_count()} cores")
models = train_multiple_languages(
languages={'en': 'en_corpus.txt', 'ar': 'ar_corpus.txt'},
parallel=True, # Train languages simultaneously
max_workers=2, # Number of parallel training jobs
)
```

Train multiple languages and align them into a shared space in one call:

```python
from babelvec.training import train_multilingual
models = train_multilingual(
languages=['en', 'ar'],
corpus_paths={'en': 'en.txt', 'ar': 'ar.txt'},
parallel_data={('en', 'ar'): parallel_pairs},
alignment='procrustes',
threads=8 # Optional: specify number of threads
)
```

Align already-trained models with parallel data:

```python
from babelvec.training import align_models
aligned = align_models(
models={'en': model_en, 'ar': model_ar},
parallel_data={('en', 'ar'): parallel_pairs},
method='procrustes'
)
```
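Once aligned, vectors from different languages share one space, so translation pairs can be compared directly. A minimal sketch, assuming the returned `aligned` dict maps language codes to models and that word vectors are numpy arrays (the word pair is only an illustration):

```python
import numpy as np

en_vec = aligned['en'].get_word_vector("dog")
ar_vec = aligned['ar'].get_word_vector("كلب")

# Cosine similarity; aligned translation pairs should score well above unrelated words
sim = float(np.dot(en_vec, ar_vec) / (np.linalg.norm(en_vec) * np.linalg.norm(ar_vec)))
print(f"en-ar similarity: {sim:.3f}")
```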
Models save projection matrices alongside the FastText binary:

```python
# Save model
model.save('model.bin')
# Creates: model.bin, model.projection.npy (if aligned), model.meta.json
# Load model - projection is automatically restored
model = BabelVec.load('model.bin')
print(model.is_aligned)  # True if projection was loaded
```

Position encoding methods available for `get_sentence_vector`:

| Method | Description |
|---|---|
| `rope` | Rotary Position Embedding |
| `decay` | Exponential position decay |
| `sinusoidal` | Transformer-style positional encoding |
| `average` | Simple averaging (no position encoding) |
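As a rough way to see how the methods behave, one can compare two word-order variants of a sentence under each setting. This is only a sketch: it reuses the `model` loaded earlier and assumes sentence vectors come back as numpy arrays.

```python
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

s1 = "The dog bites the man"
s2 = "The man bites the dog"
for method in ("average", "decay", "sinusoidal", "rope"):
    v1 = model.get_sentence_vector(s1, method=method)
    v2 = model.get_sentence_vector(s2, method=method)
    # 'average' ignores order, so it should score ~1.0; position-aware methods score lower
    print(f"{method:10s} cosine = {cosine(v1, v2):.3f}")
```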
Evaluate cross-lingual retrieval quality on held-out parallel sentences:

```python
from babelvec.evaluation import cross_lingual_retrieval
metrics = cross_lingual_retrieval(
model_src=model_en,
model_tgt=model_ar,
parallel_sentences=test_pairs,
method='rope'
)
print(f"Recall@1: {metrics['recall@1']:.3f}")BabelVec includes a curated family assignment system for 355 Wikipedia languages, optimized for joint multilingual training.
from babelvec.families import get_family_key, get_family_languages, get_training_groups
# Get family for a language
get_family_key("ary") # -> "arabic"
get_family_key("fr") # -> "romance_galloitalic"
# Get all languages in a family
get_family_languages("arabic") # -> ["ar", "ary", "arz"]
# Create training groups (hybrid strategy)
groups = get_training_groups(
languages=["en", "ar", "ary", "arz"],
article_counts={"en": 6000000, "ar": 840000, "ary": 17000, "arz": 40000},
low_resource_threshold=50000
)
# -> {"separate": ["en", "ar"], "joint": {"arabic": ["ary", "arz"]}}Joint training dramatically improves low-resource languages (+200-600% for Arabic dialects) while high-resource languages should be trained separately.
See the `examples/` directory:

- `01_basic_usage.py` - Getting started
If you use BabelVec, please cite:

```bibtex
@misc{babelvec2025,
title = {BabelVec: Position-Aware Cross-Lingual Word Embeddings},
author = {Kamali, Omar},
doi = {10.5281/zenodo.18065206},
publisher = {Zenodo},
year = {2025},
url = {https://github.com/omarkamali/babelvec}
}
```

MIT License - see LICENSE for details.
Copyright © 2025 Omar Kamali