I’m trying to train the model in this blog on all 8 HPUs but I keep running into:
"RuntimeError: Device acquire failed."
I am running the script below:
from optimum.habana.distributed import DistributedRunner
from optimum.utils import logging

world_size = 8  # Number of HPUs to use (1 or 8)

# define distributed runner
distributed_runner = DistributedRunner(
    command_list=["train.py"],
    world_size=world_size,
    use_mpi=True,
    multi_hls=False,
)

# start job
ret_code = distributed_runner.run()
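As far as I understand, DistributedRunner with use_mpi=True just builds and launches an mpirun command that runs train.py once per HPU, so the job should be roughly equivalent to the line below (this is only my assumption of what the runner does under the hood; I haven't copied the exact binding/environment flags it adds):

mpirun -n 8 python train.py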
train.py:
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import habana_frameworks.torch.core as htcore
import torch

print(f"device available:{htcore.is_available()}")
print(f"device_count:{htcore.get_device_count()}")

# In[2]:

# get_ipython().system('pip install transformers datasets tensorboard matplotlib pandas sklearn')
# get_ipython().system('pip install git+https://github.com/huggingface/optimum-habana.git')
# In[3]:

model_id = "xlm-roberta-large"
gaudi_config_id = "Habana/roberta-large"  # more here: Habana (Habana AI)
dataset_id = "AmazonScience/massive"
dataset_configs = ["en-US", "de-DE", "fr-FR", "it-IT", "pt-PT", "es-ES", "nl-NL"]
seed = 33
repository_id = "habana-xlm-r-large-amazon-massive"
# In[4]:

from datasets import load_dataset, concatenate_datasets, DatasetDict

# the columns we want to keep in the dataset
keep_columns = ["utt", "scenario"]

# process individual datasets
proc_lan_dataset_list = []
for lang in dataset_configs:
    # load dataset for language
    lang_ds = load_dataset(dataset_id, lang)
    # only keep the 'utt' & 'scenario' columns
    lang_ds = lang_ds.remove_columns([col for col in lang_ds["train"].column_names if col not in keep_columns])
    # rename the columns to match transformers schema
    lang_ds = lang_ds.rename_column("utt", "text")
    lang_ds = lang_ds.rename_column("scenario", "label")
    proc_lan_dataset_list.append(lang_ds)

# concat single splits into one
train_dataset = concatenate_datasets([ds["train"] for ds in proc_lan_dataset_list])
eval_dataset = concatenate_datasets([ds["validation"] for ds in proc_lan_dataset_list])

# create dataset dict for easier processing
dataset = DatasetDict(dict(train=train_dataset, validation=eval_dataset))
print(dataset)
# In[5]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

# In[6]:

def process(examples):
    tokenized_inputs = tokenizer(
        examples["text"], padding="max_length", truncation=True
    )
    return tokenized_inputs

tokenized_datasets = dataset.map(process, batched=True)
tokenized_datasets["train"].features
# In[7]:

from datasets import load_metric
import numpy as np

# define metrics and metrics function
f1_metric = load_metric("f1")
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="micro")
    return {
        "accuracy": acc["accuracy"],
        "f1": f1["f1"],
    }
# In[8]:

from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from optimum.habana import GaudiTrainer, GaudiTrainingArguments
from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
labels = tokenized_datasets["train"].features["label"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# define training args
training_args = GaudiTrainingArguments(
    output_dir=repository_id,
    use_habana=True,
    use_lazy_mode=True,
    gaudi_config_name=gaudi_config_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# define model
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# create Trainer
trainer = GaudiTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# start training on 1x HPU
trainer.train()

# evaluate model
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
This is all on the base Ubuntu 20.04 AMI, using this Docker container:
docker run -ti --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -v $(pwd):/home/ubuntu/dev --workdir=/home/ubuntu/dev vault.habana.ai/gaudi-docker/1.4.1/ubuntu20.04/habanalabs/pytorch-installer-1.10.2:1.4.1-11
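Inside the container I then launch the job from the mounted working directory, roughly like this (launcher.py is just an illustrative name for the DistributedRunner snippet at the top of this post):

cd /home/ubuntu/dev
python launcher.py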