From 2312493473eeca85c99307ab1315a63771863cb8 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 9 Aug 2022 20:49:05 -0700
Subject: [PATCH 1/3] replace bf16 with fp32 checkpoint weights

---
 .../universal_to_fp32_checkpoint.py           | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 tools/convert_checkpoint/universal_to_fp32_checkpoint.py

diff --git a/tools/convert_checkpoint/universal_to_fp32_checkpoint.py b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
new file mode 100644
index 000000000..3520b1f2c
--- /dev/null
+++ b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+# at the moment this is very much a quick hack that replaces the half-precision weights of an existing HF transformers checkpoint with the fp32 weights from the universal checkpoint it was seeded from
+
+# 1. create a normal Meg-DS checkpoint
+#
+# 2. convert to universal
+#
+# python tools/convert_checkpoint/ds_to_universal.py --input_folder checkpoints/gpt2/global_step3 --output_folder checkpoints/gpt2/global_step3_universal
+#
+# # 3. convert to an HF checkpoint or clone an existing one
+#
+# python ../transformers-master/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py --bloom_checkpoint_path checkpoints/gpt2/global_step3 --pytorch_dump_folder_path checkpoints/gpt2/global_step3_hf --pretraining_tp 1
+#
+#
+# # needed to hack the conversion script - or come up with a json config file
+# config = BloomConfig()
+# else:
+#     config = BloomConfig.from_json_file(bloom_config_file)
+
+# config.hidden_size = 8
+# config.n_head = 2
+# config.n_layers = 4
+# config.vocab_size = 50304
+# print(config)
+#
+# 4. replace half-precision weights with fp32 weights
+# python tools/convert_checkpoint/universal_to_fp32_checkpoint.py --universal_path checkpoints/gpt2/global_step3_universal --hf_half_path checkpoints/gpt2/global_step3_hf --hf_fp32_path checkpoints/gpt2/global_step3_hf_fp32
+
+
+from argparse import ArgumentParser
+from pathlib import Path
+from pprint import pprint
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+import glob
+import os
+import re
+import shutil
+import torch
+
+parser = ArgumentParser()
+parser.add_argument("--hf_half_path", required=True, type=str, help="path to the half-precision HF checkpoint")
+parser.add_argument("--universal_path", required=True, type=str, help="path to the universal checkpoint")
+parser.add_argument("--hf_fp32_path", required=True, type=str, help="path to the output fp32 checkpoint")
+args = parser.parse_args()
+
+hf_half_path = args.hf_half_path
+universal_path = args.universal_path
+hf_fp32_path = args.hf_fp32_path
+
+# adapted from the conversion script
+def layer_name_mapping(key):
+    """ map Megatron-DeepSpeed weights to transformers """
+    # Handle first and last layers
+    layer_rename_map = {
+        "tied_modules.embed.word_embeddings.weight": "word_embeddings.weight",
+        "tied_modules.embed.word_embeddings.norm.weight": "word_embeddings_layernorm.weight",
+        "tied_modules.embed.word_embeddings.norm.bias": "word_embeddings_layernorm.bias",
+        "tied_modules.embed.position_embeddings.weight": "word_embeddings_layernorm.bias",
+        "weight": "ln_f.weight",
+        "bias": "ln_f.bias",
+    }
+
+    if key in layer_rename_map:
+        return layer_rename_map[key]
+
+    layer_rename_map2 = {
+        "weight": "ln_f.weight",
+        "bias": "ln_f.bias",
+    }
+
+    segments = re.split(r"\.", key)
+    if len(segments) == 2:
+        return layer_rename_map2[segments[1]]
+
+    # Handle transformer blocks
+    try:
+        layer_number, *rest = re.split(r"\.", key)
+        layer_number = str(int(layer_number) - 3)
+        return ".".join(["h", layer_number] + rest)
+    except:
+        return key
+
+# universal checkpoint name remap
+ds_layer_names = sorted(os.listdir(f"{universal_path}/zero"))
+#ds_layer_names = glob.glob(f"{universal_path}/zero/*", root_dir=f"{universal_path}/zero")
+pprint(ds_layer_names)
+
+key_map = {layer_name_mapping(key):key for key in ds_layer_names}
+pprint(key_map)
+
+# copy non-weight files
+Path(hf_fp32_path).mkdir(parents=True, exist_ok=True)
+hf_files = [x for x in os.listdir(hf_half_path) if not x.endswith("bin") and os.path.isfile(f"{hf_half_path}/{x}")]
+pprint(hf_files)
+for f in hf_files:
+    shutil.copy2(f"{hf_half_path}/{f}", f"{hf_fp32_path}/{f}")
+
+# replace half precision with fp32 weights
+hf_checkpoint_files = glob.glob(f"{hf_half_path}/*bin")
+pprint(hf_checkpoint_files)
+for f in hf_checkpoint_files:
+    sd = torch.load(f, map_location="cpu")
+    for k in sd.keys():
+        fp32_path = f"{universal_path}/zero/{key_map[k]}/fp32.pt"
+        print(k, fp32_path)
+        new_value = torch.load(fp32_path, map_location="cpu")
+        sd[k] = new_value
+    f = f.replace(hf_half_path, hf_fp32_path)
+    torch.save(sd, f)
+
+
+
+# tokenizer = AutoTokenizer.from_pretrained(mname)
+# tokenizer.save_pretrained(hf_fp32_path)
+
+config = AutoConfig.from_pretrained(hf_half_path)
+# update the config to mark the weights as fp32
+config.update(dict(
+    torch_dtype="float32"
+))
+config.save_pretrained(hf_fp32_path)
+
+
+# tiny_model = AutoForConditionalGeneration(config)
+# print(f"num of params {tiny_model.num_parameters()}")
+
+# # Test
+# model_inputs = tokenizer("Making tiny model", return_tensors="pt")
+# gen_tokens = tiny_model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
+# print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
+# #
+
+# # Save
+# mname_tiny = "tiny-m2m_100"
+# tiny_model.half() # makes it smaller
+# tiny_model.save_pretrained(mname_tiny)
+# tokenizer.save_pretrained(mname_tiny)
+
+# print(f"Generated {mname_tiny}")
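A note on the layer-number arithmetic in `layer_name_mapping` above: Megatron-DeepSpeed assigns global indices to all pipeline modules, and in this layout the first transformer block lands at index 3, presumably because the earlier slots belong to the embedding-side tied modules, hence the `- 3` offset before building the transformers-style `h.<n>.*` name. A minimal standalone sketch of that transform; the helper name and sample keys are illustrative, not taken from a real checkpoint:

    import re

    def block_key_to_hf(key, offset=3):
        # "3.input_layernorm.weight" -> "h.0.input_layernorm.weight"
        layer_number, *rest = re.split(r"\.", key)
        return ".".join(["h", str(int(layer_number) - offset)] + rest)

    print(block_key_to_hf("3.input_layernorm.weight"))     # h.0.input_layernorm.weight
    print(block_key_to_hf("4.self_attention.dense.bias"))  # h.1.self_attention.dense.bias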
From 3dcc8057f9d624073882eb2a5812bf4127715a5c Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 9 Aug 2022 21:35:49 -0700
Subject: [PATCH 2/3] cleanup

---
 .../universal_to_fp32_checkpoint.py           | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/tools/convert_checkpoint/universal_to_fp32_checkpoint.py b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
index 3520b1f2c..eec0214a4 100644
--- a/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
+++ b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
@@ -122,19 +122,4 @@ def layer_name_mapping(key):
 config.save_pretrained(hf_fp32_path)
 
 
-# tiny_model = AutoForConditionalGeneration(config)
-# print(f"num of params {tiny_model.num_parameters()}")
-
-# # Test
-# model_inputs = tokenizer("Making tiny model", return_tensors="pt")
-# gen_tokens = tiny_model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
-# print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
-# #
-
-# # Save
-# mname_tiny = "tiny-m2m_100"
-# tiny_model.half() # makes it smaller
-# tiny_model.save_pretrained(mname_tiny)
-# tokenizer.save_pretrained(mname_tiny)
-
-# print(f"Generated {mname_tiny}")
+print("Done")
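Once the script has run, a quick sanity check is to confirm that every tensor in the rewritten `*.bin` shards really is fp32 now. A minimal sketch, assuming the output directory from the example invocation in the header comments:

    import glob
    import torch

    for f in glob.glob("checkpoints/gpt2/global_step3_hf_fp32/*bin"):
        sd = torch.load(f, map_location="cpu")
        for k, v in sd.items():
            assert v.dtype == torch.float32, f"{k} is {v.dtype}"

The config saved alongside the shards should also now carry `"torch_dtype": "float32"`, per the `config.update(...)` call at the end of the script.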
From 4d3de0c1b1b02d7c5d79e902616b492a3e544636 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 11 Aug 2022 11:09:56 -0700
Subject: [PATCH 3/3] fix

---
 .../universal_to_fp32_checkpoint.py           | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/convert_checkpoint/universal_to_fp32_checkpoint.py b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
index eec0214a4..ecbfd25a7 100644
--- a/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
+++ b/tools/convert_checkpoint/universal_to_fp32_checkpoint.py
@@ -30,7 +30,7 @@
 
 from argparse import ArgumentParser
 from pathlib import Path
-from pprint import pprint
+import pprint
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import glob
 import os
@@ -56,11 +56,12 @@ def layer_name_mapping(key):
         "tied_modules.embed.word_embeddings.weight": "word_embeddings.weight",
         "tied_modules.embed.word_embeddings.norm.weight": "word_embeddings_layernorm.weight",
         "tied_modules.embed.word_embeddings.norm.bias": "word_embeddings_layernorm.bias",
-        "tied_modules.embed.position_embeddings.weight": "word_embeddings_layernorm.bias",
         "weight": "ln_f.weight",
         "bias": "ln_f.bias",
     }
 
+    # we ignore "tied_modules.embed.position_embeddings.weight" as it's deterministic
+
     if key in layer_rename_map:
         return layer_rename_map[key]
 
@@ -83,27 +84,26 @@ def layer_name_mapping(key):
 
 # universal checkpoint name remap
 ds_layer_names = sorted(os.listdir(f"{universal_path}/zero"))
-#ds_layer_names = glob.glob(f"{universal_path}/zero/*", root_dir=f"{universal_path}/zero")
-pprint(ds_layer_names)
+#pprint.pprint(ds_layer_names)
 
 key_map = {layer_name_mapping(key):key for key in ds_layer_names}
-pprint(key_map)
+print("remap", pprint.pformat(key_map))
 
 # copy non-weight files
 Path(hf_fp32_path).mkdir(parents=True, exist_ok=True)
 hf_files = [x for x in os.listdir(hf_half_path) if not x.endswith("bin") and os.path.isfile(f"{hf_half_path}/{x}")]
-pprint(hf_files)
+print("HF Checkpoint non-bin files", pprint.pformat(hf_files))
 for f in hf_files:
     shutil.copy2(f"{hf_half_path}/{f}", f"{hf_fp32_path}/{f}")
 
 # replace half precision with fp32 weights
 hf_checkpoint_files = glob.glob(f"{hf_half_path}/*bin")
-pprint(hf_checkpoint_files)
+print("HF Checkpoint bin files", pprint.pformat(hf_checkpoint_files))
 for f in hf_checkpoint_files:
     sd = torch.load(f, map_location="cpu")
     for k in sd.keys():
         fp32_path = f"{universal_path}/zero/{key_map[k]}/fp32.pt"
-        print(k, fp32_path)
+        print(f"{k} from {fp32_path}")
         new_value = torch.load(fp32_path, map_location="cpu")
         sd[k] = new_value
     f = f.replace(hf_half_path, hf_fp32_path)
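For reference, the script relies on the directory layout produced by `ds_to_universal.py`: one subdirectory per parameter under `zero/`, each containing an `fp32.pt` tensor, which is exactly what the `{universal_path}/zero/{key_map[k]}/fp32.pt` lookup reads. A last consistency check worth running is that each fp32 replacement, cast back down to the original dtype, reproduces the half-precision weight it replaced; this should hold if the half-precision checkpoint was derived from the same fp32 master weights by plain downcasting (a sketch under that assumption, with paths from the example invocation):

    import glob
    import torch

    half_dir = "checkpoints/gpt2/global_step3_hf"
    fp32_dir = "checkpoints/gpt2/global_step3_hf_fp32"

    for f in glob.glob(f"{half_dir}/*bin"):
        half_sd = torch.load(f, map_location="cpu")
        fp32_sd = torch.load(f.replace(half_dir, fp32_dir), map_location="cpu")
        for k, v in half_sd.items():
            # downcast the fp32 tensor and compare with the original half-precision one
            assert torch.equal(fp32_sd[k].to(v.dtype), v), k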