
Commit c98f700

Add OpenAI MGD support
1 parent eaa4bd5

File tree: 10 files changed (+452 −75 lines)

README.md

Lines changed: 11 additions & 1 deletion
```diff
@@ -162,7 +162,17 @@ pytest tests/multilspy
 
 ## 5. Monitor-Guided Decoding
 
-A monitor under the Monitor-Guided Decoding framework, is instantiated using `multilspy` as the LSP client, and as a logits-processor to guide the LM decoding. [src/monitors4codegen/monitor_guided_decoding/monitor.py](src/monitors4codegen/monitor_guided_decoding/monitor.py) provides the class `MGDLogitsProcessor` which can be used with any HuggingFace Language Model, as a `LogitsProcessor` to guide the LM using MGD. [src/monitors4codegen/monitor_guided_decoding/dereferences_monitor.py](src/monitors4codegen/monitor_guided_decoding/dereferences_monitor.py) provides the instantiation for dereferences monitor. Unit tests for the dereferences monitor are present in [tests/monitor_guided_decoding/test_dereferences_monitor_java.py](tests/monitor_guided_decoding/test_dereferences_monitor_java.py), which also provide usage examples for the dereferences monitor.
+A monitor under the Monitor-Guided Decoding framework is instantiated using `multilspy` as the LSP client and provides `maskgen` to guide LM decoding. The monitor interface is defined by the class `Monitor` in [src/monitors4codegen/monitor_guided_decoding/monitor.py](src/monitors4codegen/monitor_guided_decoding/monitor.py). The interface is implemented by various monitors supporting different properties, such as valid identifier dereferences, valid numbers of arguments, and valid typestate method calls.
+
+### MGD with HuggingFace models
+[src/monitors4codegen/monitor_guided_decoding/hf_gen.py](src/monitors4codegen/monitor_guided_decoding/hf_gen.py) provides the class `MGDLogitsProcessor`, which can be used with any HuggingFace language model as a [`LogitsProcessor`](https://huggingface.co/docs/transformers/internal/generation_utils#logitsprocessor) to guide the LM using MGD. Example uses with the [SantaCoder](https://huggingface.co/bigcode/santacoder) model are available in [tests/monitor_guided_decoding/test_dereferences_monitor_java.py](tests/monitor_guided_decoding/test_dereferences_monitor_java.py).
+
+### MGD with OpenAI models
+[src/monitors4codegen/monitor_guided_decoding/openai_gen.py](src/monitors4codegen/monitor_guided_decoding/openai_gen.py) provides the method `openai_mgd`, which takes a prompt and a `Monitor` as input and returns the MGD-guided generation from an OpenAI model.
+
+### Monitors
+#### Dereferences Monitor
+[src/monitors4codegen/monitor_guided_decoding/dereferences_monitor.py](src/monitors4codegen/monitor_guided_decoding/dereferences_monitor.py) provides the instantiation of the `Monitor` class for the dereferences monitor. It can be used to guide LMs to generate valid identifier dereferences. Unit tests for the dereferences monitor are present in [tests/monitor_guided_decoding/test_dereferences_monitor_java.py](tests/monitor_guided_decoding/test_dereferences_monitor_java.py), which also provides usage examples for the dereferences monitor.
 
 ## Contributing
 
```

pyproject.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@ dependencies = [
     "jedi-language-server==0.41.1",
     "pydantic==1.10.5",
     "code-tokenize==0.2.0",
+    "openai==1.3.3",
     "torch==1.12.0",
     "transformers==4.30.0",
     "tiktoken==0.3.3",
```

requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -11,6 +11,7 @@ pytest==7.3.1
 pydantic==1.10.5
 pytest-asyncio==0.21.1
 pygtrie==2.5.0
+openai==1.3.3
 code-tokenize==0.2.0
 --extra-index-url https://download.pytorch.org/whl/cu113
 torch==1.12.0+cu113
```
src/monitors4codegen/monitor_guided_decoding/hf_gen.py (new file)

Lines changed: 64 additions & 0 deletions

```python
"""
Provides the definition of a monitor as per the Monitor-Guided Decoding framework
"""

import asyncio
import torch

from asyncio.events import AbstractEventLoop
from typing import List, Union
from transformers import LogitsProcessor
from monitors4codegen.monitor_guided_decoding.monitor import Monitor

class MGDLogitsProcessor(LogitsProcessor):
    """
    Provides the logits processor for monitor guided decoding
    """

    loop: AbstractEventLoop

    def __init__(self, monitors: List[Monitor], loop: Union[None, AbstractEventLoop] = None) -> None:
        super().__init__()

        if loop is None:
            self.loop = asyncio.get_event_loop()
        else:
            self.loop = loop

        self.monitors: List[Monitor] = monitors

    async def process_scores_for_single_input_id(
        self, segment_idx: int, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """
        Asynchronously processes the scores for a single input id using the MGD framework
        """
        blacklisted_ids: List[int] = await self.monitors[segment_idx].maskgen(input_ids.tolist())
        output_scores: torch.FloatTensor = torch.where(
            torch.tensor([True if i in blacklisted_ids else False for i in range(scores.shape[0])]).to(scores.device),
            float("-inf") * torch.ones(scores.shape[0]).to(scores.device),
            scores,
        ).to(scores)
        return output_scores

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """
        This method is called by the HuggingFace decoder, for every token generation with
        the input_ids (seen so far including prompt) and scores (for the next token).
        This method processes the scores using the MGD framework.
        """
        assert len(input_ids.shape) == 2
        assert input_ids.shape[0] == len(self.monitors)
        assert len(scores.shape) == 2

        async def f(input_ids_arg: torch.LongTensor, scores_arg: torch.FloatTensor):
            new_score_coroutines = [
                self.process_scores_for_single_input_id(i, input_ids_arg[i], scores_arg[i])
                for i in range(input_ids_arg.shape[0])
            ]
            new_scores = await asyncio.gather(*new_score_coroutines)
            return tuple(new_scores)

        future = asyncio.run_coroutine_threadsafe(f(input_ids, scores), self.loop)
        results = future.result()
        new_scores = torch.stack(results, dim=0).to(scores)
        return new_scores
```
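For orientation, a minimal usage sketch of `MGDLogitsProcessor` with HuggingFace `generate` (not part of the commit; the tests in tests/monitor_guided_decoding/test_dereferences_monitor_java.py are the authoritative examples). The names `monitor`, `lsp_loop`, and `prompt` are assumptions: an already-constructed `Monitor`, the event loop the multilspy language server runs on in a background thread, and the code prefix to complete.

```python
# Hypothetical usage sketch, not part of the commit. Assumes:
#   - `monitor` is an already-constructed Monitor (e.g. a dereferences monitor),
#   - `lsp_loop` is the asyncio loop the multilspy server runs on (in a background
#     thread), so run_coroutine_threadsafe inside MGDLogitsProcessor can make
#     progress while generate() blocks,
#   - `prompt` is the code prefix to complete.
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList

from monitors4codegen.monitor_guided_decoding.hf_gen import MGDLogitsProcessor

model = AutoModelForCausalLM.from_pretrained("bigcode/santacoder", trust_remote_code=True)
hf_tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder")

mgd_processor = MGDLogitsProcessor([monitor], loop=lsp_loop)  # one monitor per batch element

input_ids = hf_tokenizer(prompt, return_tensors="pt").input_ids
output = model.generate(
    input_ids,
    do_sample=False,
    max_new_tokens=30,
    logits_processor=LogitsProcessorList([mgd_processor]),  # MGD masks blacklisted tokens
)
print(hf_tokenizer.decode(output[0][input_ids.shape[1]:]))
```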

src/monitors4codegen/monitor_guided_decoding/monitor.py

Lines changed: 1 addition & 62 deletions
```diff
@@ -2,19 +2,13 @@
 Provides the definition of a monitor as per the Monitor-Guided Decoding framework
 """
 
-import asyncio
-import torch
-
-from asyncio.events import AbstractEventLoop
-from typing import List, Tuple, Union
-from transformers import LogitsProcessor
+from typing import List, Tuple
 from monitors4codegen.monitor_guided_decoding.tokenizer_wrapper import TokenizerWrapper
 from monitors4codegen.multilspy import LanguageServer
 from monitors4codegen.multilspy.multilspy_config import Language
 from dataclasses import dataclass
 from monitors4codegen.multilspy.multilspy_utils import TextUtils
 
-
 @dataclass
 class MonitorFileBuffer:
     """
@@ -83,58 +77,3 @@ def update(self, generated_token: str):
         This function updates the state of the monitor, given the generated token.
         """
         raise NotImplementedError()
-
-
-class MGDLogitsProcessor(LogitsProcessor):
-    """
-    Provides the logits processor for monitor guided decoding
-    """
-
-    loop: AbstractEventLoop
-
-    def __init__(self, monitors: List[Monitor], loop: Union[None, AbstractEventLoop] = None) -> None:
-        super().__init__()
-
-        if loop is None:
-            self.loop = asyncio.get_event_loop()
-        else:
-            self.loop = loop
-
-        self.monitors: List[Monitor] = monitors
-
-    async def process_scores_for_single_input_id(
-        self, segment_idx: int, input_ids: torch.LongTensor, scores: torch.FloatTensor
-    ) -> torch.FloatTensor:
-        """
-        Asynchronously processes the scores for a single input id using the MGD framework
-        """
-        blacklisted_ids: List[int] = await self.monitors[segment_idx].maskgen(input_ids.tolist())
-        output_scores: torch.FloatTensor = torch.where(
-            torch.tensor([True if i in blacklisted_ids else False for i in range(scores.shape[0])]).to(scores.device),
-            float("-inf") * torch.ones(scores.shape[0]).to(scores.device),
-            scores,
-        ).to(scores)
-        return output_scores
-
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        """
-        This method is called by the HuggingFace decoder, for every token generation with
-        the input_ids (seen so far including prompt) and scores (for the next token).
-        This method processes the scores using the MGD framework.
-        """
-        assert len(input_ids.shape) == 2
-        assert input_ids.shape[0] == len(self.monitors)
-        assert len(scores.shape) == 2
-
-        async def f(input_ids_arg: torch.LongTensor, scores_arg: torch.FloatTensor):
-            new_score_coroutines = [
-                self.process_scores_for_single_input_id(i, input_ids_arg[i], scores_arg[i])
-                for i in range(input_ids_arg.shape[0])
-            ]
-            new_scores = await asyncio.gather(*new_score_coroutines)
-            return tuple(new_scores)
-
-        future = asyncio.run_coroutine_threadsafe(f(input_ids, scores), self.loop)
-        results = future.result()
-        new_scores = torch.stack(results, dim=0).to(scores)
-        return new_scores
```
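The `Monitor` interface that remains in this file leaves `maskgen` and `update` abstract. Below is a hypothetical, do-nothing sketch of a subclass, with signatures inferred from the `update` stub above and from how `maskgen` is awaited in hf_gen.py; dereferences_monitor.py is the real reference implementation.

```python
# Hypothetical shape of a Monitor implementation (signatures inferred from the
# update() stub above and from how hf_gen.py awaits maskgen); see
# dereferences_monitor.py for the real instantiation.
from typing import List

from monitors4codegen.monitor_guided_decoding.monitor import Monitor

class NoOpMonitor(Monitor):
    async def maskgen(self, input_ids: List[int]) -> List[int]:
        # Token ids to blacklist at the next decoding step; an empty list
        # leaves the LM distribution untouched.
        return []

    def update(self, generated_token: str):
        # Advance monitor state with the token the decoder actually emitted.
        pass
```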
src/monitors4codegen/monitor_guided_decoding/openai_gen.py (new file)

Lines changed: 167 additions & 0 deletions

```python
"""
This module provides the functions and classes for running Monitor-Guided Decoding over OpenAI models
"""

from enum import Enum
import time
from typing import List, Set
import torch
import asyncio

from openai import OpenAI
from monitors4codegen.monitor_guided_decoding.monitor import Monitor
from monitors4codegen.monitor_guided_decoding.tokenizer_wrapper import TikTokenWrapper

class OpenAI_Models(Enum):
    TD3 = 'text-davinci-003'

def openai_mgd(
    client: OpenAI,
    model: OpenAI_Models,
    tokenizer: TikTokenWrapper,
    prompt_tokenized: torch.Tensor,
    temp: float,
    top_p: float,
    monitor: Monitor,
    num_new_tokens: int
):
    """
    This function generates completions with OpenAI models using the Monitor-Guided Decoding scheme.
    """
    prompt_tokenized: torch.Tensor = torch.tensor(prompt_tokenized, dtype=torch.int64)
    assert len(prompt_tokenized.shape) == 1

    all_tokens: torch.Tensor = prompt_tokenized
    gen_text: bytes = b''

    gen_tokens: List[int] = []

    tokens_sort_key = {k: [0, 0] for k in tokenizer.all_token_ids}

    # # TODO: Find a way to prioritize tokens to be blacklisted
    # # 1. The following code uses info about whether the token has a break char in it
    # for token, token_id in tokenizer.vocab_trie.iteritems():
    #     if token[0] in monitor.all_break_chars:
    #         tokens_sort_key[token_id][0] = 0  # ".", ", a"
    #     elif any([c in monitor.all_break_chars for c in token]):
    #         tokens_sort_key[token_id][0] = 1  # "abc, "
    #     else:
    #         tokens_sort_key[token_id][0] = 2

    # # 2. The following code uses frequency of the token in repo as a heuristic
    # for freq_token, freq in metadata_batch[seq_idx]['token_freq']:
    #     tokens_sort_key[freq_token][1] = freq

    # # 3. Use a local-small and very fast language model to score the tokens

    # # 4. Use the prompt to score the tokens

    all_text_bytes: bytes = tokenizer.tokenizer.decode_bytes(all_tokens.tolist())
    prompt_num_tokens: int = all_tokens.shape[0]

    priority_blacklist: List[int] = []

    while all_tokens.shape[0] < prompt_num_tokens + num_new_tokens:
        num_toks_to_gen = (prompt_num_tokens + num_new_tokens) - all_tokens.shape[0]

        blacklisted_ids: List[int] = asyncio.run_coroutine_threadsafe(
            monitor.maskgen(all_tokens.tolist()), monitor.monitor_file_buffer.lsp.server.loop
        ).result()
        white_listed_ids: Set[int] = set(tokenizer.all_token_ids) - set(blacklisted_ids + [50256])

        logit_bias = {50256: -100}

        for token_id in priority_blacklist:
            logit_bias[token_id] = -100

        if len(white_listed_ids) <= (300 - len(logit_bias)):
            for white_token_id in white_listed_ids:
                logit_bias[white_token_id] = 100
        else:
            for candidate_token in sorted(blacklisted_ids, key=lambda x: tokens_sort_key[x], reverse=True):
                if len(logit_bias) >= 300:
                    break
                if candidate_token in blacklisted_ids:
                    logit_bias[candidate_token] = -100

        exponential_backoff_wait = 1
        while True:
            try:
                prompt_arg: str = all_text_bytes.decode('utf-8', errors='strict')
            except UnicodeDecodeError:
                prompt_arg: List[int] = all_tokens.tolist()

            try:
                response = client.completions.create(
                    model=model.value,
                    prompt=[prompt_arg],
                    temperature=temp,
                    max_tokens=num_toks_to_gen if len(logit_bias) <= 1 else 1,
                    top_p=top_p,
                    stop=['.'],
                    logit_bias=logit_bias,
                    logprobs=5
                )
                break
            except Exception:
                time.sleep(exponential_backoff_wait)
                if exponential_backoff_wait < 64:
                    exponential_backoff_wait = exponential_backoff_wait * 2
                else:
                    exponential_backoff_wait = 1

        assert len(response.choices) == 1

        def convert_bytesrep_to_bytes(x: str) -> bytes:
            if x.startswith('bytes:'):
                return bytes.fromhex(x.replace('bytes:', '').replace('\\x', ''))
            else:
                return x.encode()

        tokens_gen_bytes_ = list(map(convert_bytesrep_to_bytes, response.choices[0].logprobs.tokens))
        tokens_gen_bytes = []
        dot_found = False
        for token_bytes in tokens_gen_bytes_:
            gen_text += token_bytes
            all_text_bytes += token_bytes
            tokens_gen_bytes.append(token_bytes)
            if b'.' in token_bytes:
                dot_found = True
                break

        should_manually_add_dot = None
        if response.choices[0].finish_reason == 'stop':
            if dot_found:
                should_manually_add_dot = False
            else:
                should_manually_add_dot = True
        elif response.choices[0].finish_reason == 'length':
            should_manually_add_dot = False
        else:
            raise Exception("Unknown finish reason", response.choices[0].finish_reason)

        tokens_gen = list(map(lambda x: tokenizer.tokenizer.encode_single_token(x), tokens_gen_bytes))

        assert should_manually_add_dot is not None
        if should_manually_add_dot:
            gen_text += b'.'
            all_text_bytes += b'.'
            tokens_gen.append(tokenizer.tokenizer.encode_single_token('.'))

        if len(logit_bias) > 1:
            assert len(tokens_gen) == 1, (print(response), response, launch_debug(locals()))
            if tokens_gen[0] in blacklisted_ids:
                priority_blacklist.append(tokens_gen[0])
                continue
            priority_blacklist = []

        new_all_tokens = torch.cat([
            all_tokens,
            torch.tensor(tokens_gen)
        ]).to(all_tokens)

        assert len(new_all_tokens.shape) == 1
        assert new_all_tokens.shape[0] > all_tokens.shape[0], (new_all_tokens.shape, all_tokens.shape, launch_debug(locals()))
        assert torch.equal(new_all_tokens[:all_tokens.shape[0]], all_tokens)
        gen_tokens += new_all_tokens[all_tokens.shape[0]:].tolist()
        all_tokens = new_all_tokens

    return gen_tokens, gen_text.decode()
```
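Since `openai_mgd` cannot mask logits directly, it approximates MGD through `logit_bias`, which the code caps at 300 entries: it whitelists the allowed tokens when that set is small enough and otherwise blacklists the highest-priority offending tokens, regenerating one token at a time. A hedged usage sketch follows (not from the commit); `monitor` and `prompt` are assumed to be an already-constructed `Monitor` and the code prefix, and `OpenAI()` reads `OPENAI_API_KEY` from the environment.

```python
# Hypothetical usage sketch, not part of the commit. Assumes `monitor` is an
# already-constructed Monitor and `prompt` is the code prefix to complete.
import tiktoken
import torch
from openai import OpenAI

from monitors4codegen.monitor_guided_decoding.openai_gen import OpenAI_Models, openai_mgd
from monitors4codegen.monitor_guided_decoding.tokenizer_wrapper import TikTokenWrapper

client = OpenAI()  # reads OPENAI_API_KEY from the environment
encoding = tiktoken.encoding_for_model("text-davinci-003")
tokenizer = TikTokenWrapper(encoding)  # per tokenizer_wrapper.py, takes a tiktoken Encoding

prompt_tokenized = torch.tensor(encoding.encode(prompt), dtype=torch.int64)
gen_tokens, gen_text = openai_mgd(
    client=client,
    model=OpenAI_Models.TD3,
    tokenizer=tokenizer,
    prompt_tokenized=prompt_tokenized,
    temp=0.0,
    top_p=0.95,
    monitor=monitor,
    num_new_tokens=30,
)
print(gen_text)
```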

src/monitors4codegen/monitor_guided_decoding/tokenizer_wrapper.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -6,7 +6,7 @@
 import torch
 import tiktoken
 
-from typing import List, Union
+from typing import List, Set, Union
 from pygtrie import CharTrie
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
@@ -97,11 +97,11 @@ def __init__(self, tokenizer: tiktoken.core.Encoding):
             self.vocab_trie[decoded_token] = v
             self.all_token_ids.add(v)
 
-    def decode(self, token_ids: torch.Tensor, *args, **kwargs) -> str:
+    def decode(self, token_ids: Union[List[int], torch.Tensor], *args, **kwargs) -> str:
         """
         Decodes the given token ids to a string
         """
-        token_ids, clean_up_tokenization_spaces, skip_special_tokens = None, None, None
+        clean_up_tokenization_spaces, skip_special_tokens = None, None
         if len(args) == 0:
             pass
         elif len(args) == 1:
@@ -116,10 +116,10 @@ def decode(self, token_ids: torch.Tensor, *args, **kwargs) -> str:
 
         assert not clean_up_tokenization_spaces
         assert skip_special_tokens
-        assert isinstance(token_ids, torch.Tensor)
-        token_ids = token_ids.tolist()
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.tolist()
 
-        token_ids = [i for i in token_ids if i not in self.all_special_ids]
+        token_ids: List[int] = [i for i in token_ids if i not in self.all_special_ids]
 
         return self.tokenizer.decode(token_ids)
```

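For illustration, an assumption-heavy sketch (not from the commit) of the relaxed `decode`, which now accepts either a plain list or a tensor of token ids; it assumes the elided argument handling in `decode()` reads `skip_special_tokens` from kwargs, which the asserts above require to be truthy.

```python
# Hypothetical sketch of the relaxed TikTokenWrapper.decode; assumes the elided
# kwarg handling in decode() populates skip_special_tokens from kwargs.
import tiktoken
import torch

from monitors4codegen.monitor_guided_decoding.tokenizer_wrapper import TikTokenWrapper

wrapper = TikTokenWrapper(tiktoken.encoding_for_model("text-davinci-003"))
ids = wrapper.tokenizer.encode("def foo():")

as_list = wrapper.decode(ids, skip_special_tokens=True)                   # list input, now accepted
as_tensor = wrapper.decode(torch.tensor(ids), skip_special_tokens=True)   # tensor input, as before
assert as_list == as_tensor
```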