---
datasets:
- nuprl/EditPackFT-Multi
tags:
- code
---

# What is this

This is a DeepSeek Coder 7B model trained to predict the commit message for a given diff.

# Languages trained on:

```py
LANGS = [
    "Python",
    "Rust",
    "JavaScript",
    "Java",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "TypeScript",
    "C",
    "Scala",
    "Swift",
    "Kotlin",
    "Objective-C",
    "Perl",
    "Haskell",
    "Bash",
    "Sh",
    "Lua",
    "R",
    "Julia",
]
```

# How to prompt:

```python
import difflib


class NDiff:
    def __init__(self, s1, s2):
        self.s1 = s1
        self.s2 = s2
        # Materialize the diff into a list: difflib.ndiff returns a generator,
        # which would otherwise be exhausted after the first iteration.
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        # Drop the "?" hint lines that ndiff interleaves with the diff.
        return "\n".join([l for l in self.diff if l[0] != "?"])

    def str_colored(self):
        import colored  # optional dependency, only needed for colored output

        buf = ""
        for l in self.diff:
            if l[0] == "?":
                continue
            if l[0] == "-":
                buf += colored.stylize(l, colored.fg("red"))
            elif l[0] == "+":
                buf += colored.stylize(l, colored.fg("green"))
            else:
                buf += l
            buf += "\n"
        return buf

    def num_removed(self):
        return len([l for l in self.diff if l[0] == "-"])

    def num_added(self):
        return len([l for l in self.diff if l[0] == "+"])

    def __repr__(self):
        return self.__str__()


def format_prompt(old, new):
    diff_header = "<diff>"
    instr_header = "<commit_message>"
    diff = str(NDiff(old, new))
    return f"{diff_header}\n{diff}\n{instr_header}\n"


def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    prompt = format_prompt(old, new)
    toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outs = model.generate(toks, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p)
    # Strip the prompt tokens so only the generated commit message remains.
    return [tokenizer.decode(out[len(toks[0]):], skip_special_tokens=True) for out in outs]
```

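For reference, `format_prompt` produces a prompt shaped like this (illustrative, for a tiny one-line edit):

```
<diff>
- x = 1
+ x = 2
<commit_message>
```
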
Use the `gen` function with the old and new versions of the code; it returns a list of sampled commit messages.
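
The snippet above assumes that `model` and `tokenizer` are already in scope. A minimal loading sketch with `transformers` (the checkpoint id below is a placeholder; substitute this repository's model id):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint id; substitute this repository's model id.
CHECKPOINT = "your-org/commit-message-model"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    torch_dtype=torch.bfloat16,  # assumes a bf16-capable GPU
    device_map="auto",
)
```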
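
As a quick sanity check (hypothetical input; the sampled output will vary between runs):

```python
old = "def add(a, b):\n    return a + b\n"
new = "def add(a: int, b: int) -> int:\n    return a + b\n"

print(gen(old, new)[0])  # e.g. "Add type annotations to add"
```
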
# Example:

```diff
- import datasets
- from pathlib import Path
from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
from code_editing.humanevalpack import batch_prompts_from_example
from code_editing.utils import gunzip_json_write
from typing import List, Callable
from tqdm import tqdm


# NOTE: this is the factory for each model type. to add a new model type, add a new case here
# and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+     model_type: str,
+     quantize=False,
+     num_gpus=1,
+     system_supported=True,
+ ) -> Callable[[str], EditModel]:
    if model_type == "codellama" or model_type == "deepseek":
        return CodeLlamaEditModel
    elif model_type == "starcoder":
        return StarCoderCommitEditModel
    elif model_type == "codellama-diff":
        return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
    elif model_type == "openai":
        return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
    elif model_type == "codellama-chat":
-         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
    elif model_type == "octocoder":
        return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
    else:
        raise ValueError(f"Unknown model type: {model_type}")


def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
    batches = batch_prompts_from_example(example, batch_size, completion_limit)

    completions = []
    for batch in batches:
        resps = model.generate(batch, **kwargs)
        for resp in resps:
            completions.append(resp["content"])

    return completions
```

Produced:

```
Add system_supported argument to model_factory
```