---
datasets:
- nuprl/EditPackFT-Multi
tags:
- code
---

# What is this

This is a DeepSeek Coder 7B model trained to predict the commit message for a given diff.

# Languages trained on:

```py
LANGS = [
    "Python",
    "Rust",
    "JavaScript",
    "Java",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "TypeScript",
    "C",
    "Scala",
    "Swift",
    "Kotlin",
    "Objective-C",
    "Perl",
    "Haskell",
    "Bash",
    "Sh",
    "Lua",
    "R",
    "Julia",
]
```

# How to prompt:

```python
import difflib


class NDiff:
    def __init__(self, s1, s2):
        self.s1 = s1
        self.s2 = s2
        # Materialize the diff into a list: difflib.ndiff returns a generator,
        # which would otherwise be exhausted after the first iteration.
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        # Drop the "?" hint lines that ndiff interleaves with the diff.
        return "\n".join([l for l in self.diff if l[0] != "?"])

    def str_colored(self):
        import colored  # optional dependency, only needed for colored output

        buf = ""
        for l in self.diff:
            if l[0] == "?":
                continue
            if l[0] == "-":
                buf += colored.stylize(l, colored.fg("red"))
            elif l[0] == "+":
                buf += colored.stylize(l, colored.fg("green"))
            else:
                buf += l
            buf += "\n"
        return buf

    def num_removed(self):
        return len([l for l in self.diff if l[0] == "-"])

    def num_added(self):
        return len([l for l in self.diff if l[0] == "+"])

    def __repr__(self):
        return self.__str__()


def format_prompt(old, new):
    diff_header = "<diff>"
    instr_header = "<commit_message>"
    diff = str(NDiff(old, new))
    return f"{diff_header}\n{diff}\n{instr_header}\n"


def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    prompt = format_prompt(old, new)
    toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outs = model.generate(toks, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p)
    # Strip the prompt tokens so only the generated commit message remains.
    return [tokenizer.decode(out[len(toks[0]):], skip_special_tokens=True) for out in outs]
```

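For reference, `format_prompt` produces a prompt shaped like this (illustrative, for a tiny one-line edit):

```
<diff>
- x = 1
+ x = 2
<commit_message>
```
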
Use the `gen` function with the old and new versions of the code; it returns a list of sampled commit messages.
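
The snippet above assumes that `model` and `tokenizer` are already in scope. A minimal loading sketch with `transformers` (the checkpoint id below is a placeholder; substitute this repository's model id):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint id; substitute this repository's model id.
CHECKPOINT = "your-org/commit-message-model"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    torch_dtype=torch.bfloat16,  # assumes a bf16-capable GPU
    device_map="auto",
)
```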
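
As a quick sanity check (hypothetical input; the sampled output will vary between runs):

```python
old = "def add(a, b):\n    return a + b\n"
new = "def add(a: int, b: int) -> int:\n    return a + b\n"

print(gen(old, new)[0])  # e.g. "Add type annotations to add"
```
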
# Example:

```diff
- import datasets
- from pathlib import Path
from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
from code_editing.humanevalpack import batch_prompts_from_example
from code_editing.utils import gunzip_json_write
from typing import List, Callable
from tqdm import tqdm


# NOTE: this is the factory for each model type. to add a new model type, add a new case here
# and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+     model_type: str,
+     quantize=False,
+     num_gpus=1,
+     system_supported=True,
+ ) -> Callable[[str], EditModel]:
    if model_type == "codellama" or model_type == "deepseek":
        return CodeLlamaEditModel
    elif model_type == "starcoder":
        return StarCoderCommitEditModel
    elif model_type == "codellama-diff":
        return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
    elif model_type == "openai":
        return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
    elif model_type == "codellama-chat":
-         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
    elif model_type == "octocoder":
        return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
    else:
        raise ValueError(f"Unknown model type: {model_type}")


def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
    batches = batch_prompts_from_example(example, batch_size, completion_limit)

    completions = []
    for batch in batches:
        resps = model.generate(batch, **kwargs)
        for resp in resps:
            completions.append(resp["content"])

    return completions
```

Produced:

```
Add system_supported argument to model_factory
```