These are basic classifiers and a BM25 index of Wikipedia used for data tooling research. They use the kenhktsui/llm-data-textbook-quality-fasttext-classifer-v1 classifier (MIT) and TurkuNLP's register classifiers.

import fasttext, os
# Fetch every model file that is missing from the working directory.
# Each file is checked on its own so that one absent or failed download is
# retried on the next run instead of being hidden by a single sentinel file.
for model_url in (
    "http://dl.turkunlp.org/register-labeling-model/fasttext_model.bin",
    "https://huggingface.co/ontocord/riverbed/resolve/main/rj_model.bin",
    "https://huggingface.co/kenhktsui/llm-data-textbook-quality-fasttext-classifer-v1/resolve/main/model_textbook_quality.bin",
    "https://huggingface.co/ontocord/riverbed/resolve/main/expert_classify.ftz",
):
    # wget saves under the last path component of the URL.
    if not os.path.exists(os.path.basename(model_url)):
        os.system("wget " + model_url)

### red pajama filter. pred_label "__label__wiki" is data we do not wish to keep.
red_pajama_model = fasttext.load_model("rj_model.bin")
# predict() on a single string returns (labels, probabilities), each holding
# one entry per requested label (k=1 by default), so pick the top entry
# before comparing against a label string.
(pred_labels, pred_probs) = red_pajama_model.predict(text)
pred_label, pred_prob = pred_labels[0], pred_probs[0]
if pred_label == "__label__cc":
    # Flip the score so pred_prob becomes 1 - P(cc) when the top label is cc.
    pred_prob = 1 - pred_prob


### TurkuNLP register labeler: https://github.com/TurkuNLP/register-labeling
# Load the register model and label `text`.
# predict() returns a (labels, probabilities) pair for a single string.
domain_model = fasttext.load_model("fasttext_model.bin")
pred_label, pred_prob = domain_model.predict(text)

### Pile domain such as github, arxiv, etc.
# Load the Pile-domain model and label `text`.
# predict() returns a (labels, probabilities) pair for a single string.
pile_model = fasttext.load_model("expert_classify.ftz")
pred_label, pred_prob = pile_model.predict(text)

### Textbook quality - e.g., textbooks are all you need
textbook_model = fasttext.load_model("model_textbook_quality.bin")
# Query the textbook-quality model that was just loaded.
(pred_label, pred_prob) = textbook_model.predict(text)

See the files here: https://huggingface.co/ontocord/riverbed/tree/main

This includes a small whoosh search index of wikidata useful for background knowledge for LLMs.

installation:


# The clone below creates ./riverbed, which is also the directory opened as
# the index further down, so that is the path to test before fetching again.
if not os.path.exists("./riverbed"):
  os.system("git clone https://huggingface.co/ontocord/riverbed")
  os.system("pip install -q whoosh")
import whoosh.index as whoosh_index
from whoosh.qparser import QueryParser
from whoosh.analysis import StemmingAnalyzer, Filter
class MyFilter(Filter):
    """Lowercase every token and also emit a 5-character prefix of long ones.

    NOTE(review): this class is not referenced elsewhere in the snippet;
    presumably it must be defined so the stored index schema can be loaded.
    Confirm against how the index was built.
    """

    def __call__(self, tokens):
        """Yield lowercased tokens; tokens longer than 5 characters are
        yielded twice, first in full and then truncated to 5 characters.

        The same token object is mutated and re-yielded, so consumers must
        read ``text`` before advancing the generator.
        """
        for token in tokens:
            token.text = token.text.lower()
            if len(token.text) <= 5:
                yield token
                continue
            # Long token: full form first, then the prefix form.
            yield token
            token.text = token.text[:5]
            yield token

# Build the searcher and query parser only once: when this snippet is run
# again with a non-None `qp` already defined, the existing objects are kept.
try:
  if qp is None:
    # Raised explicitly (instead of `assert False`) so the check still works
    # when Python runs with -O, which strips assert statements.
    raise NameError("qp")
except NameError:
  # `qp` is undefined or None: open the cloned index and create the parser.
  bm25_dir = "./riverbed"
  index = whoosh_index.open_dir(bm25_dir)
  searcher = index.searcher()
  qp = QueryParser("content", schema=index.schema)
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API: The model has no library tag.