File size: 7,684 Bytes
7a8b33f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
import argparse
import json
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from zsvision.zs_utils import BlockTimer
from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
from llm_api_utils import (
call_openai_with_exponetial_backoff,
estimate_cost_of_text_generation_api_call,
init_openai_with_api_key,
)
class FixAnchors:
    """Repair claim quotes that are not unique, verbatim excerpts of a passage.

    Each claim is a dict carrying at least the keys ``"claim"``,
    ``"verbatim_quote"`` and ``"is_unique_and_verbatim"``. Claims whose flag
    is falsy are sent to a chat model, which is asked for a corrected quote;
    the reply is then re-checked with ``is_unique_verbatim_quote``.
    """

    def __init__(
        self,
        temperature=0,
        model="gpt-3.5-turbo",
        filter_str="",
        processes=8,
        refresh=False,
    ):
        """Store the configuration.

        Args:
            temperature: Sampling temperature forwarded to the chat model.
            model: Name of the chat model to call.
            filter_str: Stored on the instance; not read by this class.
            processes: Number of worker processes; ``1`` runs serially in
                the current process.
            refresh: Stored on the instance; not read by this class.
        """
        self.temperature = temperature
        self.model = model
        self.filter_str = filter_str
        self.processes = processes
        self.refresh = refresh

    @staticmethod
    def _build_prompt(original_passage: str, claim: str, verbatim_quote: str) -> str:
        """Return the one-shot prompt asking the model to correct a quote."""
        # The trailing backslashes are line continuations inside the string
        # literal: they keep each long sentence on one prompt line, and the
        # last one drops the final newline after the closing label.
        return f"""\
Task:
A co-worker was tasked with identifying a unique, verbatim quote from a passage that underpins a particular claim. \
Unfortunately, they made a mistake and the quote they identified is not unique and verbatim. \
Your task is to fix their quote so that it is both verbatim and unique.
-----
Here is an example passage, together with the claim and the erroneous quote.
Passage:
In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941. \
Arendt's experiences during this time would deeply influence her work on totalitarianism and human rights. \
In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. \
Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy. \
## A Life Of Controversial, Influential Works \
Throughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of "the public realm" – the space where individuals act and participate in political life. \
This exploration of freedom and action would become a recurring theme in her writings. \
Her 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. \
Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. \
The idea of the "banality of evil" continues to influence discussions on the nature of evil and moral responsibility. \
Arendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. \
Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work). \
Claim:
*The Origins of Totalitarianism* established Arendt as a significant voice in political philosophy.
Initial attempt at a unique and verbatim quote:
[The Origins of Totalitarianism] established her as a significant voice in political philosophy.
Correct (unique and verbatim) quote:
Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy.
-----
Passage:
{original_passage}
Claim:
{claim}
Initial attempt at a unique verbatim quote:
{verbatim_quote}
Correct (unique and verbatim) quote:\
"""

    def fix_passage_anchor(
        self,
        idx: int,
        total: int,
        original_passage: str,
        claim_with_metadata: dict,
    ) -> dict:
        """Ask the chat model for a corrected quote for one broken claim.

        Args:
            idx: Zero-based position of this claim, used for progress output.
            total: Number of claims being fixed, used for progress output.
            original_passage: The passage the quote must come from. Newlines
                are replaced by spaces before prompting and before checking.
            claim_with_metadata: Claim dict whose ``"is_unique_and_verbatim"``
                flag is falsy. Its ``"verbatim_quote"`` is overwritten in
                place on success.

        Returns:
            ``{"claim_with_metadata": <updated dict>, "cost": <estimate>}``,
            where the cost comes from
            ``estimate_cost_of_text_generation_api_call``.

        Raises:
            AssertionError: If the claim is already flagged as unique and
                verbatim, or if the model's reply fails the
                ``is_unique_verbatim_quote`` check.
        """
        # Raised explicitly rather than via `assert`: assert statements are
        # removed under `python -O`, which would silently disable the check.
        # AssertionError is kept so existing handlers see the same type.
        # Checking first also fails fast, before any API setup.
        if claim_with_metadata["is_unique_and_verbatim"]:
            raise AssertionError("We should only fix broken passage anchors")

        # Called per claim; in multiprocess mode this method runs inside a
        # worker, so the parent's setup cannot be relied upon.
        init_openai_with_api_key()
        print(f"Processing claim with metadata {idx + 1} of {total}")

        # we remove newlines
        original_passage = original_passage.replace("\n", " ")

        prompt = self._build_prompt(
            original_passage=original_passage,
            claim=claim_with_metadata["claim"],
            verbatim_quote=claim_with_metadata["verbatim_quote"],
        )
        persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
        messages = [
            {"role": "system", "content": persona},
            {"role": "user", "content": prompt},
        ]

        with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
            response = call_openai_with_exponetial_backoff(
                model=self.model,
                temperature=self.temperature,
                messages=messages,
            )

        cost = estimate_cost_of_text_generation_api_call(
            model=self.model, response=response, verbose=True
        )

        verbatim_quote = response.choices[0].message.content.rstrip()
        is_unique_and_verbatim = is_unique_verbatim_quote(
            verbatim_quote=verbatim_quote, original_passage=original_passage
        )
        # The reply must be validated even under `python -O`, so this is an
        # explicit raise rather than an assert statement.
        if not is_unique_and_verbatim:
            raise AssertionError(
                f"Failed to fix passage anchor: {claim_with_metadata['verbatim_quote']} was updated to {verbatim_quote} but is not unique and verbatim"
            )

        claim_with_metadata["verbatim_quote"] = verbatim_quote
        return {"claim_with_metadata": claim_with_metadata, "cost": cost}

    def fix_passage_anchors(self, claims_with_metadata, original_passage: str) -> list:
        """Fix every claim whose quote is not unique and verbatim.

        Args:
            claims_with_metadata: Iterable of claim dicts, each carrying an
                ``"is_unique_and_verbatim"`` key.
            original_passage: The passage the quotes must come from.

        Returns:
            A list holding the claims that were already valid, in input
            order, followed by the claims that were fixed. The
            ``"is_unique_and_verbatim"`` key is deleted from every returned
            dict; dicts that were already valid are the caller's own objects
            and are therefore mutated in place.

        Raises:
            Exception: In single-process mode, a failure to fix a claim is
                re-raised unless the model is ``"gpt-4"``, in which case the
                claim is dropped from the result. In multiprocess mode,
                worker failures are not handled here.
        """
        valid_claims_with_metadata = []
        invalid_claims_with_metadata = []
        for claim_with_metadata in claims_with_metadata:
            if claim_with_metadata["is_unique_and_verbatim"]:
                valid_claims_with_metadata.append(claim_with_metadata)
            else:
                invalid_claims_with_metadata.append(claim_with_metadata)

        total = len(invalid_claims_with_metadata)
        kwarg_list = [
            {
                "idx": idx,
                "total": total,
                "claim_with_metadata": claim_with_metadata,
                "original_passage": original_passage,
            }
            for idx, claim_with_metadata in enumerate(invalid_claims_with_metadata)
        ]

        if not kwarg_list:
            # Nothing to fix: avoid starting a worker pool for no work.
            results = []
        elif self.processes == 1:
            results = []
            for kwargs in kwarg_list:
                try:
                    results.append(self.fix_passage_anchor(**kwargs))
                except Exception as e:
                    print(f"Exception in step2: {e}, model: {self.model}")
                    print("Skipping this claim!")
                    # Only gpt-4 runs tolerate a failed fix (the claim is
                    # dropped); any other model propagates the error.
                    if self.model != "gpt-4":
                        raise
        else:  # multiprocess
            with mp.Pool(processes=self.processes) as pool:
                results = starmap_with_kwargs(
                    pool=pool, func=self.fix_passage_anchor, kwargs_iter=kwarg_list
                )

        cost = sum(result["cost"] for result in results)
        valid_claims_with_metadata.extend(
            result["claim_with_metadata"] for result in results
        )

        # remove the is_unique_and_verbatim field (no longer needed)
        for claim_with_metadata in valid_claims_with_metadata:
            del claim_with_metadata["is_unique_and_verbatim"]

        print(
            f"Returning {len(valid_claims_with_metadata)} claims with metadat (cost: {cost} USD)"
        )
        return valid_claims_with_metadata
|