Spaces:
Running
Running
| # Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved. | |
| # Most of the code is copy pasted from the original whisper repository | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import re | |
| import unicodedata | |
| from fractions import Fraction | |
| from typing import Iterator, List, Match, Optional, Union | |
| import regex | |
| abbr = { | |
| "accessorise": "accessorize", | |
| "accessorised": "accessorized", | |
| "accessorises": "accessorizes", | |
| "accessorising": "accessorizing", | |
| "acclimatisation": "acclimatization", | |
| "acclimatise": "acclimatize", | |
| "acclimatised": "acclimatized", | |
| "acclimatises": "acclimatizes", | |
| "acclimatising": "acclimatizing", | |
| "accoutrements": "accouterments", | |
| "aeon": "eon", | |
| "aeons": "eons", | |
| "aerogramme": "aerogram", | |
| "aerogrammes": "aerograms", | |
| "aeroplane": "airplane", | |
| "aeroplanes": "airplanes", | |
| "aesthete": "esthete", | |
| "aesthetes": "esthetes", | |
| "aesthetic": "esthetic", | |
| "aesthetically": "esthetically", | |
| "aesthetics": "esthetics", | |
| "aetiology": "etiology", | |
| "ageing": "aging", | |
| "aggrandisement": "aggrandizement", | |
| "agonise": "agonize", | |
| "agonised": "agonized", | |
| "agonises": "agonizes", | |
| "agonising": "agonizing", | |
| "agonisingly": "agonizingly", | |
| "almanack": "almanac", | |
| "almanacks": "almanacs", | |
| "aluminium": "aluminum", | |
| "amortisable": "amortizable", | |
| "amortisation": "amortization", | |
| "amortisations": "amortizations", | |
| "amortise": "amortize", | |
| "amortised": "amortized", | |
| "amortises": "amortizes", | |
| "amortising": "amortizing", | |
| "amphitheatre": "amphitheater", | |
| "amphitheatres": "amphitheaters", | |
| "anaemia": "anemia", | |
| "anaemic": "anemic", | |
| "anaesthesia": "anesthesia", | |
| "anaesthetic": "anesthetic", | |
| "anaesthetics": "anesthetics", | |
| "anaesthetise": "anesthetize", | |
| "anaesthetised": "anesthetized", | |
| "anaesthetises": "anesthetizes", | |
| "anaesthetising": "anesthetizing", | |
| "anaesthetist": "anesthetist", | |
| "anaesthetists": "anesthetists", | |
| "anaesthetize": "anesthetize", | |
| "anaesthetized": "anesthetized", | |
| "anaesthetizes": "anesthetizes", | |
| "anaesthetizing": "anesthetizing", | |
| "analogue": "analog", | |
| "analogues": "analogs", | |
| "analyse": "analyze", | |
| "analysed": "analyzed", | |
| "analyses": "analyzes", | |
| "analysing": "analyzing", | |
| "anglicise": "anglicize", | |
| "anglicised": "anglicized", | |
| "anglicises": "anglicizes", | |
| "anglicising": "anglicizing", | |
| "annualised": "annualized", | |
| "antagonise": "antagonize", | |
| "antagonised": "antagonized", | |
| "antagonises": "antagonizes", | |
| "antagonising": "antagonizing", | |
| "apologise": "apologize", | |
| "apologised": "apologized", | |
| "apologises": "apologizes", | |
| "apologising": "apologizing", | |
| "appal": "appall", | |
| "appals": "appalls", | |
| "appetiser": "appetizer", | |
| "appetisers": "appetizers", | |
| "appetising": "appetizing", | |
| "appetisingly": "appetizingly", | |
| "arbour": "arbor", | |
| "arbours": "arbors", | |
| "archaeologically": "archeologically", | |
| "archaeologist": "archeologist", | |
| "archaeologists": "archeologists", | |
| "archaeology": "archeology", | |
| "archeological": "archaeological", | |
| "ardour": "ardor", | |
| "armour": "armor", | |
| "armoured": "armored", | |
| "armourer": "armorer", | |
| "armourers": "armorers", | |
| "armouries": "armories", | |
| "armoury": "armory", | |
| "artefact": "artifact", | |
| "artefacts": "artifacts", | |
| "authorise": "authorize", | |
| "authorised": "authorized", | |
| "authorises": "authorizes", | |
| "authorising": "authorizing", | |
| "axe": "ax", | |
| "backpedalled": "backpedaled", | |
| "backpedalling": "backpedaling", | |
| "bannister": "banister", | |
| "bannisters": "banisters", | |
| "baptise": "baptize", | |
| "baptised": "baptized", | |
| "baptises": "baptizes", | |
| "baptising": "baptizing", | |
| "bastardise": "bastardize", | |
| "bastardised": "bastardized", | |
| "bastardises": "bastardizes", | |
| "bastardising": "bastardizing", | |
| "battleax": "battleaxe", | |
| "baulk": "balk", | |
| "baulked": "balked", | |
| "baulking": "balking", | |
| "baulks": "balks", | |
| "bedevilled": "bedeviled", | |
| "bedevilling": "bedeviling", | |
| "behaviour": "behavior", | |
| "behavioural": "behavioral", | |
| "behaviourism": "behaviorism", | |
| "behaviourist": "behaviorist", | |
| "behaviourists": "behaviorists", | |
| "behaviours": "behaviors", | |
| "behove": "behoove", | |
| "behoved": "behooved", | |
| "behoves": "behooves", | |
| "bejewelled": "bejeweled", | |
| "belabour": "belabor", | |
| "belaboured": "belabored", | |
| "belabouring": "belaboring", | |
| "belabours": "belabors", | |
| "bevelled": "beveled", | |
| "bevvies": "bevies", | |
| "bevvy": "bevy", | |
| "biassed": "biased", | |
| "biassing": "biasing", | |
| "bingeing": "binging", | |
| "bougainvillaea": "bougainvillea", | |
| "bougainvillaeas": "bougainvilleas", | |
| "bowdlerise": "bowdlerize", | |
| "bowdlerised": "bowdlerized", | |
| "bowdlerises": "bowdlerizes", | |
| "bowdlerising": "bowdlerizing", | |
| "breathalyse": "breathalyze", | |
| "breathalysed": "breathalyzed", | |
| "breathalyser": "breathalyzer", | |
| "breathalysers": "breathalyzers", | |
| "breathalyses": "breathalyzes", | |
| "breathalysing": "breathalyzing", | |
| "brutalise": "brutalize", | |
| "brutalised": "brutalized", | |
| "brutalises": "brutalizes", | |
| "brutalising": "brutalizing", | |
| "busses": "buses", | |
| "bussing": "busing", | |
| "caesarean": "cesarean", | |
| "caesareans": "cesareans", | |
| "calibre": "caliber", | |
| "calibres": "calibers", | |
| "calliper": "caliper", | |
| "callipers": "calipers", | |
| "callisthenics": "calisthenics", | |
| "canalise": "canalize", | |
| "canalised": "canalized", | |
| "canalises": "canalizes", | |
| "canalising": "canalizing", | |
| "cancelation": "cancellation", | |
| "cancelations": "cancellations", | |
| "cancelled": "canceled", | |
| "cancelling": "canceling", | |
| "candour": "candor", | |
| "cannibalise": "cannibalize", | |
| "cannibalised": "cannibalized", | |
| "cannibalises": "cannibalizes", | |
| "cannibalising": "cannibalizing", | |
| "canonise": "canonize", | |
| "canonised": "canonized", | |
| "canonises": "canonizes", | |
| "canonising": "canonizing", | |
| "capitalise": "capitalize", | |
| "capitalised": "capitalized", | |
| "capitalises": "capitalizes", | |
| "capitalising": "capitalizing", | |
| "caramelise": "caramelize", | |
| "caramelised": "caramelized", | |
| "caramelises": "caramelizes", | |
| "caramelising": "caramelizing", | |
| "carbonise": "carbonize", | |
| "carbonised": "carbonized", | |
| "carbonises": "carbonizes", | |
| "carbonising": "carbonizing", | |
| "carolled": "caroled", | |
| "carolling": "caroling", | |
| "catalogue": "catalog", | |
| "catalogued": "cataloged", | |
| "catalogues": "catalogs", | |
| "cataloguing": "cataloging", | |
| "catalyse": "catalyze", | |
| "catalysed": "catalyzed", | |
| "catalyses": "catalyzes", | |
| "catalysing": "catalyzing", | |
| "categorise": "categorize", | |
| "categorised": "categorized", | |
| "categorises": "categorizes", | |
| "categorising": "categorizing", | |
| "cauterise": "cauterize", | |
| "cauterised": "cauterized", | |
| "cauterises": "cauterizes", | |
| "cauterising": "cauterizing", | |
| "cavilled": "caviled", | |
| "cavilling": "caviling", | |
| "centigramme": "centigram", | |
| "centigrammes": "centigrams", | |
| "centilitre": "centiliter", | |
| "centilitres": "centiliters", | |
| "centimetre": "centimeter", | |
| "centimetres": "centimeters", | |
| "centralise": "centralize", | |
| "centralised": "centralized", | |
| "centralises": "centralizes", | |
| "centralising": "centralizing", | |
| "centre": "center", | |
| "centred": "centered", | |
| "centrefold": "centerfold", | |
| "centrefolds": "centerfolds", | |
| "centrepiece": "centerpiece", | |
| "centrepieces": "centerpieces", | |
| "centres": "centers", | |
| "channelled": "channeled", | |
| "channelling": "channeling", | |
| "characterise": "characterize", | |
| "characterised": "characterized", | |
| "characterises": "characterizes", | |
| "characterising": "characterizing", | |
| "cheque": "check", | |
| "chequebook": "checkbook", | |
| "chequebooks": "checkbooks", | |
| "chequered": "checkered", | |
| "cheques": "checks", | |
| "chilli": "chili", | |
| "chimaera": "chimera", | |
| "chimaeras": "chimeras", | |
| "chiselled": "chiseled", | |
| "chiselling": "chiseling", | |
| "circularise": "circularize", | |
| "circularised": "circularized", | |
| "circularises": "circularizes", | |
| "circularising": "circularizing", | |
| "civilise": "civilize", | |
| "civilised": "civilized", | |
| "civilises": "civilizes", | |
| "civilising": "civilizing", | |
| "clamour": "clamor", | |
| "clamoured": "clamored", | |
| "clamouring": "clamoring", | |
| "clamours": "clamors", | |
| "clangour": "clangor", | |
| "clarinettist": "clarinetist", | |
| "clarinettists": "clarinetists", | |
| "collectivise": "collectivize", | |
| "collectivised": "collectivized", | |
| "collectivises": "collectivizes", | |
| "collectivising": "collectivizing", | |
| "colonisation": "colonization", | |
| "colonise": "colonize", | |
| "colonised": "colonized", | |
| "coloniser": "colonizer", | |
| "colonisers": "colonizers", | |
| "colonises": "colonizes", | |
| "colonising": "colonizing", | |
| "colour": "color", | |
| "colourant": "colorant", | |
| "colourants": "colorants", | |
| "coloured": "colored", | |
| "coloureds": "coloreds", | |
| "colourful": "colorful", | |
| "colourfully": "colorfully", | |
| "colouring": "coloring", | |
| "colourize": "colorize", | |
| "colourized": "colorized", | |
| "colourizes": "colorizes", | |
| "colourizing": "colorizing", | |
| "colourless": "colorless", | |
| "colours": "colors", | |
| "commercialise": "commercialize", | |
| "commercialised": "commercialized", | |
| "commercialises": "commercializes", | |
| "commercialising": "commercializing", | |
| "compartmentalise": "compartmentalize", | |
| "compartmentalised": "compartmentalized", | |
| "compartmentalises": "compartmentalizes", | |
| "compartmentalising": "compartmentalizing", | |
| "computerise": "computerize", | |
| "computerised": "computerized", | |
| "computerises": "computerizes", | |
| "computerising": "computerizing", | |
| "conceptualise": "conceptualize", | |
| "conceptualised": "conceptualized", | |
| "conceptualises": "conceptualizes", | |
| "conceptualising": "conceptualizing", | |
| "connexion": "connection", | |
| "connexions": "connections", | |
| "contextualise": "contextualize", | |
| "contextualised": "contextualized", | |
| "contextualises": "contextualizes", | |
| "contextualising": "contextualizing", | |
| "cosier": "cozier", | |
| "cosies": "cozies", | |
| "cosiest": "coziest", | |
| "cosily": "cozily", | |
| "cosiness": "coziness", | |
| "cosy": "cozy", | |
| "councillor": "councilor", | |
| "councillors": "councilors", | |
| "counselled": "counseled", | |
| "counselling": "counseling", | |
| "counsellor": "counselor", | |
| "counsellors": "counselors", | |
| "crenelated": "crenellated", | |
| "criminalise": "criminalize", | |
| "criminalised": "criminalized", | |
| "criminalises": "criminalizes", | |
| "criminalising": "criminalizing", | |
| "criticise": "criticize", | |
| "criticised": "criticized", | |
| "criticises": "criticizes", | |
| "criticising": "criticizing", | |
| "crueller": "crueler", | |
| "cruellest": "cruelest", | |
| "crystallisation": "crystallization", | |
| "crystallise": "crystallize", | |
| "crystallised": "crystallized", | |
| "crystallises": "crystallizes", | |
| "crystallising": "crystallizing", | |
| "cudgelled": "cudgeled", | |
| "cudgelling": "cudgeling", | |
| "customise": "customize", | |
| "customised": "customized", | |
| "customises": "customizes", | |
| "customising": "customizing", | |
| "cypher": "cipher", | |
| "cyphers": "ciphers", | |
| "decentralisation": "decentralization", | |
| "decentralise": "decentralize", | |
| "decentralised": "decentralized", | |
| "decentralises": "decentralizes", | |
| "decentralising": "decentralizing", | |
| "decriminalisation": "decriminalization", | |
| "decriminalise": "decriminalize", | |
| "decriminalised": "decriminalized", | |
| "decriminalises": "decriminalizes", | |
| "decriminalising": "decriminalizing", | |
| "defence": "defense", | |
| "defenceless": "defenseless", | |
| "defences": "defenses", | |
| "dehumanisation": "dehumanization", | |
| "dehumanise": "dehumanize", | |
| "dehumanised": "dehumanized", | |
| "dehumanises": "dehumanizes", | |
| "dehumanising": "dehumanizing", | |
| "demeanour": "demeanor", | |
| "demilitarisation": "demilitarization", | |
| "demilitarise": "demilitarize", | |
| "demilitarised": "demilitarized", | |
| "demilitarises": "demilitarizes", | |
| "demilitarising": "demilitarizing", | |
| "demobilisation": "demobilization", | |
| "demobilise": "demobilize", | |
| "demobilised": "demobilized", | |
| "demobilises": "demobilizes", | |
| "demobilising": "demobilizing", | |
| "democratisation": "democratization", | |
| "democratise": "democratize", | |
| "democratised": "democratized", | |
| "democratises": "democratizes", | |
| "democratising": "democratizing", | |
| "demonise": "demonize", | |
| "demonised": "demonized", | |
| "demonises": "demonizes", | |
| "demonising": "demonizing", | |
| "demoralisation": "demoralization", | |
| "demoralise": "demoralize", | |
| "demoralised": "demoralized", | |
| "demoralises": "demoralizes", | |
| "demoralising": "demoralizing", | |
| "denationalisation": "denationalization", | |
| "denationalise": "denationalize", | |
| "denationalised": "denationalized", | |
| "denationalises": "denationalizes", | |
| "denationalising": "denationalizing", | |
| "deodorise": "deodorize", | |
| "deodorised": "deodorized", | |
| "deodorises": "deodorizes", | |
| "deodorising": "deodorizing", | |
| "depersonalise": "depersonalize", | |
| "depersonalised": "depersonalized", | |
| "depersonalises": "depersonalizes", | |
| "depersonalising": "depersonalizing", | |
| "deputise": "deputize", | |
| "deputised": "deputized", | |
| "deputises": "deputizes", | |
| "deputising": "deputizing", | |
| "desensitisation": "desensitization", | |
| "desensitise": "desensitize", | |
| "desensitised": "desensitized", | |
| "desensitises": "desensitizes", | |
| "desensitising": "desensitizing", | |
| "destabilisation": "destabilization", | |
| "destabilise": "destabilize", | |
| "destabilised": "destabilized", | |
| "destabilises": "destabilizes", | |
| "destabilising": "destabilizing", | |
| "dialled": "dialed", | |
| "dialling": "dialing", | |
| "dialogue": "dialog", | |
| "dialogues": "dialogs", | |
| "diarrhoea": "diarrhea", | |
| "digitise": "digitize", | |
| "digitised": "digitized", | |
| "digitises": "digitizes", | |
| "digitising": "digitizing", | |
| "disc": "disk", | |
| "discolour": "discolor", | |
| "discoloured": "discolored", | |
| "discolouring": "discoloring", | |
| "discolours": "discolors", | |
| "discs": "disks", | |
| "disembowelled": "disemboweled", | |
| "disembowelling": "disemboweling", | |
| "disfavour": "disfavor", | |
| "dishevelled": "disheveled", | |
| "dishonour": "dishonor", | |
| "dishonourable": "dishonorable", | |
| "dishonourably": "dishonorably", | |
| "dishonoured": "dishonored", | |
| "dishonouring": "dishonoring", | |
| "dishonours": "dishonors", | |
| "disorganisation": "disorganization", | |
| "disorganised": "disorganized", | |
| "distil": "distill", | |
| "distils": "distills", | |
| "dramatisation": "dramatization", | |
| "dramatisations": "dramatizations", | |
| "dramatise": "dramatize", | |
| "dramatised": "dramatized", | |
| "dramatises": "dramatizes", | |
| "dramatising": "dramatizing", | |
| "draught": "draft", | |
| "draughtboard": "draftboard", | |
| "draughtboards": "draftboards", | |
| "draughtier": "draftier", | |
| "draughtiest": "draftiest", | |
| "draughts": "drafts", | |
| "draughtsman": "draftsman", | |
| "draughtsmanship": "draftsmanship", | |
| "draughtsmen": "draftsmen", | |
| "draughtswoman": "draftswoman", | |
| "draughtswomen": "draftswomen", | |
| "draughty": "drafty", | |
| "drivelled": "driveled", | |
| "drivelling": "driveling", | |
| "duelled": "dueled", | |
| "duelling": "dueling", | |
| "economise": "economize", | |
| "economised": "economized", | |
| "economises": "economizes", | |
| "economising": "economizing", | |
| "editorialise": "editorialize", | |
| "editorialised": "editorialized", | |
| "editorialises": "editorializes", | |
| "editorialising": "editorializing", | |
| "edoema": "edema", | |
| "empathise": "empathize", | |
| "empathised": "empathized", | |
| "empathises": "empathizes", | |
| "empathising": "empathizing", | |
| "emphasise": "emphasize", | |
| "emphasised": "emphasized", | |
| "emphasises": "emphasizes", | |
| "emphasising": "emphasizing", | |
| "enamelled": "enameled", | |
| "enamelling": "enameling", | |
| "enamoured": "enamored", | |
| "encyclopaedia": "encyclopedia", | |
| "encyclopaedias": "encyclopedias", | |
| "encyclopaedic": "encyclopedic", | |
| "endeavour": "endeavor", | |
| "endeavoured": "endeavored", | |
| "endeavouring": "endeavoring", | |
| "endeavours": "endeavors", | |
| "energise": "energize", | |
| "energised": "energized", | |
| "energises": "energizes", | |
| "energising": "energizing", | |
| "enrol": "enroll", | |
| "enrols": "enrolls", | |
| "enthral": "enthrall", | |
| "enthrals": "enthralls", | |
| "epaulette": "epaulet", | |
| "epaulettes": "epaulets", | |
| "epicentre": "epicenter", | |
| "epicentres": "epicenters", | |
| "epilogue": "epilog", | |
| "epilogues": "epilogs", | |
| "epitomise": "epitomize", | |
| "epitomised": "epitomized", | |
| "epitomises": "epitomizes", | |
| "epitomising": "epitomizing", | |
| "equalisation": "equalization", | |
| "equalise": "equalize", | |
| "equalised": "equalized", | |
| "equaliser": "equalizer", | |
| "equalisers": "equalizers", | |
| "equalises": "equalizes", | |
| "equalising": "equalizing", | |
| "eulogise": "eulogize", | |
| "eulogised": "eulogized", | |
| "eulogises": "eulogizes", | |
| "eulogising": "eulogizing", | |
| "evangelise": "evangelize", | |
| "evangelised": "evangelized", | |
| "evangelises": "evangelizes", | |
| "evangelising": "evangelizing", | |
| "exorcise": "exorcize", | |
| "exorcised": "exorcized", | |
| "exorcises": "exorcizes", | |
| "exorcising": "exorcizing", | |
| "extemporisation": "extemporization", | |
| "extemporise": "extemporize", | |
| "extemporised": "extemporized", | |
| "extemporises": "extemporizes", | |
| "extemporising": "extemporizing", | |
| "externalisation": "externalization", | |
| "externalisations": "externalizations", | |
| "externalise": "externalize", | |
| "externalised": "externalized", | |
| "externalises": "externalizes", | |
| "externalising": "externalizing", | |
| "factorise": "factorize", | |
| "factorised": "factorized", | |
| "factorises": "factorizes", | |
| "factorising": "factorizing", | |
| "faecal": "fecal", | |
| "faeces": "feces", | |
| "familiarisation": "familiarization", | |
| "familiarise": "familiarize", | |
| "familiarised": "familiarized", | |
| "familiarises": "familiarizes", | |
| "familiarising": "familiarizing", | |
| "fantasise": "fantasize", | |
| "fantasised": "fantasized", | |
| "fantasises": "fantasizes", | |
| "fantasising": "fantasizing", | |
| "favour": "favor", | |
| "favourable": "favorable", | |
| "favourably": "favorably", | |
| "favoured": "favored", | |
| "favouring": "favoring", | |
| "favourite": "favorite", | |
| "favourites": "favorites", | |
| "favouritism": "favoritism", | |
| "favours": "favors", | |
| "feminise": "feminize", | |
| "feminised": "feminized", | |
| "feminises": "feminizes", | |
| "feminising": "feminizing", | |
| "fertilisation": "fertilization", | |
| "fertilise": "fertilize", | |
| "fertilised": "fertilized", | |
| "fertiliser": "fertilizer", | |
| "fertilisers": "fertilizers", | |
| "fertilises": "fertilizes", | |
| "fertilising": "fertilizing", | |
| "fervour": "fervor", | |
| "fibre": "fiber", | |
| "fibreglass": "fiberglass", | |
| "fibres": "fibers", | |
| "fictionalisation": "fictionalization", | |
| "fictionalisations": "fictionalizations", | |
| "fictionalise": "fictionalize", | |
| "fictionalised": "fictionalized", | |
| "fictionalises": "fictionalizes", | |
| "fictionalising": "fictionalizing", | |
| "fillet": "filet", | |
| "filleted": "fileted", | |
| "filleting": "fileting", | |
| "fillets": "filets", | |
| "finalisation": "finalization", | |
| "finalise": "finalize", | |
| "finalised": "finalized", | |
| "finalises": "finalizes", | |
| "finalising": "finalizing", | |
| "flautist": "flutist", | |
| "flautists": "flutists", | |
| "flavour": "flavor", | |
| "flavoured": "flavored", | |
| "flavouring": "flavoring", | |
| "flavourings": "flavorings", | |
| "flavourless": "flavorless", | |
| "flavours": "flavors", | |
| "flavoursome": "flavorsome", | |
| "flyer / flier": "flier / flyer", | |
| "foetal": "fetal", | |
| "foetid": "fetid", | |
| "foetus": "fetus", | |
| "foetuses": "fetuses", | |
| "formalisation": "formalization", | |
| "formalise": "formalize", | |
| "formalised": "formalized", | |
| "formalises": "formalizes", | |
| "formalising": "formalizing", | |
| "fossilisation": "fossilization", | |
| "fossilise": "fossilize", | |
| "fossilised": "fossilized", | |
| "fossilises": "fossilizes", | |
| "fossilising": "fossilizing", | |
| "fraternisation": "fraternization", | |
| "fraternise": "fraternize", | |
| "fraternised": "fraternized", | |
| "fraternises": "fraternizes", | |
| "fraternising": "fraternizing", | |
| "fulfil": "fulfill", | |
| "fulfilment": "fulfillment", | |
| "fulfils": "fulfills", | |
| "funnelled": "funneled", | |
| "funnelling": "funneling", | |
| "gage": "gauge", | |
| "gaged": "gauged", | |
| "gages": "gauges", | |
| "gaging": "gauging", | |
| "galvanise": "galvanize", | |
| "galvanised": "galvanized", | |
| "galvanises": "galvanizes", | |
| "galvanising": "galvanizing", | |
| "gambolled": "gamboled", | |
| "gambolling": "gamboling", | |
| "gaol": "jail", | |
| "gaolbird": "jailbird", | |
| "gaolbirds": "jailbirds", | |
| "gaolbreak": "jailbreak", | |
| "gaolbreaks": "jailbreaks", | |
| "gaoled": "jailed", | |
| "gaoler": "jailer", | |
| "gaolers": "jailers", | |
| "gaoling": "jailing", | |
| "gaols": "jails", | |
| "gasses": "gases", | |
| "generalisation": "generalization", | |
| "generalisations": "generalizations", | |
| "generalise": "generalize", | |
| "generalised": "generalized", | |
| "generalises": "generalizes", | |
| "generalising": "generalizing", | |
| "ghettoise": "ghettoize", | |
| "ghettoised": "ghettoized", | |
| "ghettoises": "ghettoizes", | |
| "ghettoising": "ghettoizing", | |
| "gipsies": "gypsies", | |
| "glamor": "glamour", | |
| "glamorise": "glamorize", | |
| "glamorised": "glamorized", | |
| "glamorises": "glamorizes", | |
| "glamorising": "glamorizing", | |
| "globalisation": "globalization", | |
| "globalise": "globalize", | |
| "globalised": "globalized", | |
| "globalises": "globalizes", | |
| "globalising": "globalizing", | |
| "glueing": "gluing", | |
| "goitre": "goiter", | |
| "goitres": "goiters", | |
| "gonorrhoea": "gonorrhea", | |
| "gramme": "gram", | |
| "grammes": "grams", | |
| "gravelled": "graveled", | |
| "grey": "gray", | |
| "greyed": "grayed", | |
| "greying": "graying", | |
| "greyish": "grayish", | |
| "greyness": "grayness", | |
| "greys": "grays", | |
| "grovelled": "groveled", | |
| "grovelling": "groveling", | |
| "groyne": "groin", | |
| "groynes": "groins", | |
| "gruelling": "grueling", | |
| "gruellingly": "gruelingly", | |
| "gryphon": "griffin", | |
| "gryphons": "griffins", | |
| "gynaecological": "gynecological", | |
| "gynaecologist": "gynecologist", | |
| "gynaecologists": "gynecologists", | |
| "gynaecology": "gynecology", | |
| "haematological": "hematological", | |
| "haematologist": "hematologist", | |
| "haematologists": "hematologists", | |
| "haematology": "hematology", | |
| "haemoglobin": "hemoglobin", | |
| "haemophilia": "hemophilia", | |
| "haemophiliac": "hemophiliac", | |
| "haemophiliacs": "hemophiliacs", | |
| "haemorrhage": "hemorrhage", | |
| "haemorrhaged": "hemorrhaged", | |
| "haemorrhages": "hemorrhages", | |
| "haemorrhaging": "hemorrhaging", | |
| "haemorrhoids": "hemorrhoids", | |
| "harbour": "harbor", | |
| "harboured": "harbored", | |
| "harbouring": "harboring", | |
| "harbours": "harbors", | |
| "harmonisation": "harmonization", | |
| "harmonise": "harmonize", | |
| "harmonised": "harmonized", | |
| "harmonises": "harmonizes", | |
| "harmonising": "harmonizing", | |
| "homoeopath": "homeopath", | |
| "homoeopathic": "homeopathic", | |
| "homoeopaths": "homeopaths", | |
| "homoeopathy": "homeopathy", | |
| "homogenise": "homogenize", | |
| "homogenised": "homogenized", | |
| "homogenises": "homogenizes", | |
| "homogenising": "homogenizing", | |
| "honour": "honor", | |
| "honourable": "honorable", | |
| "honourably": "honorably", | |
| "honoured": "honored", | |
| "honouring": "honoring", | |
| "honours": "honors", | |
| "hospitalisation": "hospitalization", | |
| "hospitalise": "hospitalize", | |
| "hospitalised": "hospitalized", | |
| "hospitalises": "hospitalizes", | |
| "hospitalising": "hospitalizing", | |
| "humanise": "humanize", | |
| "humanised": "humanized", | |
| "humanises": "humanizes", | |
| "humanising": "humanizing", | |
| "humour": "humor", | |
| "humoured": "humored", | |
| "humouring": "humoring", | |
| "humourless": "humorless", | |
| "humours": "humors", | |
| "hybridise": "hybridize", | |
| "hybridised": "hybridized", | |
| "hybridises": "hybridizes", | |
| "hybridising": "hybridizing", | |
| "hypnotise": "hypnotize", | |
| "hypnotised": "hypnotized", | |
| "hypnotises": "hypnotizes", | |
| "hypnotising": "hypnotizing", | |
| "hypothesise": "hypothesize", | |
| "hypothesised": "hypothesized", | |
| "hypothesises": "hypothesizes", | |
| "hypothesising": "hypothesizing", | |
| "idealisation": "idealization", | |
| "idealise": "idealize", | |
| "idealised": "idealized", | |
| "idealises": "idealizes", | |
| "idealising": "idealizing", | |
| "idolise": "idolize", | |
| "idolised": "idolized", | |
| "idolises": "idolizes", | |
| "idolising": "idolizing", | |
| "immobilisation": "immobilization", | |
| "immobilise": "immobilize", | |
| "immobilised": "immobilized", | |
| "immobiliser": "immobilizer", | |
| "immobilisers": "immobilizers", | |
| "immobilises": "immobilizes", | |
| "immobilising": "immobilizing", | |
| "immortalise": "immortalize", | |
| "immortalised": "immortalized", | |
| "immortalises": "immortalizes", | |
| "immortalising": "immortalizing", | |
| "immunisation": "immunization", | |
| "immunise": "immunize", | |
| "immunised": "immunized", | |
| "immunises": "immunizes", | |
| "immunising": "immunizing", | |
| "impanelled": "impaneled", | |
| "impanelling": "impaneling", | |
| "imperilled": "imperiled", | |
| "imperilling": "imperiling", | |
| "individualise": "individualize", | |
| "individualised": "individualized", | |
| "individualises": "individualizes", | |
| "individualising": "individualizing", | |
| "industrialise": "industrialize", | |
| "industrialised": "industrialized", | |
| "industrialises": "industrializes", | |
| "industrialising": "industrializing", | |
| "inflexion": "inflection", | |
| "inflexions": "inflections", | |
| "initialise": "initialize", | |
| "initialised": "initialized", | |
| "initialises": "initializes", | |
| "initialising": "initializing", | |
| "initialled": "initialed", | |
| "initialling": "initialing", | |
| "instal": "install", | |
| "instalment": "installment", | |
| "instalments": "installments", | |
| "instals": "installs", | |
| "instil": "instill", | |
| "instils": "instills", | |
| "institutionalisation": "institutionalization", | |
| "institutionalise": "institutionalize", | |
| "institutionalised": "institutionalized", | |
| "institutionalises": "institutionalizes", | |
| "institutionalising": "institutionalizing", | |
| "intellectualise": "intellectualize", | |
| "intellectualised": "intellectualized", | |
| "intellectualises": "intellectualizes", | |
| "intellectualising": "intellectualizing", | |
| "internalisation": "internalization", | |
| "internalise": "internalize", | |
| "internalised": "internalized", | |
| "internalises": "internalizes", | |
| "internalising": "internalizing", | |
| "internationalisation": "internationalization", | |
| "internationalise": "internationalize", | |
| "internationalised": "internationalized", | |
| "internationalises": "internationalizes", | |
| "internationalising": "internationalizing", | |
| "ionisation": "ionization", | |
| "ionise": "ionize", | |
| "ionised": "ionized", | |
| "ioniser": "ionizer", | |
| "ionisers": "ionizers", | |
| "ionises": "ionizes", | |
| "ionising": "ionizing", | |
| "italicise": "italicize", | |
| "italicised": "italicized", | |
| "italicises": "italicizes", | |
| "italicising": "italicizing", | |
| "itemise": "itemize", | |
| "itemised": "itemized", | |
| "itemises": "itemizes", | |
| "itemising": "itemizing", | |
| "jeopardise": "jeopardize", | |
| "jeopardised": "jeopardized", | |
| "jeopardises": "jeopardizes", | |
| "jeopardising": "jeopardizing", | |
| "jewelled": "jeweled", | |
| "jeweller": "jeweler", | |
| "jewellers": "jewelers", | |
| "jewellery": "jewelry", | |
| "judgement": "judgment", | |
| "kilogramme": "kilogram", | |
| "kilogrammes": "kilograms", | |
| "kilometre": "kilometer", | |
| "kilometres": "kilometers", | |
| "labelled": "labeled", | |
| "labelling": "labeling", | |
| "labour": "labor", | |
| "laboured": "labored", | |
| "labourer": "laborer", | |
| "labourers": "laborers", | |
| "labouring": "laboring", | |
| "labours": "labors", | |
| "lacklustre": "lackluster", | |
| "legalisation": "legalization", | |
| "legalise": "legalize", | |
| "legalised": "legalized", | |
| "legalises": "legalizes", | |
| "legalising": "legalizing", | |
| "legitimise": "legitimize", | |
| "legitimised": "legitimized", | |
| "legitimises": "legitimizes", | |
| "legitimising": "legitimizing", | |
| "leukaemia": "leukemia", | |
| "levelled": "leveled", | |
| "leveller": "leveler", | |
| "levellers": "levelers", | |
| "levelling": "leveling", | |
| "libelled": "libeled", | |
| "libelling": "libeling", | |
| "libellous": "libelous", | |
| "liberalisation": "liberalization", | |
| "liberalise": "liberalize", | |
| "liberalised": "liberalized", | |
| "liberalises": "liberalizes", | |
| "liberalising": "liberalizing", | |
| "licence": "license", | |
| "licenced": "licensed", | |
| "licences": "licenses", | |
| "licencing": "licensing", | |
| "likeable": "likable", | |
| "lionisation": "lionization", | |
| "lionise": "lionize", | |
| "lionised": "lionized", | |
| "lionises": "lionizes", | |
| "lionising": "lionizing", | |
| "liquidise": "liquidize", | |
| "liquidised": "liquidized", | |
| "liquidiser": "liquidizer", | |
| "liquidisers": "liquidizers", | |
| "liquidises": "liquidizes", | |
| "liquidising": "liquidizing", | |
| "litre": "liter", | |
| "litres": "liters", | |
| "localise": "localize", | |
| "localised": "localized", | |
| "localises": "localizes", | |
| "localising": "localizing", | |
| "louvre": "louver", | |
| "louvred": "louvered", | |
| "louvres": "louvers", | |
| "lustre": "luster", | |
| "magnetise": "magnetize", | |
| "magnetised": "magnetized", | |
| "magnetises": "magnetizes", | |
| "magnetising": "magnetizing", | |
| "manoeuvrability": "maneuverability", | |
| "manoeuvrable": "maneuverable", | |
| "manoeuvre": "maneuver", | |
| "manoeuvred": "maneuvered", | |
| "manoeuvres": "maneuvers", | |
| "manoeuvring": "maneuvering", | |
| "manoeuvrings": "maneuverings", | |
| "marginalisation": "marginalization", | |
| "marginalise": "marginalize", | |
| "marginalised": "marginalized", | |
| "marginalises": "marginalizes", | |
| "marginalising": "marginalizing", | |
| "marshalled": "marshaled", | |
| "marshalling": "marshaling", | |
| "marvelled": "marveled", | |
| "marvelling": "marveling", | |
| "marvellous": "marvelous", | |
| "marvellously": "marvelously", | |
| "materialisation": "materialization", | |
| "materialise": "materialize", | |
| "materialised": "materialized", | |
| "materialises": "materializes", | |
| "materialising": "materializing", | |
| "maximisation": "maximization", | |
| "maximise": "maximize", | |
| "maximised": "maximized", | |
| "maximises": "maximizes", | |
| "maximising": "maximizing", | |
| "meagre": "meager", | |
| "mechanisation": "mechanization", | |
| "mechanise": "mechanize", | |
| "mechanised": "mechanized", | |
| "mechanises": "mechanizes", | |
| "mechanising": "mechanizing", | |
| "mediaeval": "medieval", | |
| "memorialise": "memorialize", | |
| "memorialised": "memorialized", | |
| "memorialises": "memorializes", | |
| "memorialising": "memorializing", | |
| "memorise": "memorize", | |
| "memorised": "memorized", | |
| "memorises": "memorizes", | |
| "memorising": "memorizing", | |
| "mesmerise": "mesmerize", | |
| "mesmerised": "mesmerized", | |
| "mesmerises": "mesmerizes", | |
| "mesmerising": "mesmerizing", | |
| "metabolise": "metabolize", | |
| "metabolised": "metabolized", | |
| "metabolises": "metabolizes", | |
| "metabolising": "metabolizing", | |
| "metre": "meter", | |
| "metres": "meters", | |
| "mhm": "hmm", | |
| "micrometre": "micrometer", | |
| "micrometres": "micrometers", | |
| "militarise": "militarize", | |
| "militarised": "militarized", | |
| "militarises": "militarizes", | |
| "militarising": "militarizing", | |
| "milligramme": "milligram", | |
| "milligrammes": "milligrams", | |
| "millilitre": "milliliter", | |
| "millilitres": "milliliters", | |
| "millimetre": "millimeter", | |
| "millimetres": "millimeters", | |
| "miniaturisation": "miniaturization", | |
| "miniaturise": "miniaturize", | |
| "miniaturised": "miniaturized", | |
| "miniaturises": "miniaturizes", | |
| "miniaturising": "miniaturizing", | |
| "minibusses": "minibuses", | |
| "minimise": "minimize", | |
| "minimised": "minimized", | |
| "minimises": "minimizes", | |
| "minimising": "minimizing", | |
| "misbehaviour": "misbehavior", | |
| "misdemeanour": "misdemeanor", | |
| "misdemeanours": "misdemeanors", | |
| "misspelt": "misspelled", | |
| "mitre": "miter", | |
| "mitres": "miters", | |
| "mm": "hmm", | |
| "mmm": "hmm", | |
| "mobilisation": "mobilization", | |
| "mobilise": "mobilize", | |
| "mobilised": "mobilized", | |
| "mobilises": "mobilizes", | |
| "mobilising": "mobilizing", | |
| "modelled": "modeled", | |
| "modeller": "modeler", | |
| "modellers": "modelers", | |
| "modelling": "modeling", | |
| "modernise": "modernize", | |
| "modernised": "modernized", | |
| "modernises": "modernizes", | |
| "modernising": "modernizing", | |
| "moisturise": "moisturize", | |
| "moisturised": "moisturized", | |
| "moisturiser": "moisturizer", | |
| "moisturisers": "moisturizers", | |
| "moisturises": "moisturizes", | |
| "moisturising": "moisturizing", | |
| "monologue": "monolog", | |
| "monologues": "monologs", | |
| "monopolisation": "monopolization", | |
| "monopolise": "monopolize", | |
| "monopolised": "monopolized", | |
| "monopolises": "monopolizes", | |
| "monopolising": "monopolizing", | |
| "moralise": "moralize", | |
| "moralised": "moralized", | |
| "moralises": "moralizes", | |
| "moralising": "moralizing", | |
| "motorised": "motorized", | |
| "mould": "mold", | |
| "moulded": "molded", | |
| "moulder": "molder", | |
| "mouldered": "moldered", | |
| "mouldering": "moldering", | |
| "moulders": "molders", | |
| "mouldier": "moldier", | |
| "mouldiest": "moldiest", | |
| "moulding": "molding", | |
| "mouldings": "moldings", | |
| "moulds": "molds", | |
| "mouldy": "moldy", | |
| "moult": "molt", | |
| "moulted": "molted", | |
| "moulting": "molting", | |
| "moults": "molts", | |
| "moustache": "mustache", | |
| "moustached": "mustached", | |
| "moustaches": "mustaches", | |
| "moustachioed": "mustachioed", | |
| "multicoloured": "multicolored", | |
| "nationalisation": "nationalization", | |
| "nationalisations": "nationalizations", | |
| "nationalise": "nationalize", | |
| "nationalised": "nationalized", | |
| "nationalises": "nationalizes", | |
| "nationalising": "nationalizing", | |
| "naturalisation": "naturalization", | |
| "naturalise": "naturalize", | |
| "naturalised": "naturalized", | |
| "naturalises": "naturalizes", | |
| "naturalising": "naturalizing", | |
| "neighbour": "neighbor", | |
| "neighbourhood": "neighborhood", | |
| "neighbourhoods": "neighborhoods", | |
| "neighbouring": "neighboring", | |
| "neighbourliness": "neighborliness", | |
| "neighbourly": "neighborly", | |
| "neighbours": "neighbors", | |
| "neutralisation": "neutralization", | |
| "neutralise": "neutralize", | |
| "neutralised": "neutralized", | |
| "neutralises": "neutralizes", | |
| "neutralising": "neutralizing", | |
| "normalisation": "normalization", | |
| "normalise": "normalize", | |
| "normalised": "normalized", | |
| "normalises": "normalizes", | |
| "normalising": "normalizing", | |
| "odour": "odor", | |
| "odourless": "odorless", | |
| "odours": "odors", | |
| "oesophagus": "esophagus", | |
| "oesophaguses": "esophaguses", | |
| "oestrogen": "estrogen", | |
| "offence": "offense", | |
| "offences": "offenses", | |
| "omelette": "omelet", | |
| "omelettes": "omelets", | |
| "optimise": "optimize", | |
| "optimised": "optimized", | |
| "optimises": "optimizes", | |
| "optimising": "optimizing", | |
| "organisation": "organization", | |
| "organisational": "organizational", | |
| "organisations": "organizations", | |
| "organise": "organize", | |
| "organised": "organized", | |
| "organiser": "organizer", | |
| "organisers": "organizers", | |
| "organises": "organizes", | |
| "organising": "organizing", | |
| "orthopaedic": "orthopedic", | |
| "orthopaedics": "orthopedics", | |
| "ostracise": "ostracize", | |
| "ostracised": "ostracized", | |
| "ostracises": "ostracizes", | |
| "ostracising": "ostracizing", | |
| "outmanoeuvre": "outmaneuver", | |
| "outmanoeuvred": "outmaneuvered", | |
| "outmanoeuvres": "outmaneuvers", | |
| "outmanoeuvring": "outmaneuvering", | |
| "overemphasise": "overemphasize", | |
| "overemphasised": "overemphasized", | |
| "overemphasises": "overemphasizes", | |
| "overemphasising": "overemphasizing", | |
| "oxidisation": "oxidization", | |
| "oxidise": "oxidize", | |
| "oxidised": "oxidized", | |
| "oxidises": "oxidizes", | |
| "oxidising": "oxidizing", | |
| "paederast": "pederast", | |
| "paederasts": "pederasts", | |
| "paediatric": "pediatric", | |
| "paediatrician": "pediatrician", | |
| "paediatricians": "pediatricians", | |
| "paediatrics": "pediatrics", | |
| "paedophile": "pedophile", | |
| "paedophiles": "pedophiles", | |
| "paedophilia": "pedophilia", | |
| "palaeolithic": "paleolithic", | |
| "palaeontologist": "paleontologist", | |
| "palaeontologists": "paleontologists", | |
| "palaeontology": "paleontology", | |
| "panelled": "paneled", | |
| "panelling": "paneling", | |
| "panellist": "panelist", | |
| "panellists": "panelists", | |
| "paralyse": "paralyze", | |
| "paralysed": "paralyzed", | |
| "paralyses": "paralyzes", | |
| "paralysing": "paralyzing", | |
| "parcelled": "parceled", | |
| "parcelling": "parceling", | |
| "parlour": "parlor", | |
| "parlours": "parlors", | |
| "particularise": "particularize", | |
| "particularised": "particularized", | |
| "particularises": "particularizes", | |
| "particularising": "particularizing", | |
| "passivisation": "passivization", | |
| "passivise": "passivize", | |
| "passivised": "passivized", | |
| "passivises": "passivizes", | |
| "passivising": "passivizing", | |
| "pasteurisation": "pasteurization", | |
| "pasteurise": "pasteurize", | |
| "pasteurised": "pasteurized", | |
| "pasteurises": "pasteurizes", | |
| "pasteurising": "pasteurizing", | |
| "patronise": "patronize", | |
| "patronised": "patronized", | |
| "patronises": "patronizes", | |
| "patronising": "patronizing", | |
| "patronisingly": "patronizingly", | |
| "pedalled": "pedaled", | |
| "pedalling": "pedaling", | |
| "pedestrianisation": "pedestrianization", | |
| "pedestrianise": "pedestrianize", | |
| "pedestrianised": "pedestrianized", | |
| "pedestrianises": "pedestrianizes", | |
| "pedestrianising": "pedestrianizing", | |
| "penalise": "penalize", | |
| "penalised": "penalized", | |
| "penalises": "penalizes", | |
| "penalising": "penalizing", | |
| "pencilled": "penciled", | |
| "pencilling": "penciling", | |
| "personalise": "personalize", | |
| "personalised": "personalized", | |
| "personalises": "personalizes", | |
| "personalising": "personalizing", | |
| "pharmacopoeia": "pharmacopeia", | |
| "pharmacopoeias": "pharmacopeias", | |
| "philosophise": "philosophize", | |
| "philosophised": "philosophized", | |
| "philosophises": "philosophizes", | |
| "philosophising": "philosophizing", | |
| "philtre": "filter", | |
| "philtres": "filters", | |
| "phoney": "phony", | |
| "plagiarise": "plagiarize", | |
| "plagiarised": "plagiarized", | |
| "plagiarises": "plagiarizes", | |
| "plagiarising": "plagiarizing", | |
| "plough": "plow", | |
| "ploughed": "plowed", | |
| "ploughing": "plowing", | |
| "ploughman": "plowman", | |
| "ploughmen": "plowmen", | |
| "ploughs": "plows", | |
| "ploughshare": "plowshare", | |
| "ploughshares": "plowshares", | |
| "polarisation": "polarization", | |
| "polarise": "polarize", | |
| "polarised": "polarized", | |
| "polarises": "polarizes", | |
| "polarising": "polarizing", | |
| "politicisation": "politicization", | |
| "politicise": "politicize", | |
| "politicised": "politicized", | |
| "politicises": "politicizes", | |
| "politicising": "politicizing", | |
| "popularisation": "popularization", | |
| "popularise": "popularize", | |
| "popularised": "popularized", | |
| "popularises": "popularizes", | |
| "popularising": "popularizing", | |
| "pouffe": "pouf", | |
| "pouffes": "poufs", | |
| "practise": "practice", | |
| "practised": "practiced", | |
| "practises": "practices", | |
| "practising": "practicing", | |
| "praesidium": "presidium", | |
| "praesidiums": "presidiums", | |
| "pressurisation": "pressurization", | |
| "pressurise": "pressurize", | |
| "pressurised": "pressurized", | |
| "pressurises": "pressurizes", | |
| "pressurising": "pressurizing", | |
| "pretence": "pretense", | |
| "pretences": "pretenses", | |
| "primaeval": "primeval", | |
| "prioritisation": "prioritization", | |
| "prioritise": "prioritize", | |
| "prioritised": "prioritized", | |
| "prioritises": "prioritizes", | |
| "prioritising": "prioritizing", | |
| "privatisation": "privatization", | |
| "privatisations": "privatizations", | |
| "privatise": "privatize", | |
| "privatised": "privatized", | |
| "privatises": "privatizes", | |
| "privatising": "privatizing", | |
| "professionalisation": "professionalization", | |
| "professionalise": "professionalize", | |
| "professionalised": "professionalized", | |
| "professionalises": "professionalizes", | |
| "professionalising": "professionalizing", | |
| "programme": "program", | |
| "programmes": "programs", | |
| "prologue": "prolog", | |
| "prologues": "prologs", | |
| "propagandise": "propagandize", | |
| "propagandised": "propagandized", | |
| "propagandises": "propagandizes", | |
| "propagandising": "propagandizing", | |
| "proselytise": "proselytize", | |
| "proselytised": "proselytized", | |
| "proselytiser": "proselytizer", | |
| "proselytisers": "proselytizers", | |
| "proselytises": "proselytizes", | |
| "proselytising": "proselytizing", | |
| "psychoanalyse": "psychoanalyze", | |
| "psychoanalysed": "psychoanalyzed", | |
| "psychoanalyses": "psychoanalyzes", | |
| "psychoanalysing": "psychoanalyzing", | |
| "publicise": "publicize", | |
| "publicised": "publicized", | |
| "publicises": "publicizes", | |
| "publicising": "publicizing", | |
| "pulverisation": "pulverization", | |
| "pulverise": "pulverize", | |
| "pulverised": "pulverized", | |
| "pulverises": "pulverizes", | |
| "pulverising": "pulverizing", | |
| "pummelled": "pummel", | |
| "pummelling": "pummeled", | |
| "pyjama": "pajama", | |
| "pyjamas": "pajamas", | |
| "pzazz": "pizzazz", | |
| "quarrelled": "quarreled", | |
| "quarrelling": "quarreling", | |
| "radicalise": "radicalize", | |
| "radicalised": "radicalized", | |
| "radicalises": "radicalizes", | |
| "radicalising": "radicalizing", | |
| "rancour": "rancor", | |
| "randomise": "randomize", | |
| "randomised": "randomized", | |
| "randomises": "randomizes", | |
| "randomising": "randomizing", | |
| "rationalisation": "rationalization", | |
| "rationalisations": "rationalizations", | |
| "rationalise": "rationalize", | |
| "rationalised": "rationalized", | |
| "rationalises": "rationalizes", | |
| "rationalising": "rationalizing", | |
| "ravelled": "raveled", | |
| "ravelling": "raveling", | |
| "realisable": "realizable", | |
| "realisation": "realization", | |
| "realisations": "realizations", | |
| "realise": "realize", | |
| "realised": "realized", | |
| "realises": "realizes", | |
| "realising": "realizing", | |
| "recognisable": "recognizable", | |
| "recognisably": "recognizably", | |
| "recognisance": "recognizance", | |
| "recognise": "recognize", | |
| "recognised": "recognized", | |
| "recognises": "recognizes", | |
| "recognising": "recognizing", | |
| "reconnoitre": "reconnoiter", | |
| "reconnoitred": "reconnoitered", | |
| "reconnoitres": "reconnoiters", | |
| "reconnoitring": "reconnoitering", | |
| "refuelled": "refueled", | |
| "refuelling": "refueling", | |
| "regularisation": "regularization", | |
| "regularise": "regularize", | |
| "regularised": "regularized", | |
| "regularises": "regularizes", | |
| "regularising": "regularizing", | |
| "remodelled": "remodeled", | |
| "remodelling": "remodeling", | |
| "remould": "remold", | |
| "remoulded": "remolded", | |
| "remoulding": "remolding", | |
| "remoulds": "remolds", | |
| "reorganisation": "reorganization", | |
| "reorganisations": "reorganizations", | |
| "reorganise": "reorganize", | |
| "reorganised": "reorganized", | |
| "reorganises": "reorganizes", | |
| "reorganising": "reorganizing", | |
| "revelled": "reveled", | |
| "reveller": "reveler", | |
| "revellers": "revelers", | |
| "revelling": "reveling", | |
| "revitalise": "revitalize", | |
| "revitalised": "revitalized", | |
| "revitalises": "revitalizes", | |
| "revitalising": "revitalizing", | |
| "revolutionise": "revolutionize", | |
| "revolutionised": "revolutionized", | |
| "revolutionises": "revolutionizes", | |
| "revolutionising": "revolutionizing", | |
| "rhapsodise": "rhapsodize", | |
| "rhapsodised": "rhapsodized", | |
| "rhapsodises": "rhapsodizes", | |
| "rhapsodising": "rhapsodizing", | |
| "rigour": "rigor", | |
| "rigours": "rigors", | |
| "ritualised": "ritualized", | |
| "rivalled": "rivaled", | |
| "rivalling": "rivaling", | |
| "romanticise": "romanticize", | |
| "romanticised": "romanticized", | |
| "romanticises": "romanticizes", | |
| "romanticising": "romanticizing", | |
| "rumour": "rumor", | |
| "rumoured": "rumored", | |
| "rumours": "rumors", | |
| "sabre": "saber", | |
| "sabres": "sabers", | |
| "saltpetre": "saltpeter", | |
| "sanitise": "sanitize", | |
| "sanitised": "sanitized", | |
| "sanitises": "sanitizes", | |
| "sanitising": "sanitizing", | |
| "satirise": "satirize", | |
| "satirised": "satirized", | |
| "satirises": "satirizes", | |
| "satirising": "satirizing", | |
| "saviour": "savior", | |
| "saviours": "saviors", | |
| "savour": "savor", | |
| "savoured": "savored", | |
| "savouries": "savories", | |
| "savouring": "savoring", | |
| "savours": "savors", | |
| "savoury": "savory", | |
| "scandalise": "scandalize", | |
| "scandalised": "scandalized", | |
| "scandalises": "scandalizes", | |
| "scandalising": "scandalizing", | |
| "sceptic": "skeptic", | |
| "sceptical": "skeptical", | |
| "sceptically": "skeptically", | |
| "scepticism": "skepticism", | |
| "sceptics": "skeptics", | |
| "sceptre": "scepter", | |
| "sceptres": "scepters", | |
| "scrutinise": "scrutinize", | |
| "scrutinised": "scrutinized", | |
| "scrutinises": "scrutinizes", | |
| "scrutinising": "scrutinizing", | |
| "secularisation": "secularization", | |
| "secularise": "secularize", | |
| "secularised": "secularized", | |
| "secularises": "secularizes", | |
| "secularising": "secularizing", | |
| "sensationalise": "sensationalize", | |
| "sensationalised": "sensationalized", | |
| "sensationalises": "sensationalizes", | |
| "sensationalising": "sensationalizing", | |
| "sensitise": "sensitize", | |
| "sensitised": "sensitized", | |
| "sensitises": "sensitizes", | |
| "sensitising": "sensitizing", | |
| "sentimentalise": "sentimentalize", | |
| "sentimentalised": "sentimentalized", | |
| "sentimentalises": "sentimentalizes", | |
| "sentimentalising": "sentimentalizing", | |
| "sepulchre": "sepulcher", | |
| "sepulchres": "sepulchers", | |
| "serialisation": "serialization", | |
| "serialisations": "serializations", | |
| "serialise": "serialize", | |
| "serialised": "serialized", | |
| "serialises": "serializes", | |
| "serialising": "serializing", | |
| "sermonise": "sermonize", | |
| "sermonised": "sermonized", | |
| "sermonises": "sermonizes", | |
| "sermonising": "sermonizing", | |
| "sheikh": "sheik", | |
| "shovelled": "shoveled", | |
| "shovelling": "shoveling", | |
| "shrivelled": "shriveled", | |
| "shrivelling": "shriveling", | |
| "signalise": "signalize", | |
| "signalised": "signalized", | |
| "signalises": "signalizes", | |
| "signalising": "signalizing", | |
| "signalled": "signaled", | |
| "signalling": "signaling", | |
| "smoulder": "smolder", | |
| "smouldered": "smoldered", | |
| "smouldering": "smoldering", | |
| "smoulders": "smolders", | |
| "snivelled": "sniveled", | |
| "snivelling": "sniveling", | |
| "snorkelled": "snorkeled", | |
| "snorkelling": "snorkeling", | |
| "snowplough": "snowplow", | |
| "snowploughs": "snowplow", | |
| "socialisation": "socialization", | |
| "socialise": "socialize", | |
| "socialised": "socialized", | |
| "socialises": "socializes", | |
| "socialising": "socializing", | |
| "sodomise": "sodomize", | |
| "sodomised": "sodomized", | |
| "sodomises": "sodomizes", | |
| "sodomising": "sodomizing", | |
| "solemnise": "solemnize", | |
| "solemnised": "solemnized", | |
| "solemnises": "solemnizes", | |
| "solemnising": "solemnizing", | |
| "sombre": "somber", | |
| "specialisation": "specialization", | |
| "specialisations": "specializations", | |
| "specialise": "specialize", | |
| "specialised": "specialized", | |
| "specialises": "specializes", | |
| "specialising": "specializing", | |
| "spectre": "specter", | |
| "spectres": "specters", | |
| "spiralled": "spiraled", | |
| "spiralling": "spiraling", | |
| "splendour": "splendor", | |
| "splendours": "splendors", | |
| "squirrelled": "squirreled", | |
| "squirrelling": "squirreling", | |
| "stabilisation": "stabilization", | |
| "stabilise": "stabilize", | |
| "stabilised": "stabilized", | |
| "stabiliser": "stabilizer", | |
| "stabilisers": "stabilizers", | |
| "stabilises": "stabilizes", | |
| "stabilising": "stabilizing", | |
| "standardisation": "standardization", | |
| "standardise": "standardize", | |
| "standardised": "standardized", | |
| "standardises": "standardizes", | |
| "standardising": "standardizing", | |
| "stencilled": "stenciled", | |
| "stencilling": "stenciling", | |
| "sterilisation": "sterilization", | |
| "sterilisations": "sterilizations", | |
| "sterilise": "sterilize", | |
| "sterilised": "sterilized", | |
| "steriliser": "sterilizer", | |
| "sterilisers": "sterilizers", | |
| "sterilises": "sterilizes", | |
| "sterilising": "sterilizing", | |
| "stigmatisation": "stigmatization", | |
| "stigmatise": "stigmatize", | |
| "stigmatised": "stigmatized", | |
| "stigmatises": "stigmatizes", | |
| "stigmatising": "stigmatizing", | |
| "storey": "story", | |
| "storeys": "stories", | |
| "subsidisation": "subsidization", | |
| "subsidise": "subsidize", | |
| "subsidised": "subsidized", | |
| "subsidiser": "subsidizer", | |
| "subsidisers": "subsidizers", | |
| "subsidises": "subsidizes", | |
| "subsidising": "subsidizing", | |
| "succour": "succor", | |
| "succoured": "succored", | |
| "succouring": "succoring", | |
| "succours": "succors", | |
| "sulphate": "sulfate", | |
| "sulphates": "sulfates", | |
| "sulphide": "sulfide", | |
| "sulphides": "sulfides", | |
| "sulphur": "sulfur", | |
| "sulphurous": "sulfurous", | |
| "summarise": "summarize", | |
| "summarised": "summarized", | |
| "summarises": "summarizes", | |
| "summarising": "summarizing", | |
| "swivelled": "swiveled", | |
| "swivelling": "swiveling", | |
| "symbolise": "symbolize", | |
| "symbolised": "symbolized", | |
| "symbolises": "symbolizes", | |
| "symbolising": "symbolizing", | |
| "sympathise": "sympathize", | |
| "sympathised": "sympathized", | |
| "sympathiser": "sympathizer", | |
| "sympathisers": "sympathizers", | |
| "sympathises": "sympathizes", | |
| "sympathising": "sympathizing", | |
| "synchronisation": "synchronization", | |
| "synchronise": "synchronize", | |
| "synchronised": "synchronized", | |
| "synchronises": "synchronizes", | |
| "synchronising": "synchronizing", | |
| "synthesise": "synthesize", | |
| "synthesised": "synthesized", | |
| "synthesiser": "synthesizer", | |
| "synthesisers": "synthesizers", | |
| "synthesises": "synthesizes", | |
| "synthesising": "synthesizing", | |
| "syphon": "siphon", | |
| "syphoned": "siphoned", | |
| "syphoning": "siphoning", | |
| "syphons": "siphons", | |
| "systematisation": "systematization", | |
| "systematise": "systematize", | |
| "systematised": "systematized", | |
| "systematises": "systematizes", | |
| "systematising": "systematizing", | |
| "tantalise": "tantalize", | |
| "tantalised": "tantalized", | |
| "tantalises": "tantalizes", | |
| "tantalising": "tantalizing", | |
| "tantalisingly": "tantalizingly", | |
| "tasselled": "tasseled", | |
| "technicolour": "technicolor", | |
| "temporise": "temporize", | |
| "temporised": "temporized", | |
| "temporises": "temporizes", | |
| "temporising": "temporizing", | |
| "tenderise": "tenderize", | |
| "tenderised": "tenderized", | |
| "tenderises": "tenderizes", | |
| "tenderising": "tenderizing", | |
| "terrorise": "terrorize", | |
| "terrorised": "terrorized", | |
| "terrorises": "terrorizes", | |
| "terrorising": "terrorizing", | |
| "theatre": "theater", | |
| "theatregoer": "theatergoer", | |
| "theatregoers": "theatergoers", | |
| "theatres": "theaters", | |
| "theorise": "theorize", | |
| "theorised": "theorized", | |
| "theorises": "theorizes", | |
| "theorising": "theorizing", | |
| "tonne": "ton", | |
| "tonnes": "tons", | |
| "towelled": "toweled", | |
| "towelling": "toweling", | |
| "toxaemia": "toxemia", | |
| "tranquillise": "tranquilize", | |
| "tranquillised": "tranquilized", | |
| "tranquilliser": "tranquilizer", | |
| "tranquillisers": "tranquilizers", | |
| "tranquillises": "tranquilizes", | |
| "tranquillising": "tranquilizing", | |
| "tranquillity": "tranquility", | |
| "tranquillize": "tranquilize", | |
| "tranquillized": "tranquilized", | |
| "tranquillizer": "tranquilizer", | |
| "tranquillizers": "tranquilizers", | |
| "tranquillizes": "tranquilizes", | |
| "tranquillizing": "tranquilizing", | |
| "tranquilly": "tranquility", | |
| "transistorised": "transistorized", | |
| "traumatise": "traumatize", | |
| "traumatised": "traumatized", | |
| "traumatises": "traumatizes", | |
| "traumatising": "traumatizing", | |
| "travelled": "traveled", | |
| "traveller": "traveler", | |
| "travellers": "travelers", | |
| "travelling": "traveling", | |
| "travelog": "travelogue", | |
| "travelogs": "travelogues", | |
| "trialled": "trialed", | |
| "trialling": "trialing", | |
| "tricolour": "tricolor", | |
| "tricolours": "tricolors", | |
| "trivialise": "trivialize", | |
| "trivialised": "trivialized", | |
| "trivialises": "trivializes", | |
| "trivialising": "trivializing", | |
| "tumour": "tumor", | |
| "tumours": "tumors", | |
| "tunnelled": "tunneled", | |
| "tunnelling": "tunneling", | |
| "tyrannise": "tyrannize", | |
| "tyrannised": "tyrannized", | |
| "tyrannises": "tyrannizes", | |
| "tyrannising": "tyrannizing", | |
| "tyre": "tire", | |
| "tyres": "tires", | |
| "unauthorised": "unauthorized", | |
| "uncivilised": "uncivilized", | |
| "underutilised": "underutilized", | |
| "unequalled": "unequaled", | |
| "unfavourable": "unfavorable", | |
| "unfavourably": "unfavorably", | |
| "unionisation": "unionization", | |
| "unionise": "unionize", | |
| "unionised": "unionized", | |
| "unionises": "unionizes", | |
| "unionising": "unionizing", | |
| "unorganised": "unorganized", | |
| "unravelled": "unraveled", | |
| "unravelling": "unraveling", | |
| "unrecognisable": "unrecognizable", | |
| "unrecognised": "unrecognized", | |
| "unrivalled": "unrivaled", | |
| "unsavoury": "unsavory", | |
| "untrammelled": "untrammeled", | |
| "urbanisation": "urbanization", | |
| "urbanise": "urbanize", | |
| "urbanised": "urbanized", | |
| "urbanises": "urbanizes", | |
| "urbanising": "urbanizing", | |
| "utilisable": "utilizable", | |
| "utilisation": "utilization", | |
| "utilise": "utilize", | |
| "utilised": "utilized", | |
| "utilises": "utilizes", | |
| "utilising": "utilizing", | |
| "valour": "valor", | |
| "vandalise": "vandalize", | |
| "vandalised": "vandalized", | |
| "vandalises": "vandalizes", | |
| "vandalising": "vandalizing", | |
| "vaporisation": "vaporization", | |
| "vaporise": "vaporize", | |
| "vaporised": "vaporized", | |
| "vaporises": "vaporizes", | |
| "vaporising": "vaporizing", | |
| "vapour": "vapor", | |
| "vapours": "vapors", | |
| "verbalise": "verbalize", | |
| "verbalised": "verbalized", | |
| "verbalises": "verbalizes", | |
| "verbalising": "verbalizing", | |
| "victimisation": "victimization", | |
| "victimise": "victimize", | |
| "victimised": "victimized", | |
| "victimises": "victimizes", | |
| "victimising": "victimizing", | |
| "videodisc": "videodisk", | |
| "videodiscs": "videodisks", | |
| "vigour": "vigor", | |
| "visualisation": "visualization", | |
| "visualisations": "visualizations", | |
| "visualise": "visualize", | |
| "visualised": "visualized", | |
| "visualises": "visualizes", | |
| "visualising": "visualizing", | |
| "vocalisation": "vocalization", | |
| "vocalisations": "vocalizations", | |
| "vocalise": "vocalize", | |
| "vocalised": "vocalized", | |
| "vocalises": "vocalizes", | |
| "vocalising": "vocalizing", | |
| "vulcanised": "vulcanized", | |
| "vulgarisation": "vulgarization", | |
| "vulgarise": "vulgarize", | |
| "vulgarised": "vulgarized", | |
| "vulgarises": "vulgarizes", | |
| "vulgarising": "vulgarizing", | |
| "waggon": "wagon", | |
| "waggons": "wagons", | |
| "watercolour": "watercolor", | |
| "watercolours": "watercolors", | |
| "weaselled": "weaseled", | |
| "weaselling": "weaseling", | |
| "westernisation": "westernization", | |
| "westernise": "westernize", | |
| "westernised": "westernized", | |
| "westernises": "westernizes", | |
| "westernising": "westernizing", | |
| "womanise": "womanize", | |
| "womanised": "womanized", | |
| "womaniser": "womanizer", | |
| "womanisers": "womanizers", | |
| "womanises": "womanizes", | |
| "womanising": "womanizing", | |
| "woollen": "woolen", | |
| "woollens": "woolens", | |
| "woollies": "woolies", | |
| "woolly": "wooly", | |
| "worshipped": "worshiped", | |
| "worshipper": "worshiper", | |
| "worshipping": "worshiping", | |
| "yodelled": "yodeled", | |
| "yodelling": "yodeling", | |
| "yoghourt": "yogurt", | |
| "yoghourts": "yogurts", | |
| "yoghurt": "yogurt", | |
| "yoghurts": "yogurts", | |
| } | |
# Non-ASCII letters that "NFKD" normalization does not decompose into a base
# letter plus combining marks, so they are mapped to ASCII replacements by hand.
# Letter case is preserved: every upper-case key maps to an upper-case value.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # was "th"; upper-case thorn now keeps its case like the other entries
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and blank out marks, symbols, and punctuation.

    The input is first decomposed with "NFKD" normalization, then each
    resulting character is handled by the first rule that matches:

    1. characters contained in `keep` are passed through untouched;
    2. characters listed in `ADDITIONAL_DIACRITICS` are replaced by their
       manual ASCII mapping;
    3. non-spacing marks (Unicode category "Mn") are dropped entirely;
    4. any other mark, symbol, or punctuation character (category starting
       with "M", "S", or "P") becomes a single space;
    5. everything else is kept as is.

    Args:
        s: the text to clean.
        keep: container of characters that must never be altered.

    Returns:
        The cleaned string.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
            continue
        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue

        category = unicodedata.category(ch)
        if category == "Mn":
            # combining accent split off by NFKD: remove without leaving a gap
            continue
        pieces.append(" " if category[0] in "MSP" else ch)

    return "".join(pieces)
def remove_symbols(s: str):
    """
    Blank out marks, symbols, and punctuation while keeping diacritics.

    The input is composed with "NFKC" normalization; every character whose
    Unicode category starts with "M", "S", or "P" is then replaced by a single
    space, and all other characters are kept unchanged.
    """
    cleaned = []
    for ch in unicodedata.normalize("NFKC", s):
        is_symbol = unicodedata.category(ch)[0] in "MSP"
        cleaned.append(" " if is_symbol else ch)
    return "".join(cleaned)
class BasicTextNormalizer:
    """
    Callable text normalizer: lower-cases, removes bracketed spans, and
    replaces symbols/punctuation with spaces.

    Args:
        remove_diacritics: when true, clean with `remove_symbols_and_diacritics`
            (diacritics are dropped); otherwise clean with `remove_symbols`
            (diacritics are kept).
        split_letters: when true, put a space between every extended grapheme
            cluster of the cleaned text.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()
        # drop anything enclosed in <...> or [...]
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # drop anything enclosed in (...)
        text = re.sub(r"\(([^)]+?)\)", "", text)
        text = self.clean(text).lower()

        if self.split_letters:
            # \X matches one extended grapheme cluster (needs the `regex` module)
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)

        # collapse every run of whitespace into one space
        return re.sub(r"\s+", " ", text)
class EnglishNumberNormalizer:
    """
    Convert spelled-out English numbers into arabic numerals, while handling:

    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - turn currency words that follow a number into a symbol placed before it,
      e.g. `twenty million dollars` -> `$20000000`
    - turn `percent` / `per cent` after a number into a trailing `%`
    - spell out `one` and `ones` (a lone `1` / `1s` is written back as a word)
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    - rewrite `<number> and a half` as `<number> point five`

    Instances are callable: `normalizer(text)` runs `preprocess`,
    `process_words` and `postprocess` in that order.
    """

    def __init__(self):
        super().__init__()

        self.zeros = {"o", "oh", "zero"}
        # fmt: off
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
                    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
                    "eighteen", "nineteen",
                ],
                start=1,
            )
        }
        # fmt: on
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            # "ninth" is irregular: the generic rule below only produces "nineth"
            "ninth": (9, "th"),
            "twelfth": (12, "th"),
            # regular ordinals; this still generates "nineth", which is kept
            # so that the previously accepted spelling keeps working
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th")
            for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {
            **self.multipliers_plural,
            **self.multipliers_ordinal,
        }

        # words that may directly follow "point" / precede "and a half"
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        # sign words spoken before a number
        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        # currency words spoken after a number but written before it
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        self.prefixes = set(
            list(self.preceding_prefixers.values())
            + list(self.following_prefixers.values())
        )
        # a dict value means "only when followed by one of these words"
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        self.specials = {"and", "double", "triple", "point"}

        # every word that takes part in number parsing
        self.words = {
            key
            for mapping in [
                self.zeros,
                self.ones,
                self.ones_suffixed,
                self.tens,
                self.tens_suffixed,
                self.multipliers,
                self.multipliers_suffixed,
                self.preceding_prefixers,
                self.following_prefixers,
                self.suffixers,
                self.specials,
            ]
            for key in mapping
        }
        # not referenced by the methods of this class; kept as a public attribute
        self.literal_words = {"one", "ones"}

    def process_words(self, words: List[str]) -> Iterator[str]:
        """
        Walk over `words` and yield output tokens with number words folded
        into arabic numerals.

        The generator keeps a pending `value` (an `int` while the number is
        being built arithmetically, a `str` once digits are being concatenated
        or a decimal point is involved) and a pending `prefix` (sign or
        currency symbol). Both are flushed by `output`.

        Raises:
            ValueError: if a token that is registered in `self.words` is not
                handled by any branch, or a numeric token cannot be parsed.
        """
        numeric = r"^\d+(\.\d+)?$"
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False

        def to_fraction(s: str):
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # emit `result` with the pending prefix and reset the parser state
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        def pending() -> str:
            # Digits collected so far. Tests `is None` rather than truthiness
            # so that a pending integer 0 is not lost ("0 point 5" -> "0.5").
            return "" if value is None else str(value)

        if not words:
            return

        last_index = len(words) - 1
        for i, current in enumerate(words):
            prev_word = words[i - 1] if i != 0 else None
            next_word = words[i + 1] if i != last_index else None
            if skip:
                # this token was already consumed by the previous one
                skip = False
                continue

            next_is_numeric = next_word is not None and re.match(numeric, next_word)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(numeric, current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                if f is None:
                    raise ValueError("Converting the fraction failed")

                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                value = pending() + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev_word in self.ones:
                    if (
                        prev_word in self.tens and ones < 10
                    ):  # replace the last zero with the digit
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or plural; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev_word in self.ones:
                    if prev_word in self.tens and ones < 10:
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or plural; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # scale only the part below one thousand
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if next_word in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        if next_word in suffix:
                            yield output(str(value) + suffix[next_word])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next_word not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev_word not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    if next_word in self.ones or next_word in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next_word, 0)
                        value = pending() + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    if next_word in self.decimals or next_is_numeric:
                        value = pending() + "."
                    else:
                        # followed by a number word that cannot continue a
                        # decimal ("point and ...", "point hundred"): keep
                        # "point" as a literal word instead of dropping it
                        if value is not None:
                            yield output(value)
                        yield output(current)
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        if value is not None:
            yield output(value)

    def preprocess(self, s: str):
        """
        Prepare `s` for tokenisation: rewrite "<number> and a half" as
        "<number> point five" and normalise spacing at letter/digit boundaries.
        """
        # replace "<number> and a half" with "<number> point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        last_index = len(segments) - 1
        for i, segment in enumerate(segments):
            is_blank = not segment.strip()
            if not is_blank:
                results.append(segment)
            if i == last_index:
                continue

            # An "and a half" match followed this segment. Re-emit it even
            # when nothing precedes it, so the phrase is never deleted.
            last_word = None if is_blank else segment.rsplit(maxsplit=2)[-1]
            if last_word in self.decimals or last_word in self.multipliers:
                results.append("point five")
            else:
                results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str):
        """
        Tidy the joined output: merge "<currency><n> [and] ¢<c>" into a
        decimal amount, turn "<currency>0.<c>" into cents, and write a lone
        "1" / "1s" back as "one" / "ones".
        """

        def combine_cents(m: Match):
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                # leave the matched text untouched
                return m.group(0)

        def extract_cents(m: Match):
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                # leave the matched text untouched
                return m.group(0)

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        # the dot is escaped so only a real decimal point qualifies
        s = re.sub(r"[€£$]0\.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str):
        """Normalize the numbers in `s` and return the resulting string."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
class EnglishSpellingNormalizer:
    """
    Rewrites whitespace-separated words through a spelling table, as used for
    the British-American spelling mappings listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self, english_spelling_mapping):
        # mapping of source spelling -> replacement spelling
        self.mapping = english_spelling_mapping

    def __call__(self, s: str):
        """
        Return `s` with every word found in the mapping replaced; words that
        are not in the mapping pass through. Words are re-joined with single
        spaces.
        """
        lookup = self.mapping.get
        normalized = [lookup(token, token) for token in s.split()]
        return " ".join(normalized)
class EnglishTextNormalizer:
    """
    Full English normalization pipeline: lowercasing, removal of bracketed
    spans and filler words, contraction/title expansion, symbol cleanup,
    number standardization and spelling standardization.
    """

    def __init__(self, english_spelling_mapping=abbr):
        # filler words removed outright
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"

        # common contractions
        contractions = {
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
        }

        # contractions in titles/prefixes; each expansion carries a trailing space
        title_words = {
            "mr": "mister",
            "mrs": "missus",
            "st": "saint",
            "dr": "doctor",
            "prof": "professor",
            "capt": "captain",
            "gov": "governor",
            "ald": "alderman",
            "gen": "general",
            "sen": "senator",
            "rep": "representative",
            "pres": "president",
            "rev": "reverend",
            "hon": "honorable",
            "asst": "assistant",
            "assoc": "associate",
            "lt": "lieutenant",
            "col": "colonel",
            "jr": "junior",
            "sr": "senior",
            "esq": "esquire",
        }
        titles = {rf"\b{short}\b": f"{full} " for short, full in title_words.items()}

        # perfect tenses, ideally it should be any past participles, but it's harder..
        perfect_tenses = {
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
        }

        # general contractions
        general = {
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }

        # insertion order matters: the patterns are applied one after another,
        # specific forms before the general ones
        self.replacers = {**contractions, **titles, **perfect_tenses, **general}

        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping)

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()

        # remove words between brackets, words between parentheses, then fillers
        for pattern in (r"[<\[][^>\]]*[>\]]", r"\(([^)]+?)\)", self.ignore_patterns):
            text = re.sub(pattern, "", text)

        # standardize when there's a space before an apostrophe
        text = re.sub(r"\s+'", "'", text)

        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        # remove commas between digits
        text = re.sub(r"(\d),(\d)", r"\1\2", text)
        # remove periods not followed by numbers
        text = re.sub(r"\.([^0-9]|$)", r" \1", text)
        # keep some symbols for numerics
        text = remove_symbols_and_diacritics(text, keep=".%$¢€£")

        text = self.standardize_spellings(self.standardize_numbers(text))

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        text = re.sub(r"[.$¢€£]([^0-9])", r" \1", text)
        text = re.sub(r"([^0-9])%", r"\1 ", text)

        # replace any successive whitespace characters with a space
        return re.sub(r"\s+", " ", text)
# Module-level EnglishTextNormalizer instance built with the default spelling mapping (`abbr`).
| text_normalizer = EnglishTextNormalizer() | |