diff --git a/backend/oasst_backend/tree_manager.py b/backend/oasst_backend/tree_manager.py index 45fcc69555..5297cde8d0 100644 --- a/backend/oasst_backend/tree_manager.py +++ b/backend/oasst_backend/tree_manager.py @@ -1085,7 +1085,7 @@ def _query_need_review( def query_prompts_need_review(self, lang: str) -> list[Message]: """ - Select initial prompt messages with less then required rankings in active message tree + Select initial prompt messages with less than required rankings in active message tree (active == True in message_tree_state) """ return self._query_need_review( @@ -1094,7 +1094,7 @@ def query_prompts_need_review(self, lang: str) -> list[Message]: def query_replies_need_review(self, lang: str) -> list[Message]: """ - Select child messages (parent_id IS NOT NULL) with less then required rankings + Select child messages (parent_id IS NOT NULL) with less than required rankings in active message tree (active == True in message_tree_state) """ return self._query_need_review(message_tree_state.State.GROWING, self.cfg.num_reviews_reply, False, lang) diff --git a/data/datasets/TSSB-3M/generate_dataset.py b/data/datasets/TSSB-3M/generate_dataset.py index ec785e83bd..e182d79fd4 100644 --- a/data/datasets/TSSB-3M/generate_dataset.py +++ b/data/datasets/TSSB-3M/generate_dataset.py @@ -117,7 +117,7 @@ def clean(text): def clean_PII(text): - # Remove sign-off messege generated by `git commit --signoff`, eg. "Signed-off-by: user_name " + # Remove sign-off message generated by `git commit --signoff`, eg. 
"Signed-off-by: user_name " signoff_index = text.rfind("\n\nSigned-off-by:") if signoff_index != -1: # Remove the sign-off string from the commit message diff --git a/data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb b/data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb index 0610cfb00d..c392c33df2 100644 --- a/data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb +++ b/data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb @@ -26,7 +26,7 @@ "id": "K9sCPQzIb278" }, "source": [ - "### DOWLOAD THE DATASET" + "### DOWNLOAD THE DATASET" ] }, { @@ -156,7 +156,7 @@ "id": "3MxfnNxX2n0m" }, "source": [ - "### GENERATE THE SUMMARIES AND ANOTATE THE DATASET" + "### GENERATE THE SUMMARIES AND ANNOTATE THE DATASET" ] }, { diff --git a/data/datasets/recipes/tasty_recipes.ipynb b/data/datasets/recipes/tasty_recipes.ipynb index b064172c92..cacb6acdb6 100644 --- a/data/datasets/recipes/tasty_recipes.ipynb +++ b/data/datasets/recipes/tasty_recipes.ipynb @@ -158,7 +158,7 @@ " for i, instruction in enumerate(ingredient_and_instructions[row[\"slug\"]][\"instructions\"]):\n", " instructions += f\"\\n{i+1}. 
{convert_fraction_unicode_chars_to_strings(instruction['display_text'])}\"\n", "\n", - " # Constuct the full response\n", + " # Construct the full response\n", " response = f\"\"\"Here's a recipe for {recipe_name}:\n", "\n", "Ingredients:\n", diff --git a/data/datasets/safety_directory/child_help/child_help.py b/data/datasets/safety_directory/child_help/child_help.py index 62954a22a3..eccca36cf8 100644 --- a/data/datasets/safety_directory/child_help/child_help.py +++ b/data/datasets/safety_directory/child_help/child_help.py @@ -195,7 +195,7 @@ "Ligne Verte 147 Madagascar": { "region": "Madagascar", "page": "https://childhelplineinternational.org/madagascar-ligne-verte-147-madagascar/", - "description": "Ligne Verte 147 is a child helpline for reporting cases of mistreatment, violence, abuse and exploitation against children and is is free, available 24/7 and accessible everywhere in Madagascar.", + "description": "Ligne Verte 147 is a child helpline for reporting cases of mistreatment, violence, abuse and exploitation against children and is free, available 24/7 and accessible everywhere in Madagascar.", "contacts": { "Website": {"type": "website", "link": "https://arozaza.mg/"}, "147": {"type": "phone", "link": "tel:147"}, @@ -529,7 +529,7 @@ "Línea Libre": { "region": "Chile", "page": "https://childhelplineinternational.org/chile-linea-libre/", - "description": "Línea Libre is is a psychological support channel aimed at girls, boys and young people, which is attended directly by psychologists trained to contain, guide, intervene in crises, and address mental health concerns or rights violations. It is available Monday to Saturday from 10:00 a.m. to 10:00 p.m. 
through three channels: phone email, and chat via our app.", + "description": "Línea Libre is a psychological support channel aimed at girls, boys and young people, which is attended directly by psychologists trained to contain, guide, intervene in crises, and address mental health concerns or rights violations. It is available Monday to Saturday from 10:00 a.m. to 10:00 p.m. through three channels: phone email, and chat via our app.", "contacts": { "Website": {"type": "website", "link": "http://www.linealibre.cl/"}, "1515": {"type": "phone", "link": "tel:1515"}, @@ -2110,7 +2110,7 @@ "Hotline 919": { "region": "Qatar", "page": "https://childhelplineinternational.org/qatar-hotline-919/", - "description": "Hotline 919 provides provides free confidential consultations (social, psychological and legal) for women and children and also provides support to protect and rehabilitate children and women who are victims of violence and family breakdown.", + "description": "Hotline 919 provides free confidential consultations (social, psychological and legal) for women and children and also provides support to protect and rehabilitate children and women who are victims of violence and family breakdown.", "contacts": { "Website": {"type": "website", "link": "http://www.aman.org.qa/"}, "919": {"type": "phone", "link": "tel:919"}, diff --git a/data/datasets/safety_directory/emergency_infos/wikipedia_emergency_info.js b/data/datasets/safety_directory/emergency_infos/wikipedia_emergency_info.js index a36cd5a5d3..f0feca9ad5 100644 --- a/data/datasets/safety_directory/emergency_infos/wikipedia_emergency_info.js +++ b/data/datasets/safety_directory/emergency_infos/wikipedia_emergency_info.js @@ -1,5 +1,5 @@ /** - * Developper console script used to generate the associated json file. + * Developer console script used to generate the associated json file. 
* Wikipedia URL : https://en.wikipedia.org/wiki/List_of_suicide_crisis_lines * Author : Lucas Oulieu */ diff --git a/data/datasets/tv_dialogue/README.md b/data/datasets/tv_dialogue/README.md index 2df7d03d9d..d607c2952e 100644 --- a/data/datasets/tv_dialogue/README.md +++ b/data/datasets/tv_dialogue/README.md @@ -47,7 +47,7 @@ How's it going? on Huggingface! They are examples on Huggingface. -CUT OUT TO ANOTHER SCENCE +CUT OUT TO ANOTHER SCENE We are somewhere else [PERSON 1 (v.o)] I wonder where we are? diff --git a/docs/docs/architecture/inference.md b/docs/docs/architecture/inference.md index 02daf6cc6c..3b6c83d104 100644 --- a/docs/docs/architecture/inference.md +++ b/docs/docs/architecture/inference.md @@ -111,7 +111,7 @@ The inference server is built around [FastAPI](https://fastapi.tiangolo.com/). for any other currently pending messages in the chat to `inference.MessageState.cancelled`. 3. After updating the `message` table, we create a RedisQueue for this - specific message and enque the message. + specific message and enqueue the message. 4. Finally, we return an `inference.MessageRead` (a Pydantic model) to the client. This is the object contains the needed `message_id`. 
diff --git a/inference/server/oasst_inference_server/compliance.py b/inference/server/oasst_inference_server/compliance.py index 0950a9cc8b..625ee3af9c 100644 --- a/inference/server/oasst_inference_server/compliance.py +++ b/inference/server/oasst_inference_server/compliance.py @@ -57,7 +57,7 @@ async def run_compliance_check(websocket: fastapi.WebSocket, worker_id: str, wor Run a compliance check for the given worker: - Find a suitable compliance check assistant message - Task the worker with generating a response with the same context - - Compare the respons against the existing completed message + - Compare the response against the existing completed message - Update the database with the outcome """ async with deps.manual_create_session() as session: diff --git a/model/model_eval/manual/create_synth_import.py b/model/model_eval/manual/create_synth_import.py index 5f5760765d..2572cd6359 100644 --- a/model/model_eval/manual/create_synth_import.py +++ b/model/model_eval/manual/create_synth_import.py @@ -83,7 +83,7 @@ def main(): reply_texts.add(m.text) if len(unique_replies) < 2: - print("Skipping enty with < 2 unique replies") + print("Skipping entry with < 2 unique replies") continue prompt_message = ExportMessageNode( diff --git a/model/model_training/custom_datasets/formatting.py b/model/model_training/custom_datasets/formatting.py index a5bfdc7394..e3c7ec2e29 100644 --- a/model/model_training/custom_datasets/formatting.py +++ b/model/model_training/custom_datasets/formatting.py @@ -74,7 +74,7 @@ def system_tag( shuffle(properties) - # ensure that potentially multi-line conext field comes last + # ensure that potentially multi-line context field comes last if self.context: properties.append(("context", self.context)) diff --git a/model/model_training/models/__init__.py b/model/model_training/models/__init__.py index 8ada11ab17..44c0968ee4 100644 --- a/model/model_training/models/__init__.py +++ b/model/model_training/models/__init__.py @@ -2,7 +2,7 @@ def 
freeze_top_n_layers(model, target_layers): - # its possible we can simply detect which module is a ModuleList + # it's possible we can simply detect which module is a ModuleList # and simply freeze the module without doing string parsing for name, param in model.named_parameters(): if "embed" in name: diff --git a/model/model_training/models/patching_falcon.py b/model/model_training/models/patching_falcon.py index 292c9aa13a..dbf6e8de28 100644 --- a/model/model_training/models/patching_falcon.py +++ b/model/model_training/models/patching_falcon.py @@ -19,7 +19,7 @@ def falcon_forward_with_flash_attn( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ head_mask, alibi & output_attention are not supported. - Reference to the original `FalconAttention.forwad()` method which this patch replaces: + Reference to the original `FalconAttention.forward()` method which this patch replaces: https://github.com/huggingface/transformers/blob/c965d302791cf935d6ea7776428749be678cf509/src/transformers/models/falcon/modeling_falcon.py#L281 """ diff --git a/model/pretokenizer/README.md b/model/pretokenizer/README.md index 6c89e8da51..9429e652d4 100644 --- a/model/pretokenizer/README.md +++ b/model/pretokenizer/README.md @@ -19,7 +19,7 @@ python -m pip install ../../oasst-data/ ### Configuration -The datamix to proces can be configured with one or multiple sections in the +The datamix to process can be configured with one or multiple sections in the `configs/pretokenize.yaml` file. 
### Example usage diff --git a/model/pretokenizer/tokenizer.py b/model/pretokenizer/tokenizer.py index 5ee75a32a4..18e53851a6 100644 --- a/model/pretokenizer/tokenizer.py +++ b/model/pretokenizer/tokenizer.py @@ -310,7 +310,7 @@ def bos(self): def eod(self): if self._eod_id is not None: return self._eod_id - return self._eos_id # in case noe eod we can patch this up with an eos + return self._eos_id # in case no eod we can patch this up with an eos @property def eos_token_id(self): diff --git a/notebooks/TSSB-3M-bugs-dataset/TSSB-3M-bugs_dataset.ipynb b/notebooks/TSSB-3M-bugs-dataset/TSSB-3M-bugs_dataset.ipynb index a5577e3c86..a1bf0e25a6 100644 --- a/notebooks/TSSB-3M-bugs-dataset/TSSB-3M-bugs_dataset.ipynb +++ b/notebooks/TSSB-3M-bugs-dataset/TSSB-3M-bugs_dataset.ipynb @@ -707,10 +707,10 @@ "\n", "g = Github()\n", "\n", - "# TO DO, find a way to get a commmit from SHA\n", + "# TO DO, find a way to get a commit from SHA\n", "# 1. Use GitHub API\n", "# 2. Download repos with their history\n", - "# 3. Web scaping" + "# 3. 
Web scraping" ] }, { diff --git a/notebooks/data-augmentation/essay-revision/essay-revision.ipynb b/notebooks/data-augmentation/essay-revision/essay-revision.ipynb index 667a5cd597..3c22fb7778 100644 --- a/notebooks/data-augmentation/essay-revision/essay-revision.ipynb +++ b/notebooks/data-augmentation/essay-revision/essay-revision.ipynb @@ -201,7 +201,7 @@ }, "outputs": [], "source": [ - "# Make grammar erros (more like: change random words into words of similar meaning)\n", + "# Make grammar errors (more like: change random words into words of similar meaning)\n", "import nltk\n", "from nltk.corpus import wordnet\n", "import random\n", diff --git a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb index ce9cbed6e8..56478a306f 100644 --- a/notebooks/data-augmentation/unified-qa/unified-qa.ipynb +++ b/notebooks/data-augmentation/unified-qa/unified-qa.ipynb @@ -1004,7 +1004,7 @@ "metadata": {}, "outputs": [], "source": [ - "random.seed(20) # for reproduciablity" + "random.seed(20) # for reproducibility" ] }, { @@ -1025,7 +1025,7 @@ "def convert_unified_qa(dataset_url):\n", " # download using pandas\n", " ds = pd.read_csv(dataset_url, on_bad_lines=\"skip\", names=[\"Question\", \"Answer\"], sep=\"\\t\")\n", - " # get name for metatdata\n", + " # get name for metadata\n", " ds_name = dataset_url.split(\"/unifiedqa/data/\")[1].split(\"/\")[0]\n", " # get conversation templates list\n", " conv_funcs = converter_functions[ds_name]\n", @@ -1038,7 +1038,7 @@ " answer = item.Answer\n", " if question == np.nan or answer == np.nan:\n", " print(\"Skipped\")\n", - " # get a random conversation generatore function\n", + " # get a random conversation generator function\n", " conv_func = random.choice(conv_funcs)\n", " try:\n", " conv_list = conv_func(question, answer)\n", diff --git a/notebooks/data-augmentation/wikidata-qa/wikidata.ipynb b/notebooks/data-augmentation/wikidata-qa/wikidata.ipynb index 
7a6662b16d..b4403f987e 100644 --- a/notebooks/data-augmentation/wikidata-qa/wikidata.ipynb +++ b/notebooks/data-augmentation/wikidata-qa/wikidata.ipynb @@ -810,7 +810,7 @@ " \"{sub} is used mostly for {a}.\",\n", " \"{name} is mostly known for {a}.\",\n", " ],\n", - " \"P487\": [\"{a}\", \"The {name} emoji is {a}.\", \"The {a} character repesents {name}.\"],\n", + " \"P487\": [\"{a}\", \"The {name} emoji is {a}.\", \"The {a} character represents {name}.\"],\n", " \"P509\": [\"{name} died of {a}.\", \"The cause of {pos} death was {a}.\"],\n", " \"P527\": [\"{name} are made of {a}.\", \"They are made of {a}.\"],\n", " \"P569\": [\"{name} was born on {a}.\", \"{pos} birthday is on the {a}.\"],\n", @@ -828,12 +828,12 @@ " ],\n", " \"P580\": [\"{name} started in {a}.\", \"{name} first started at {a}.\"],\n", " \"P582\": [\"{name} ended in {a}.\", \"{name} lasted until {a}.\"],\n", - " \"P625\": [\"{name} is lcoated at {a}.\", \"The coordinates for {name} are {a}.\", \"{pos} GPS location is {a}.\"],\n", + " \"P625\": [\"{name} is located at {a}.\", \"The coordinates for {name} are {a}.\", \"{pos} GPS location is {a}.\"],\n", " \"P837\": [\"{name} is celebrated on {a}.\", \"{name} is on {a}.\"],\n", " \"P856\": [\n", " \"The URL for {name} is: {a}\",\n", " \"See {a}\",\n", - " \"The URL of {pos} webiste is {a}\",\n", + " \"The URL of {pos} website is {a}\",\n", " \"{pos} web address is: {a}\",\n", " ],\n", " \"P973\": [\n", @@ -855,7 +855,7 @@ " \"P2043\": [\"{name} is {a} long.\", \"{sub} has a length of {a}.\"],\n", " \"P2044\": [\"{name} is {a} tall.\", \"{name} is {a} above sea level.\", \"{pos} elevation is {a}.\"],\n", " \"P2046\": [\"{name}'s area is {a}\", \"{pos} area is {a}.\"],\n", - " \"P2049\": [\"{name}'s widht is {a}.\", \"{name} is {a} wide.\"],\n", + " \"P2049\": [\"{name}'s width is {a}.\", \"{name} is {a} wide.\"],\n", " \"P2250\": [\"{name} have a life expectancy of {a}.\", \"{pos} life expectancy is about {a}.\"],\n", " \"P2283\": [\n", " \"{name} 
uses {a} to work.\",\n", @@ -887,20 +887,20 @@ " \"{pos} {l} children are {a}.\",\n", " ],\n", " \"P50\": [\"{name} was co-written by {a}.\", \"The authors of {name} are {a}.\"],\n", - " \"P57\": [\"{name} was direcrted by the following people: {a}.\", \"{a} were the directors of {name}.\"],\n", + " \"P57\": [\"{name} was directed by the following people: {a}.\", \"{a} were the directors of {name}.\"],\n", " \"P61\": [\"{pos} inventors are {a}.\", \"{name} was discovered by {a}.\"],\n", " \"P106\": [\"{name} has multiple occupations: {a}.\", \"{name}'s job titles are: {a}.\"],\n", " \"P169\": [\"{name} is the CEO of multiple companies, such as {a}.\", \"{sub} is the CEO at {a}.\"],\n", " \"P225\": [\"The taxon names for {name} are {a}.\", \"The proper scientific terms for {name} are {a}.\"],\n", " \"P246\": [\"The elements of {name} are {a}.\", \"The symbols for {name} are {a}.\"],\n", " \"P274\": [\"The formulas for {name} are {a}.\", \"The chemical formulas of the compound {name} are {a}.\"],\n", - " \"P487\": [\"The {name} emojis are {a}.\", \"The characters {a} repesent {name}.\"],\n", + " \"P487\": [\"The {name} emojis are {a}.\", \"The characters {a} represent {name}.\"],\n", " \"P527\": [\"The ingredients of {name} are {a}.\", \"{a} are all parts needed for {name}.\"],\n", " \"P575\": [\n", " \"Sources disagree on the exact date, it is said that {name} was invented in {a}.\",\n", " \"{name} was discovered multiple times at {a}.\",\n", " ],\n", - " \"P856\": [\"The URLs for {name} are: {a}\", \"See {a}\", \"The URLs of {pos} webiste are {a}\"],\n", + " \"P856\": [\"The URLs for {name} are: {a}\", \"See {a}\", \"The URLs of {pos} website are {a}\"],\n", " \"P625\": [\n", " \"{name} can be found under the following GPS locations: {a}.\",\n", " \"The coordinates for {name} are {a}.\",\n", diff --git a/notebooks/detoxify-evaluation/detoxify-evaluation.ipynb b/notebooks/detoxify-evaluation/detoxify-evaluation.ipynb index 5a380ea967..26f4344b20 100644 --- 
a/notebooks/detoxify-evaluation/detoxify-evaluation.ipynb +++ b/notebooks/detoxify-evaluation/detoxify-evaluation.ipynb @@ -327,11 +327,11 @@ "\n", "| Model name | Not obviously toxic| Not obviously non-toxic | Obviously toxic| Obviously non-toxic|\n", "| :---: | :---: | :---: |:---: | :---: |\n", - "|original| failed at all, easily accepted racist, sexist overally toxic prompts that were well formulated |Very sensitive on swear words, failed to reckognize context| good performance|good performance|\n", - "|unbiased|Managed to find some hidden toxicity but not on all sentences| Very sensitive explicit language but shown ability to recognize context| Did well but failed to reckognize some gender stereotype mockery | good performance\n", - "|multilingual|Managed to find some hidden toxicity but not on all sentences| Very sensitive explicit language but shown ability to recognize context| Did well but failed to reckognize some gender stereotype mockery | good performance\n", + "|original| failed at all, easily accepted racist, sexist overly toxic prompts that were well formulated |Very sensitive on swear words, failed to recognize context| good performance|good performance|\n", + "|unbiased|Managed to find some hidden toxicity but not on all sentences| Very sensitive explicit language but shown ability to recognize context| Did well but failed to recognize some gender stereotype mockery | good performance\n", + "|multilingual|Managed to find some hidden toxicity but not on all sentences| Very sensitive explicit language but shown ability to recognize context| Did well but failed to recognize some gender stereotype mockery | good performance\n", "\n", - "Subjectivly 'unbiased' looks like the best performing model. \n", + "Subjectively 'unbiased' looks like the best performing model. 
\n", "\n", "I don't think it would do well as a security layer in a live version of open assistant unless we do some finetuning first, because it can be fooled to pass toxicity if it's presented in formal language. \n", "\n", diff --git a/notebooks/diverse/diverse.ipynb b/notebooks/diverse/diverse.ipynb index 31e29acf63..ec3e761b59 100644 --- a/notebooks/diverse/diverse.ipynb +++ b/notebooks/diverse/diverse.ipynb @@ -164,7 +164,7 @@ " answers = re.findall(r\"Answer:?(.*?)#\", item.replace(\"\\n\", \" \"))\n", " questions = re.findall(r\"Question:?(.*?) Answer:\", item.replace(\"\\n\", \" \"))\n", "\n", - " # The last question does not contain an aswer so we drop it every time.\n", + " # The last question does not contain an answer so we drop it every time.\n", " if len(answers) < len(questions):\n", " questions.pop(-1)\n", "\n", diff --git a/website/src/components/Chat/ChatConversationTree.tsx b/website/src/components/Chat/ChatConversationTree.tsx index c83ec1f133..95eb9c15da 100644 --- a/website/src/components/Chat/ChatConversationTree.tsx +++ b/website/src/components/Chat/ChatConversationTree.tsx @@ -111,7 +111,7 @@ const TreeChildren = ({ {...props} canRetry={isLeaf} showEncourageMessage={props.showEncourageMessage && isLeaf} - // TODO refacor away from this dirty hack + // TODO refactor away from this dirty hack id={isLeaf && currentTree.role === "assistant" ? LAST_ASSISTANT_MESSAGE_ID : undefined} data-id={currentTree.id} pagingSlot={ diff --git a/website/src/components/Messages/LabelInputGroup.tsx b/website/src/components/Messages/LabelInputGroup.tsx index b3e983900c..a864627853 100644 --- a/website/src/components/Messages/LabelInputGroup.tsx +++ b/website/src/components/Messages/LabelInputGroup.tsx @@ -32,7 +32,7 @@ interface LabelInputGroupProps { * * Note that Label is a type that include a name, like "spam" or "fails_task", and a widget value, * like "yes_no". 
- * The LabelYesNoGroup will then look for spam.question or fails_task.qustion strings in the translation files. + * The LabelYesNoGroup will then look for spam.question or fails_task.question strings in the translation files. * */ export const LabelInputGroup = ({ diff --git a/website/src/components/Tasks/Task/Task.tsx b/website/src/components/Tasks/Task/Task.tsx index 2721a2952a..161deaabcb 100644 --- a/website/src/components/Tasks/Task/Task.tsx +++ b/website/src/components/Tasks/Task/Task.tsx @@ -99,7 +99,7 @@ export const Task = () => { case "DEFAULT_WARN": return { mode: "EDIT", replyValidity: "DEFAULT" }; case "SUBMITTED": - // allow return to edit from subbmitted mode (error happen during submitting task) + // allow return to edit from submitted mode (an error happened while submitting the task) return { mode: "EDIT", replyValidity: "VALID" }; default: return status; diff --git a/website/src/lib/oasst_api_client.ts b/website/src/lib/oasst_api_client.ts index 61b7a27cec..71650d35d0 100644 --- a/website/src/lib/oasst_api_client.ts +++ b/website/src/lib/oasst_api_client.ts @@ -230,7 +230,7 @@ export class OasstApiClient { } /** - * Modify a message's content and save it's previous content as a revision + * Modify a message's content and save its previous content as a revision */ async edit_message(message_id: string, user: BackendUserCore, new_content: string) { return this.post(`/api/v1/messages/${message_id}/edit`, {