[ci] Introduce typos pre-commit hook #6564

Open · wants to merge 9 commits into base: master
2 changes: 1 addition & 1 deletion .github/workflows/lock.yml
@@ -39,7 +39,7 @@ jobs:
This pull request has been automatically locked since there has not been any recent activity since it was closed.
To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues
including a reference to this.
# what shoulld the locking status be?
# what should the locking status be?
issue-lock-reason: 'resolved'
pr-lock-reason: 'resolved'
process-only: 'issues, prs'
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -35,3 +35,9 @@ repos:
- id: ruff-format
args: ["--config", "python-package/pyproject.toml"]
types_or: [python, jupyter]
- repo: https://github.com/crate-ci/typos
rev: v1.23.2
hooks:
- id: typos
args: ["--force-exclude"]
exclude: (\.gitignore$)|(^\.editorconfig$)
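
(Side note for anyone trying this locally: once the hook is installed, something like `pre-commit run typos --all-files` should check the whole repo. The `--force-exclude` argument is presumably there so that typos still honors its exclude patterns even when pre-commit passes file paths explicitly — worth double-checking against the typos docs.)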
21 changes: 21 additions & 0 deletions .typos.toml
@@ -0,0 +1,21 @@
default.extend-ignore-re = [
"/Ot",
"mis-alignment",
"mis-spelled",
"posix-seh-rt",
]

[default.extend-words]
MAPE = "MAPE"
datas = "datas"
interprete = "interprete"

Collaborator:
LGTM except leaving "interprete"!
See #6564 (comment) for my proposal.

Collaborator:
Let's use this thread to discuss that proposal.

Please remember that breaking the public API of an R package is more difficult than doing it for a Python package, because of CRAN's restrictions.

CRAN will check the "strong reverse dependencies" (anything listing lightgbm in LinkingTo, Depends, or Imports, I think) to see whether a new submission breaks any of their builds or tests. This is why for the v4.0.0 release, I went around to all the packages listed at https://cran.r-project.org/web/packages/lightgbm/index.html as depending on {lightgbm} and made contributions to make them compatible with both v3.x and v4.x of {lightgbm}.

HOWEVER.... it does not look like any package on CRAN is using lgb.interprete(): https://github.com/search?q=org%3Acran%20%22lgb.interprete%22&type=code

So I'm ok with @StrikerRUS's proposal, which is:

- introduce a new function lgb.interpret() which just calls lgb.interprete()
- raise a deprecation warning in direct calls to lgb.interprete()
- eventually remove lgb.interprete()

@borchero if you are not comfortable making those R changes (like updating the NAMESPACE file or writing roxygen comments), let me know and I can do this in a follow-up PR.
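
For reference, a rough sketch of how that proposal could look on the R side (illustrative only: the internal helper name is made up, the signature is abbreviated from the current docs, and none of this is code from this PR):

```r
# Sketch of the proposed deprecation path. The existing body of
# lgb.interprete() would move into an internal helper, so the
# deprecation warning only fires on direct calls to the old name.
.interprete_impl <- function(model, data, idxset, num_iteration = NULL) {
  stop("placeholder for the existing lgb.interprete() implementation")
}

# new public name
lgb.interpret <- function(model, data, idxset, num_iteration = NULL) {
  .interprete_impl(model, data, idxset, num_iteration = num_iteration)
}

# old name kept for now, with a deprecation warning
lgb.interprete <- function(model, data, idxset, num_iteration = NULL) {
  warning("lgb.interprete() is deprecated; use lgb.interpret() instead.")
  .interprete_impl(model, data, idxset, num_iteration = num_iteration)
}
```

A plain identity wrapper (lgb.interpret() calling lgb.interprete() directly) would also work, but then the warning would fire through the new name too — hence the helper.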

Collaborator:
> Please remember that breaking the public API of an R package is more difficult than doing it for a Python package, because of CRAN's restrictions.

Ah, to be honest, I forgot about this!

> HOWEVER.... it does not look like any package on CRAN is using lgb.interprete()

Thank God! 😃

Collaborator:
@borchero can you please return to this?

Just revert everything about lgb.interprete and we can talk about it in a separate issue. All of the other changes are non-controversial and I'd like to get this typo hook running in pre-commit to help with other development.

mape = "mape"
splitted = "splitted"

[default.extend-identifiers]
ERRORs = "ERRORs"
GAM = "GAM"
ND24s = "ND24s"
WARNINGs = "WARNINGs"
fullset = "fullset"
thess = "thess"
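
(Side note on the config semantics, based on my reading of the typos docs — worth verifying: entries in `default.extend-ignore-re` suppress matches by regex, while identity mappings such as `datas = "datas"` under `extend-words` / `extend-identifiers` tell the checker to accept those spellings as valid rather than propose a correction.)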
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -5,7 +5,7 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use santizer flags" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
option(USE_HOMEBREW_FALLBACK "(macOS-only) also look in 'brew --prefix' for libraries (e.g. OpenMP)" ON)
set(
ENABLED_SANITIZERS
4 changes: 2 additions & 2 deletions R-package/R/lgb.Booster.R
@@ -1114,7 +1114,7 @@ predict.lgb.Booster <- function(object,
#'
#' Requesting a different prediction type or passing parameters to \link{predict.lgb.Booster}
#' will cause it to ignore the fast-predict configuration and take the slow route instead
#' (but be aware that an existing configuration might not always be overriden by supplying
#' (but be aware that an existing configuration might not always be overridden by supplying
#' different parameters or prediction type, so make sure to check that the output is what
#' was expected when a prediction is to be made on a single row for something different than
#' what is configured).
@@ -1128,7 +1128,7 @@ predict.lgb.Booster <- function(object,
#' and as such, this function will produce an error if passing \code{csr=TRUE} and
#' \code{type = "contrib"} together.
#' @inheritParams lgb_predict_shared_params
#' @param model LighGBM model object (class \code{lgb.Booster}).
#' @param model LightGBM model object (class \code{lgb.Booster}).
#'
#' \bold{The object will be modified in-place}.
#' @param csr Whether the prediction function is going to be called on sparse CSR inputs.
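
An aside on the fast-predict behavior described in the docs above — a small usage sketch (function and argument names as I understand them from the package docs; double-check before relying on this):

```r
library(lightgbm)

data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
model <- lgb.train(
  params = list(objective = "binary")
  , data = dtrain
  , nrounds = 5L
)

# configure the fast path for single-row "response" predictions;
# note that this modifies `model` in place
lgb.configure_fast_predict(model, type = "response")

one_row <- agaricus.train$data[1L, , drop = FALSE]
p_fast <- predict(model, one_row)                # uses the fast-predict config
p_raw  <- predict(model, one_row, type = "raw")  # different type: config is ignored
```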
2 changes: 1 addition & 1 deletion R-package/R/lgb.importance.R
@@ -9,7 +9,7 @@
#' \item{\code{Feature}: Feature names in the model.}
#' \item{\code{Gain}: The total gain of this feature's splits.}
#' \item{\code{Cover}: The number of observation related to this feature.}
#' \item{\code{Frequency}: The number of times a feature splited in trees.}
#' \item{\code{Frequency}: The number of times a feature split in trees.}
#' }
#'
#' @examples
2 changes: 1 addition & 1 deletion R-package/R/lgb.model.dt.tree.R
@@ -10,7 +10,7 @@
#' \emph{New in version 4.4.0}
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#' A \code{data.table} with detailed information about model trees' nodes and leaves.
#'
#' The columns of the \code{data.table} are:
#'
2 changes: 1 addition & 1 deletion R-package/R/lightgbm.R
@@ -139,7 +139,7 @@ NULL
#' system, but be aware that getting the number of cores detected correctly requires package
#' \code{RhpcBLASctl} to be installed.
#'
#' This parameter gets overriden by \code{num_threads} and its aliases under \code{params}
#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
#' if passed there.
#'
#' \emph{New in version 4.0.0}
2 changes: 1 addition & 1 deletion R-package/demo/cross_validation.R
@@ -51,7 +51,7 @@ logregobj <- function(preds, dtrain) {

# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing logistic loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
2 changes: 1 addition & 1 deletion R-package/demo/early_stopping.R
@@ -29,7 +29,7 @@ logregobj <- function(preds, dtrain) {

# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing logistic loss, the prediction is score before logistic transformation
# The built-in evaluation error assumes input is after logistic transformation
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
4 changes: 2 additions & 2 deletions R-package/man/lgb.configure_fast_predict.Rd

(generated file; diff not rendered)

2 changes: 1 addition & 1 deletion R-package/man/lgb.importance.Rd
(generated file; diff not rendered)

2 changes: 1 addition & 1 deletion R-package/man/lgb.model.dt.tree.Rd
(generated file; diff not rendered)

2 changes: 1 addition & 1 deletion R-package/man/lightgbm.Rd
(generated file; diff not rendered)

4 changes: 2 additions & 2 deletions R-package/tests/testthat/test_basic.R
@@ -9,7 +9,7 @@ set.seed(708L)
# to an accumulator then returns the current value.
# This is used to mock the situation where an evaluation
# metric increases every iteration
ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR"
ACCUMULATOR_NAME <- "INCREASING_METRIC_ACCUMULATOR"
assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv)

.increasing_metric <- function(preds, dtrain) {
@@ -1777,7 +1777,7 @@ test_that("lgb.train() works with early stopping for regression with a metric th
, early_stopping_rounds + 1L
)

# Booster should understand thatt all three of these metrics should be minimized
# Booster should understand that all three of these metrics should be minimized
eval_info <- bst$.__enclos_env__$private$get_eval_info()
expect_identical(eval_info, c("mape", "rmse", "l1"))
expect_identical(
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_custom_objective.R
@@ -14,7 +14,7 @@ logregobj <- function(preds, dtrain) {

# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- get_field(dtrain, "label")
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_lgb.interprete.R
@@ -5,7 +5,7 @@
log(x / (1.0 - x))
}

test_that("lgb.intereprete works as expected for binary classification", {
test_that("lgb.interprete works as expected for binary classification", {
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
4 changes: 2 additions & 2 deletions R-package/tests/testthat/test_lgb.plot.interpretation.R
@@ -5,7 +5,7 @@
log(x / (1.0 - x))
}

test_that("lgb.plot.interepretation works as expected for binary classification", {
test_that("lgb.plot.interpretation works as expected for binary classification", {
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
@@ -57,7 +57,7 @@ test_that("lgb.plot.interepretation works as expected for binary classification"
expect_null(plot_res)
})

test_that("lgb.plot.interepretation works as expected for multiclass classification", {
test_that("lgb.plot.interpretation works as expected for multiclass classification", {
data(iris)

# We must convert factors to numeric
2 changes: 1 addition & 1 deletion cmake/Sanitizer.cmake
@@ -18,7 +18,7 @@ macro(enable_sanitizer sanitizer)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined")

else()
message(FATAL_ERROR "Santizer ${sanitizer} not supported.")
message(FATAL_ERROR "Sanitizer ${sanitizer} not supported.")
endif()
endmacro()

10 changes: 5 additions & 5 deletions docker/README.md
@@ -8,9 +8,9 @@ These builds of LightGBM all train on the CPU. For GPU-enabled builds, see [the

Follow the general installation instructions [on the Docker site](https://docs.docker.com/install/):

* [macOS](https://docs.docker.com/docker-for-mac/install/)
* [Ubuntu](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
* [Windows](https://docs.docker.com/docker-for-windows/install/)
- [macOS](https://docs.docker.com/docker-for-mac/install/)
- [Ubuntu](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
- [Windows](https://docs.docker.com/docker-for-windows/install/)
Comment on lines +11 to +13
Collaborator:
Suggested change
- [macOS](https://docs.docker.com/docker-for-mac/install/)
- [Ubuntu](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
- [Windows](https://docs.docker.com/docker-for-windows/install/)
* [macOS](https://docs.docker.com/docker-for-mac/install/)
* [Ubuntu](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
* [Windows](https://docs.docker.com/docker-for-windows/install/)

Was this particular change actually the result of the typo pre-commit hook somehow? Or did your local editor settings do this?

Unless this is necessary to satisfy any of the project's linters, or unless it improves the rendering somehow, could you please revert them?

Collaborator (author):
That was my local editor, reverted!

Collaborator:
No problem, thanks.

> reverted!

Did you forget to push a commit? This is still showing up in the diff.


## Using CLI Version of LightGBM via Docker

@@ -55,7 +55,7 @@ After this runs, a LightGBM model can be found at `LightGBM-CLI-model.txt`.

For more details on how to configure and use the LightGBM CLI, see https://lightgbm.readthedocs.io/en/latest/Quick-Start.html.

## Running the Python-package Сontainer
## Running the Python-package Container

Build an image with the LightGBM Python package installed.

@@ -114,7 +114,7 @@ docker run \
python
```

## Running the R-package Сontainer
## Running the R-package Container

Build an image with the LightGBM R package installed.

4 changes: 2 additions & 2 deletions docs/_static/js/script.js
@@ -24,15 +24,15 @@ $(function() {
/* Collapse specified sections in the installation guide */
if(window.location.pathname.toLocaleLowerCase().indexOf('installation-guide') != -1) {
$('<style>.closed, .opened {cursor: pointer;} .closed:before, .opened:before {font-family: FontAwesome; display: inline-block; padding-right: 6px;} .closed:before {content: "\\f078";} .opened:before {content: "\\f077";}</style>').appendTo('body');
var collapsable = [
var collapsible = [
'#build-threadless-version-not-recommended',
'#build-mpi-version',
'#build-gpu-version',
'#build-cuda-version',
'#build-java-wrapper',
'#build-c-unit-tests'
];
$.each(collapsable, function(_, val) {
$.each(collapsible, function(_, val) {
var header = val + ' > :header:first';
var content = val + ' :not(:header:first)';
$(header).addClass('closed');
2 changes: 1 addition & 1 deletion examples/lambdarank/train.conf
@@ -64,7 +64,7 @@ num_leaves = 31
# alias: tree
tree_learner = serial

# number of threads for multi-threading. One thread will use one CPU, defalut is setted to #cpu.
# number of threads for multi-threading. One thread will use one CPU, default is set to #cpu.
# num_threads = 8

# feature sub-sample, will random select 80% feature to train on each iteration
10 changes: 5 additions & 5 deletions examples/regression/train.conf
@@ -20,7 +20,7 @@ objective = regression
# binary_error
metric = l2

# frequence for metric output
# frequency for metric output
metric_freq = 1

# true if need output metric for training data, alias: tranining_metric, train_metric
@@ -36,12 +36,12 @@ max_bin = 255
# forcedbins_filename = forced_bins.json

# training data
# if exsting weight file, should name to "regression.train.weight"
# if existing weight file, should name to "regression.train.weight"
# alias: train_data, train
data = regression.train

# validation data, support multi validation data, separated by ','
# if exsting weight file, should name to "regression.test.weight"
# if existing weight file, should name to "regression.test.weight"
# alias: valid, test, test_data,
valid_data = regression.test

@@ -62,7 +62,7 @@ num_leaves = 31
# alias: tree
tree_learner = serial

# number of threads for multi-threading. One thread will use one CPU, default is setted to #cpu.
# number of threads for multi-threading. One thread will use one CPU, default is set to #cpu.
# num_threads = 8

# feature sub-sample, will random select 80% feature to train on each iteration
@@ -72,7 +72,7 @@ feature_fraction = 0.9
# Support bagging (data sub-sample), will perform bagging every 5 iterations
bagging_freq = 5

# Bagging farction, will random select 80% data on bagging
# Bagging fraction, will random select 80% data on bagging
# alias: sub_row
bagging_fraction = 0.8

6 changes: 3 additions & 3 deletions include/LightGBM/cuda/cuda_algorithms.hpp
@@ -115,7 +115,7 @@ __device__ __forceinline__ T ShuffleReduceSumWarp(T value, const data_size_t len
return value;
}

// reduce values from an 1-dimensional block (block size must be no greather than 1024)
// reduce values from an 1-dimensional block (block size must be no greater than 1024)
template <typename T>
__device__ __forceinline__ T ShuffleReduceSum(T value, T* shared_mem_buffer, const size_t len) {
const uint32_t warpLane = threadIdx.x % warpSize;
@@ -145,7 +145,7 @@ __device__ __forceinline__ T ShuffleReduceMaxWarp(T value, const data_size_t len
return value;
}

// reduce values from an 1-dimensional block (block size must be no greather than 1024)
// reduce values from an 1-dimensional block (block size must be no greater than 1024)
template <typename T>
__device__ __forceinline__ T ShuffleReduceMax(T value, T* shared_mem_buffer, const size_t len) {
const uint32_t warpLane = threadIdx.x % warpSize;
@@ -196,7 +196,7 @@ __device__ __forceinline__ T ShuffleReduceMinWarp(T value, const data_size_t len
return value;
}

// reduce values from an 1-dimensional block (block size must be no greather than 1024)
// reduce values from an 1-dimensional block (block size must be no greater than 1024)
template <typename T>
__device__ __forceinline__ T ShuffleReduceMin(T value, T* shared_mem_buffer, const size_t len) {
const uint32_t warpLane = threadIdx.x % warpSize;
2 changes: 1 addition & 1 deletion include/LightGBM/dataset.h
@@ -376,7 +376,7 @@ class Metadata {
std::vector<data_size_t> query_boundaries_;
/*! \brief Query weights */
std::vector<label_t> query_weights_;
/*! \brief Number of querys */
/*! \brief Number of queries */
data_size_t num_queries_;
/*! \brief Number of Initial score, used to check correct weight file */
int64_t num_init_score_;
4 changes: 2 additions & 2 deletions include/LightGBM/utils/common.h
@@ -925,11 +925,11 @@ class AlignmentAllocator {

inline ~AlignmentAllocator() throw() {}

inline pointer adress(reference r) {
inline pointer address(reference r) {
return &r;
}

inline const_pointer adress(const_reference r) const {
inline const_pointer address(const_reference r) const {
return &r;
}

4 changes: 2 additions & 2 deletions include/LightGBM/utils/random.h
@@ -22,9 +22,9 @@ class Random {
*/
Random() {
std::random_device rd;
auto genrator = std::mt19937(rd());
auto generator = std::mt19937(rd());
std::uniform_int_distribution<int> distribution(0, x);
x = distribution(genrator);
x = distribution(generator);
}
/*!
* \brief Constructor, with specific seed
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
@@ -3509,7 +3509,7 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
_log_warning(err_msg)
self.feature_name = self.get_feature_name()
_log_warning(
"Reseting categorical features.\n"
"Resetting categorical features.\n"
"You can set new categorical features via ``set_categorical_feature`` method"
)
self.categorical_feature = "auto"