"Fossies" - the Fresh Open Source Software Archive

Member "transformers-4.21.1/examples/research_projects/codeparrot/scripts/arguments.py" (4 Aug 2022, 10173 Bytes) of package /linux/misc/transformers-4.21.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. See also the last Fossies "Diffs" side-by-side code changes report for "arguments.py": 4.20.1_vs_4.21.0.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class TrainingArguments:
    """
    Configuration for training the model.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be trained."}
    )
    save_dir: Optional[str] = field(
        default="./", metadata={"help": "Save dir where the model repo is cloned and model updates are saved to."}
    )
    dataset_name_train: Optional[str] = field(
        default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path of training dataset."}
    )
    dataset_name_valid: Optional[str] = field(
        default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
    )
    train_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for training."})
    valid_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for evaluation."})
    weight_decay: Optional[float] = field(default=0.1, metadata={"help": "Value of weight decay."})
    shuffle_buffer: Optional[int] = field(
        default=10000, metadata={"help": "Size of buffer used to shuffle streaming dataset."}
    )
    learning_rate: Optional[float] = field(default=2e-4, metadata={"help": "Learning rate for training."})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "Learning rate scheduler type."})
    num_warmup_steps: Optional[int] = field(
        default=750, metadata={"help": "Number of warmup steps in the learning rate schedule."}
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=16, metadata={"help": "Number of gradient accumulation steps."}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "Use gradient checkpointing to reduce memory footprint."}
    )
    max_train_steps: Optional[int] = field(default=50000, metadata={"help": "Maximum number of training steps."})
    max_eval_steps: Optional[int] = field(
        default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
    )
    seq_length: Optional[int] = field(default=1024, metadata={"help": "Sequence length used for training."})
    seed: Optional[int] = field(default=1, metadata={"help": "Training seed."})
    save_checkpoint_steps: Optional[int] = field(
        default=1024,
        metadata={"help": "Interval to save checkpoints. Measured as number of forward passes, not training steps."},
    )
    resume_from_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "Path to a checkpoint folder if training should resume from a checkpoint."}
    )
    tokenized: Optional[bool] = field(default=False, metadata={"help": "If True the data is pretokenized."})
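
# A minimal usage sketch (assumption: as in the accompanying codeparrot scripts,
# these dataclasses are consumed via transformers.HfArgumentParser, so each field
# above becomes a CLI flag and its `help` metadata becomes the --help text):
#
#     from transformers import HfArgumentParser
#
#     parser = HfArgumentParser(TrainingArguments)
#     args = parser.parse_args_into_dataclasses()[0]
#     print(args.learning_rate)  # 0.0002 unless overridden via --learning_rate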


@dataclass
class EvaluationArguments:
    """
    Configuration for evaluating the model.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be evaluated."}
    )
    dataset_name: Optional[str] = field(
        default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
    )
    batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size used for evaluation."})
    max_eval_steps: Optional[int] = field(
        default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
    )
    seq_length: Optional[int] = field(default=1024, metadata={"help": "Length of sequences to be evaluated."})
    seed: Optional[int] = field(default=1, metadata={"help": "Random seed used for evaluation."})


@dataclass
class HumanEvalArguments:
    """
    Configuration for running evaluation on the HumanEval dataset.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be evaluated."}
    )
    num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
    num_tasks: Optional[int] = field(
        default=None,
        metadata={"help": "The number of HumanEval tasks to run. If not included, all tasks are evaluated."},
    )
    do_sample: Optional[bool] = field(
        default=True, metadata={"help": "Sample from the language model's output distribution."}
    )
    temperature: Optional[float] = field(default=0.2, metadata={"help": "Sampling temperature used for generation."})
    max_new_tokens: Optional[int] = field(default=256, metadata={"help": "Maximum number of newly generated tokens."})
    top_k: Optional[int] = field(default=0, metadata={"help": "Top-k parameter used for generation."})
    top_p: Optional[float] = field(default=0.95, metadata={"help": "Top-p parameter used for nucleus sampling."})
    batch_size: Optional[int] = field(default=10, metadata={"help": "Number of generations to run in parallel."})
    n_samples: Optional[int] = field(
        default=200, metadata={"help": "Number of completions to generate for each sample."}
    )
    seed: Optional[int] = field(default=1, metadata={"help": "Random seed used for evaluation."})
    output_file: Optional[str] = field(
        default="eval_results.json", metadata={"help": "File where the evaluation results are saved."}
    )
    HF_ALLOW_CODE_EVAL: Optional[str] = field(
        default="0", metadata={"help": "Allow `code_eval` to execute Python code on the machine."}
    )
    device_int: Optional[int] = field(
        default=-1,
        metadata={
            "help": (
                "Determine which device to run the `text-generation` Pipeline on. -1 is CPU and any zero or positive"
                " number corresponds to which GPU device id to run on."
            )
        },
    )
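
# A minimal sketch of how these flags could drive HumanEval generation and scoring
# (assumption: the real loop lives in the companion human_eval script; `code_eval`
# is the Hugging Face `datasets` metric, which refuses to execute untrusted code
# unless HF_ALLOW_CODE_EVAL is set to "1"):
#
#     import os
#     from datasets import load_metric
#     from transformers import pipeline
#
#     args = HumanEvalArguments()
#     os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
#     pipe = pipeline("text-generation", model=args.model_ckpt, device=args.device_int)
#     code_eval = load_metric("code_eval")  # computes pass@k from generated completions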


@dataclass
class PreprocessingArguments:
    """
    Configuration for preprocessing data.
    """

    num_workers: Optional[int] = field(
        default=None,
        metadata={
            "help": "The number of CPU cores to use for parallel preprocessing. Default uses the maximum available."
        },
    )
    dataset_name: Optional[str] = field(
        default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
    )
    output_dir: Optional[str] = field(
        default="codeparrot-clean", metadata={"help": "Folder to save processed dataset."}
    )
    samples_per_file: Optional[int] = field(
        default=100_000, metadata={"help": "Number of samples to save per JSON output file."}
    )
    text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
    line_max: Optional[float] = field(
        default=1000, metadata={"help": "Maximum line length in file, otherwise file is filtered."}
    )
    line_mean: Optional[float] = field(
        default=100, metadata={"help": "Maximum mean line length in file, otherwise file is filtered."}
    )
    alpha_frac: Optional[float] = field(
        default=0.25, metadata={"help": "Minimum fraction of alphanumeric characters, otherwise file is filtered."}
    )
    min_token_ratio: Optional[float] = field(
        default=1.5, metadata={"help": "Minimum character-to-token ratio for the file, otherwise file is filtered."}
    )
    filter_proba: Optional[float] = field(
        default=0.7, metadata={"help": "Probability for filtering config, test, and uncommon files."}
    )
    tokenizer: Optional[str] = field(
        default="lvwerra/codeparrot",
        metadata={"help": "Name or path to the tokenizer."},
    )
    near_deduplication: Optional[bool] = field(
        default=False, metadata={"help": "If True, near-duplicate samples are removed."}
    )
    jaccard_threshold: Optional[float] = field(
        default=0.85, metadata={"help": "Jaccard threshold for near-duplicate samples."}
    )
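
# A hypothetical illustration of how the line-length thresholds above could be
# applied to a single source file (the actual filters live in the companion
# preprocessing script; `passes_line_filters` is not part of this module):
#
#     def passes_line_filters(text: str, line_max: float = 1000, line_mean: float = 100) -> bool:
#         lengths = [len(line) for line in text.splitlines()]
#         if not lengths:
#             return False  # empty files are filtered out
#         return max(lengths) <= line_max and sum(lengths) / len(lengths) <= line_mean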


@dataclass
class TokenizerTrainingArguments:
    """
    Configuration for tokenizer training.
    """

    base_tokenizer: Optional[str] = field(
        default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
    )
    dataset_name: Optional[str] = field(
        default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
    )
    text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
    vocab_size: Optional[int] = field(default=200_000, metadata={"help": "Vocabulary size of the new tokenizer."})
    n_examples: Optional[int] = field(
        default=32768, metadata={"help": "Number of examples to train the tokenizer on."}
    )
    tokenizer_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of new tokenizer."})
    push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved tokenizer to the hub."})
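
# A minimal sketch of training a new tokenizer from these settings (assumption:
# `train_new_from_iterator` on a fast tokenizer, as used in the companion
# tokenizer-training script; `batch_iterator` is a hypothetical generator that
# yields lists of strings from the dataset's text column):
#
#     from transformers import AutoTokenizer
#
#     args = TokenizerTrainingArguments()
#     base = AutoTokenizer.from_pretrained(args.base_tokenizer)
#     tokenizer = base.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size)
#     tokenizer.save_pretrained(args.tokenizer_name, push_to_hub=args.push_to_hub)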


@dataclass
class PretokenizationArguments:
    """
    Configuration for data pretokenization.
    """

    tokenizer_dir: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Name or path to the tokenizer."}
    )
    dataset_name: Optional[str] = field(
        default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path to the dataset to pretokenize."}
    )
    tokenized_data_repo: Optional[str] = field(
        default="tokenized-codeparrot-train", metadata={"help": "Repo name of the pretokenized data."}
    )
    num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for pretokenization."})
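
# A minimal sketch of the pretokenization flow these fields describe (assumption:
# `datasets.Dataset.map` followed by `push_to_hub`, as in the companion
# pretokenization script):
#
#     from datasets import load_dataset
#     from transformers import AutoTokenizer
#
#     args = PretokenizationArguments()
#     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
#     ds = load_dataset(args.dataset_name, split="train")
#     ds = ds.map(lambda batch: tokenizer(batch["content"]), batched=True, num_proc=args.num_workers)
#     ds.push_to_hub(args.tokenized_data_repo)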


@dataclass
class InitializationArguments:
    """
    Configuration for initializing a new model.
    """

    config_name: Optional[str] = field(
        default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
    )
    tokenizer_name: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Tokenizer attached to model."}
    )
    model_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of the created model."})
    push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved model to the hub."})
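
# A minimal sketch of initializing a fresh model from these settings (assumption:
# config-based initialization via AutoModelForCausalLM.from_config, as in the
# companion initialization script):
#
#     from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
#
#     args = InitializationArguments()
#     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
#     config = AutoConfig.from_pretrained(args.config_name, vocab_size=len(tokenizer))
#     model = AutoModelForCausalLM.from_config(config)  # randomly initialized, not pretrained
#     model.save_pretrained(args.model_name, push_to_hub=args.push_to_hub)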