"Fossies" - the Fresh Open Source Software Archive 
Member "transformers-4.21.1/examples/research_projects/codeparrot/scripts/arguments.py" (4 Aug 2022, 10173 Bytes) of package /linux/misc/transformers-4.21.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively, you can view or download the uninterpreted source code file here.
See also the last
Fossies "Diffs" side-by-side code changes report for "arguments.py":
4.20.1_vs_4.21.0.
1 from dataclasses import dataclass, field
2 from typing import Optional
3
4
@dataclass
class TrainingArguments:
    """
    Configuration for training the model.

    Each field's ``metadata["help"]`` string is the CLI help text shown when
    these arguments are parsed from the command line.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be trained."}
    )
    save_dir: Optional[str] = field(
        default="./", metadata={"help": "Save dir where model repo is cloned and model updates are saved to."}
    )
    dataset_name_train: Optional[str] = field(
        default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path of training dataset."}
    )
    dataset_name_valid: Optional[str] = field(
        default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
    )
    train_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for training."})
    valid_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for evaluation."})
    weight_decay: Optional[float] = field(default=0.1, metadata={"help": "Value of weight decay."})
    shuffle_buffer: Optional[int] = field(
        default=10000, metadata={"help": "Size of buffer used to shuffle streaming dataset."}
    )
    # Fixed typo: "Learning rate fo training."
    learning_rate: Optional[float] = field(default=2e-4, metadata={"help": "Learning rate for training."})
    # Fixed copy-paste: help previously read "Learning rate." which described the wrong field.
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "Learning rate scheduler type."})
    num_warmup_steps: Optional[int] = field(
        default=750, metadata={"help": "Number of warmup steps in the learning rate schedule."}
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=16, metadata={"help": "Number of gradient accumulation steps."}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "Use gradient checkpointing to reduce memory footprint."}
    )
    max_train_steps: Optional[int] = field(default=50000, metadata={"help": "Maximum number of training steps."})
    max_eval_steps: Optional[int] = field(
        default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
    )
    seq_length: Optional[int] = field(default=1024, metadata={"help": "Sequence lengths used for training."})
    seed: Optional[int] = field(default=1, metadata={"help": "Training seed."})
    save_checkpoint_steps: Optional[int] = field(
        default=1024,
        metadata={"help": "Interval to save checkpoints. Measured as number of forward passes not training steps."},
    )
    resume_from_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "States path if the training should continue from a checkpoint folder."}
    )
    tokenized: Optional[bool] = field(default=False, metadata={"help": "If True the data is pretokenized."})
54
55
@dataclass
class EvaluationArguments:
    """
    Configuration for evaluating the model on a validation dataset.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot",
        metadata={"help": "Model name or path of model to be evaluated."},
    )
    dataset_name: Optional[str] = field(
        default="lvwerra/codeparrot-clean-valid",
        metadata={"help": "Name or path of validation dataset."},
    )
    batch_size: Optional[int] = field(
        default=2,
        metadata={"help": "Batch size used for evaluation."},
    )
    max_eval_steps: Optional[int] = field(
        default=-1,
        metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."},
    )
    seq_length: Optional[int] = field(
        default=1024,
        metadata={"help": "Length of sequences to be evaluated."},
    )
    seed: Optional[int] = field(
        default=1,
        metadata={"help": "Random seed used for evaluation."},
    )
74
75
@dataclass
class HumanEvalArguments:
    """
    Configuration for running evaluation on HumanEval dataset.
    """

    model_ckpt: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be evaluated."}
    )
    num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
    num_tasks: Optional[int] = field(
        default=None,
        metadata={"help": "The number of human-eval tasks to run. If not included all tasks are evaluated."},
    )
    do_sample: Optional[bool] = field(
        default=True, metadata={"help": "Sample from the language model's output distribution."}
    )
    temperature: Optional[float] = field(default=0.2, metadata={"help": "Sampling temperature used for generation."})
    max_new_tokens: Optional[int] = field(default=256, metadata={"help": "Maximum number of newly generated tokens."})
    top_k: Optional[int] = field(default=0, metadata={"help": "Top-k parameter used for generation."})
    top_p: Optional[float] = field(default=0.95, metadata={"help": "Top-p parameter used for nucleus sampling."})
    batch_size: Optional[int] = field(default=10, metadata={"help": "Number of generations to run in parallel."})
    n_samples: Optional[int] = field(
        default=200, metadata={"help": "Number of completions to generate for each sample."}
    )
    seed: Optional[int] = field(default=1, metadata={"help": "Random seed used for evaluation."})
    # Fixed copy-paste: help previously duplicated the `seed` help string.
    output_file: Optional[str] = field(
        default="eval_results.json", metadata={"help": "Output file to save the evaluation results to."}
    )
    HF_ALLOW_CODE_EVAL: Optional[str] = field(
        default="0", metadata={"help": "Allow `code_eval` to execute Python code on machine"}
    )
    device_int: Optional[int] = field(
        default=-1,
        metadata={
            "help": (
                "Determine which device to run the `text-generation` Pipeline on. -1 is CPU and any zero or positive"
                " number corresponds to which GPU device id to run on."
            )
        },
    )
117
118
@dataclass
class PreprocessingArguments:
    """
    Configuration for preprocessing data.
    """

    num_workers: Optional[int] = field(
        default=None,
        metadata={
            "help": "The number of CPU cores to use for parallel preprocessing. Default uses the maximum available."
        },
    )
    dataset_name: Optional[str] = field(
        default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
    )
    # Fixed duplicated word: "processed processed".
    output_dir: Optional[str] = field(
        default="codeparrot-clean", metadata={"help": "Folder to save processed dataset."}
    )
    # Fixed copy-paste: the field counts samples per output file, not files.
    samples_per_file: Optional[int] = field(
        default=100_000, metadata={"help": "Number of samples to save per JSON output file."}
    )
    text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
    line_max: Optional[float] = field(
        default=1000, metadata={"help": "Maximum line length in file, otherwise file is filtered."}
    )
    line_mean: Optional[float] = field(
        default=100, metadata={"help": "Maximum mean line length in file, otherwise file is filtered."}
    )
    alpha_frac: Optional[float] = field(
        default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
    )
    min_token_ratio: Optional[float] = field(
        default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."}
    )
    filter_proba: Optional[float] = field(
        default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
    )
    tokenizer: Optional[str] = field(
        default="lvwerra/codeparrot",
        metadata={"help": "Name or path to the tokenizer."},
    )
    near_deduplication: Optional[bool] = field(
        default=False, metadata={"help": "If True, near-duplicate samples are removed."}
    )
    jaccard_threshold: Optional[float] = field(
        default=0.85, metadata={"help": "Jaccard threshold for near-duplicate samples."}
    )
166
167
@dataclass
class TokenizerTrainingArguments:
    """
    Configuration for tokenizer training.
    """

    base_tokenizer: Optional[str] = field(
        default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
    )
    dataset_name: Optional[str] = field(
        default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
    )
    text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
    # Fixed copy-paste: help previously duplicated the `n_examples` help string.
    vocab_size: Optional[int] = field(default=200_000, metadata={"help": "Vocabulary size of the new tokenizer."})
    n_examples: Optional[int] = field(
        default=32768, metadata={"help": "Number of examples to train the tokenizer on."}
    )
    tokenizer_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of new tokenizer."})
    push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved tokenizer to the hub."})
187
188
@dataclass
class PretokenizationArguments:
    """
    Configuration for data pretokenization.
    """

    tokenizer_dir: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Name or path to the tokenizer."}
    )
    dataset_name: Optional[str] = field(
        default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path to the dataset to pretokenize."}
    )
    tokenized_data_repo: Optional[str] = field(
        default="tokenized-codeparrot-train", metadata={"help": "Repo name of the pretokenized data."}
    )
    # Fixed copy-paste: help previously said "code evaluation", copied from HumanEvalArguments.
    num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for pretokenization."})
205
206
@dataclass
class InitializationArguments:
    """
    Configuration for initializing new model.
    """

    config_name: Optional[str] = field(
        default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
    )
    tokenizer_name: Optional[str] = field(
        default="lvwerra/codeparrot", metadata={"help": "Tokenizer attached to model."}
    )
    model_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of the created model."})
    # Fixed copy-paste: this class initializes a model, not a tokenizer.
    push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved model to the hub."})