GenerTeam committed on
Commit
3be4abf
·
verified ·
1 Parent(s): 9f6e787

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -2
README.md CHANGED
@@ -43,14 +43,32 @@ config = model.config
43
  max_length = config.max_position_embeddings
44
 
45
  # Define input sequences.
46
- # The input sequence length should be a
47
  sequences = [
48
  "ATGAGGTGGCAAGAAATGGGCTAC",
49
  "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
50
  ]
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Process the sequences
53
- sequences = [tokenizer.bos_token + sequence for sequence in sequences]
54
 
55
  # Tokenize the sequences
56
  tokenizer.padding_side = "left"
 
43
  max_length = config.max_position_embeddings
44
 
45
  # Define input sequences.
 
46
  sequences = [
47
  "ATGAGGTGGCAAGAAATGGGCTAC",
48
  "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
49
  ]
50
 
51
+ def left_padding(sequence, padding_char='A', multiple=6):
52
+ remainder = len(sequence) % multiple
53
+ if remainder != 0:
54
+ padding_length = multiple - remainder
55
+ return padding_char * padding_length + sequence
56
+ return sequence
57
+
58
+ def left_truncation(sequence, multiple=6):
59
+ remainder = len(sequence) % multiple
60
+ if remainder != 0:
61
+ return sequence[remainder:]
62
+ return sequence
63
+
64
+ # Apply left_padding to all sequences
65
+ # padded_sequences = [left_padding(seq) for seq in sequences]
66
+
67
+ # Apply left_truncation to all sequences
68
+ truncated_sequences = [left_truncation(seq) for seq in sequences]
69
+
70
  # Process the sequences
71
+ sequences = [tokenizer.bos_token + sequence for sequence in truncated_sequences]
72
 
73
  # Tokenize the sequences
74
  tokenizer.padding_side = "left"