# gradio_test / breakupText.py
# ajsbsd's picture
# qwen.ai helper
# f52daa3
# text_to_training_csv.py
import sys
import csv
def main():
    """Split text from stdin into fixed-size chunks and write them as CSV.

    The chunk size (in characters) is taken from the first command-line
    argument. Output goes to stdout as CSV: an ``id,text`` header row, then
    one row per chunk with ids counting up from 1. Every row holds exactly
    ``chunk_size`` characters except possibly the last one, which holds
    whatever is left at end of input.

    Exits with status 1 and a message on stderr when the argument is
    missing, is not an integer, or is not a positive integer.
    """
    if len(sys.argv) < 2:
        print("Usage: python text_to_training_csv.py <chunk_size>", file=sys.stderr)
        sys.exit(1)

    try:
        chunk_size = int(sys.argv[1])
    except ValueError:
        print("Error: Chunk size must be an integer.", file=sys.stderr)
        sys.exit(1)

    # Reject non-positive sizes up front: read(0) returns '' immediately, so
    # the whole input would be silently dropped, and a negative size makes
    # read() consume everything and the slices below cut from the wrong end.
    if chunk_size <= 0:
        print("Error: Chunk size must be a positive integer.", file=sys.stderr)
        sys.exit(1)

    writer = csv.writer(sys.stdout)
    writer.writerow(["id", "text"])  # Header row

    id_counter = 1
    buffer = ''
    while True:
        chunk = sys.stdin.read(chunk_size)
        if not chunk:
            break
        buffer += chunk
        # read() may return fewer characters than requested, so keep
        # accumulating until a full chunk is available. The buffer starts
        # each pass below chunk_size and gains at most chunk_size, so a
        # single write per pass is enough to drain it.
        if len(buffer) >= chunk_size:
            writer.writerow([id_counter, buffer[:chunk_size]])
            id_counter += 1
            buffer = buffer[chunk_size:]  # Remaining text

    # Emit the short final chunk, if any.
    if buffer:
        writer.writerow([id_counter, buffer])
# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()