In [1]:
from javalang import tree, parser
import pandas as pd
import numpy
import javalang
from collections import defaultdict
import re
import shutil
import os

In [2]:
def parse_identifiers(code):
    tree = javalang.parse.parse(code)
    identifier_count = defaultdict(int)
    
    for _, node in tree.filter(javalang.tree.LocalVariableDeclaration):
        for declarator in node.declarators:
            identifier = declarator.name
            identifier_count[identifier] += 1
    
    for _, node in tree.filter(javalang.tree.MethodDeclaration):
        for parameter in node.parameters:
            identifier = parameter.name
            identifier_count[identifier] += 1
    
    sorted_identifiers = sorted(identifier_count.items(), key=lambda x: x[1], reverse=True)
    return sorted_identifiers

In [3]:
def find_top_identifiers(identifiers, num_identifiers=3):
    commonly_used_identifiers = ['i', 'j', 'k', 'result', 'output', 'temp', 'tmp', 'value', 'data', 'input', 'args',
                                 'index', 'flag', 'is', 'count', 'num', 'max', 'min', 'config', 'settings',
                                 'param', 'var']
    
    filtered_identifiers = [(identifier, count) for identifier, count in identifiers
                            if len(identifier) > 4 and identifier not in commonly_used_identifiers
                            and not re.search(r'\d$', identifier)]
#     filtered_identifiers = [(identifier, count) for identifier, count in identifiers
#                             if len(identifier) > 4 and identifier not in commonly_used_identifiers
#                             and not re.search(r'\d$', identifier) and count >= min_occurrences]
    print(filtered_identifiers)
    sorted_identifiers = sorted(filtered_identifiers, key=lambda x: x[1], reverse=True)
    top_identifiers = sorted_identifiers[:num_identifiers]
    
    return top_identifiers

In [4]:
def replace_identifier_with_mask(java_code, output_file_path, identifier, identifier_output_path):
    
    modified_content = java_code.replace(identifier, "<mask> <mask> <mask> <mask>")
    
    with open(output_file_path, 'w+') as output_file:
        output_file.write(modified_content)
    
    with open(identifier_output_path, 'w+') as identifier_output_file:
        identifier_output_file.write(f"{identifier}\n")

In [6]:
folder_path = 'Dataset/inp-txt/'  # Replace with the actual folder path
r = 0
for file_name in os.listdir(folder_path):
    print(r)
    try:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            java_code = file.read()
        identifiers = parse_identifiers(java_code)
    except:
        identifiers = []
    top_identifiers = find_top_identifiers(identifiers, num_identifiers=3)
    # Print top identifiers for each file
    print(f"File: {file_name}")
    if r < 5:
        for identifier, count in top_identifiers:
            print(identifier, "-", count)

    # Write top identifier names to a separate output file
#     output_file_path = f"output_{file_name}.txt"
#     with open(output_file_path, 'w') as output_file:
#         for identifier, count in top_identifiers:
#             output_file.write(f"{identifier} - {count}\n")

    # Modify files and save them with the identifier names
    if len(top_identifiers) != 0:
        for i, (identifier, count) in enumerate(top_identifiers, start=1):
            modified_file_path = f"Dataset/op-txt/{r}_{i}.txt"
            id_path = f"Dataset/op-txt/id_{r}_{i}.txt"
            # Replace identifier in the new file
            replace_identifier_with_mask(java_code, modified_file_path, identifier, id_path)
        r+=1

0
[('action', 3), ('execTime', 2), ('delay', 2), ('timedRunnable', 1), ('missed', 1), ('polled', 1), ('delayTime', 1)]
File: train_792
action - 3
execTime - 2
delay - 2
1
[('empty', 3), ('sources', 2), ('missed', 2), ('coordinator', 1), ('replenishInsteadOfDrain', 1), ('localNonEmptySources', 1), ('localCompletedSources', 1), ('requestedMode', 1)]
File: train_302
empty - 3
sources - 2
missed - 2
2
[('timeout', 4), ('subscriberCount', 3), ('connection', 2), ('scheduler', 2), ('numberOfSubscribers', 2)]
File: train_130
timeout - 4
subscriberCount - 3
connection - 2
3
[]
File: train_554
3
[('parent', 1), ('observer', 1)]
File: train_766
parent - 1
observer - 1
4
[('observer', 1)]
File: train_759
observer - 1
5
[]
File: train_933
5
[('report', 2), ('property', 2), ('reportFile', 1), ('objectMapper', 1), ('properties', 1), ('lines', 1), ('exclusions', 1), ('propertyName', 1)]
File: train_1100
6
[('source', 1), ('observer', 1)]
File: train_761
7
[('nextObserver', 1), ('nextNotification', 1),

91
[('resource', 2)]
File: train_153
92
[('missed', 1)]
File: train_395
93
[]
File: train_92
93
[('observer', 1)]
File: train_683
94
[('array', 1), ('singletonList', 1)]
File: train_7
95
[]
File: train_213
95
[('inner', 3), ('sources', 2), ('empty', 2), ('coordinator', 1), ('values', 1), ('missed', 1), ('sourceEmpty', 1)]
File: train_445
96
[]
File: train_677
96
[('signaller', 1), ('other', 1), ('parent', 1), ('observer', 1)]
File: train_648
97
[]
File: train_822
97
[('calls', 3), ('disposable', 2), ('retrofit', 1)]
File: train_1011
98
[('inner', 3), ('empty', 2), ('nextInner', 1), ('active', 1), ('delayErrors', 1), ('missing', 1), ('retry', 1)]
File: train_670
99
[('intercept', 6), ('window', 5), ('downstream', 4), ('queue', 3), ('missed', 3), ('isDone', 3), ('isEmpty', 3), ('emitted', 2), ('boundaryTask', 1), ('boundary', 1), ('windows', 1), ('sender', 1), ('isOpen', 1)]
File: train_442
100
[('notification', 4), ('throwable', 2), ('errorClass', 2), ('subscriber', 1), ('subscription',

[('parent', 2), ('nextIndex', 2)]
File: train_432
198
[]
File: train_1059
198
[('shared', 1), ('observer', 1)]
File: train_264
199
[('sources', 2)]
File: train_290
200
[('executorService', 1), ('start', 1), ('versionOptions', 1), ('librariesToUpgrade', 1), ('librariesByName', 1)]
File: train_1066
201
[]
File: train_855
201
[('resolutionResult', 1), ('dependencies', 1), ('unconstrainedDependencies', 1), ('constraints', 1), ('classpath', 1)]
File: train_1092
202
[]
File: train_801
202
[('description', 1)]
File: train_1032
203
[('observer', 1)]
File: train_459
204
[('observer', 1)]
File: train_466
205
[('serial', 1)]
File: train_654
206
[]
File: train_230
206
[('observer', 1)]
File: train_492
207
[('subscribers', 2), ('parents', 1), ('initialValue', 1)]
File: train_698
208
[]
File: train_208
208
[]
File: train_806
208
[('observer', 3), ('retrofit', 1)]
File: train_1035
209
[('lifted', 1), ('observer', 1)]
File: train_495
210
[('newCap', 1)]
File: train_839
211
[('observer', 1)]
File: trai

293
[('whenReference', 1), ('handler', 1), ('biConsumer', 1), ('error', 1)]
File: train_178
294
[('reportError', 1)]
File: train_910
295
[('project', 5), ('classpath', 3), ('plugins', 1), ('starterMetadata', 1), ('configurations', 1), ('runtimeClasspath', 1), ('destination', 1), ('checkClasspathForConflicts', 1), ('checkClasspathForUnnecessaryExclusions', 1), ('checkClasspathForUnconstrainedDirectDependencies', 1), ('attributes', 1)]
File: train_1123
296
[]
File: train_19
296
[('parent', 1), ('observer', 1)]
File: train_548
297
[('parent', 1), ('errors', 1), ('missed', 1), ('empty', 1), ('observer', 1)]
File: train_577
298
[('source', 2), ('observer', 1)]
File: train_745
299
[('array', 1)]
File: train_26
300
[('collection', 1), ('subscriber', 1)]
File: train_321
301
[('disposable', 3), ('errors', 1), ('array', 1), ('disposables', 1)]
File: train_113
302
[('notification', 2), ('observer', 1)]
File: train_583
303
[('propertyAccessor', 2), ('enabled', 1), ('defaultNotFound', 1), ('default

[('source', 5), ('child', 3), ('timeLimit', 3), ('eagerTruncate', 3), ('bufferSize', 2), ('maxAge', 2), ('scheduler', 2), ('terminal', 2), ('onSubscribe', 1), ('doConnect', 1), ('missed', 1), ('alreadyRequested', 1), ('downstreamMaxRequest', 1), ('sourceIndex', 1), ('destinationIndexObject', 1), ('destinationIndex', 1), ('unbounded', 1), ('observable', 1), ('inner', 1), ('connectableFactory', 1), ('selector', 1), ('bufferFactory', 1), ('connection', 1), ('producer', 1)]
File: train_397
381
[('subscriber', 3), ('retrofit', 1)]
File: train_999
382
[]
File: train_90
382
[]
File: train_535
382
[]
File: train_64
382
[('inner', 7), ('empty', 4), ('missed', 2), ('parent', 1)]
File: train_707
383
[('observer', 1)]
File: train_363
384
[]
File: train_151
384
[('observer', 9), ('throwable', 9), ('pluginRef', 6), ('composite', 3), ('errorRef', 2), ('retrofit', 1), ('first', 1), ('second', 1)]
File: train_990
385
[]
File: train_99
385
[]
File: train_158
385
[('future', 6), ('response', 2), ('retrof

475
[]
File: train_881
475
[('project', 7), ('publication', 3), ('extension', 2), ('publishing', 1), ('javaComponent', 1), ('organization', 1), ('licences', 1), ('developers', 1), ('issueManagement', 1)]
File: train_1046
476
[]
File: train_875
476
[('observer', 1)]
File: train_244
477
[('restTemplate', 1), ('uriTemplateHandler', 1), ('organization', 1)]
File: train_1079
478
[]
File: train_412
478
[('worker', 1), ('observer', 1)]
File: train_620
479
[('publishing', 1), ('mavenPublication', 1), ('project', 1)]
File: train_1041
480
[('buffer', 8), ('offset', 7), ('capacity', 3), ('nextBuffer', 3), ('newBuffer', 2), ('offsetInNew', 2), ('p2capacity', 1), ('lookAheadStep', 1), ('lookAheadElementOffset', 1), ('nextOffset', 1), ('isNextBuffer', 1), ('after', 1), ('before', 1), ('currentProducerIndex', 1), ('oldBuffer', 1), ('currIndex', 1), ('nextIndex', 1), ('first', 1), ('second', 1)]
File: train_872
481
[]
File: train_618
481
[]
File: train_288
481
[]
File: train_886
481
[]
File: train_627

[('connection', 15), ('response', 8), ('reader', 7), ('request', 5), ('startNanos', 3), ('elapsedNanos', 3), ('elapsedMillis', 3), ('server', 3), ('handshake', 2), ('reasons', 1), ('expectedReason', 1), ('first', 1), ('redirect', 1), ('urlConnection', 1), ('byteRead', 1), ('headers', 1), ('headerList', 1), ('other', 1), ('called', 1), ('statement', 1), ('requestUrl', 1), ('refusedConnection', 1), ('handshakeCertificates', 1), ('clientCa', 1), ('serverCa', 1), ('serverCertificate', 1), ('serverHandshakeCertificates', 1), ('clientCertificate', 1), ('clientHandshakeCertificates', 1)]
File: train_1038
562
[('observer', 1)]
File: train_205
563
[]
File: train_661
563
[('parent', 1), ('cancelled', 1), ('goNextSource', 1), ('source', 1)]
File: train_453
564
[('sources', 2), ('delayError', 2), ('empty', 2), ('missing', 1), ('emptyCount', 1), ('observer', 1), ('bufferSize', 1), ('source', 1)]
File: train_695
565
[('retrofit', 2), ('annotations', 1), ('parameterAnnotations', 1), ('methodAnnotatio

[('current', 3), ('observer', 2), ('sender', 2), ('missed', 1), ('downstream', 1), ('errors', 1), ('inner', 1), ('empty', 1)]
File: train_545
664
[('decoratedRun', 5), ('interruptibleTask', 2), ('delay', 2), ('delayed', 1), ('disposable', 1), ('runnableTask', 1), ('first', 1), ('missed', 1), ('state', 1), ('initialDelay', 1), ('period', 1)]
File: train_777
665
[('typeLib', 2), ('array', 1), ('before', 1), ('after', 1)]
File: train_14
666
[('empty', 5), ('missed', 3), ('emitter', 1), ('errors', 1)]
File: train_313
667
[]
File: train_121
667
[]
File: train_181
667
[('observer', 1)]
File: train_723
668
[('sourceArray', 1), ('source', 1)]
File: train_40
669
[('parent', 1), ('observer', 1)]
File: train_511
670
[('bucketConsumer', 1), ('cardinality', 1), ('randomTerms', 1), ('buckets', 1), ('subAggs', 1), ('shards', 1), ('request', 1), ('executor', 1), ('isCanceled', 1), ('consumer', 1), ('latch', 1), ('phase', 1), ('withNested', 1), ('candidateList', 1)]
File: train_949
671
[('container', 2

In [7]:
code = []
iden = []
folder_path = 'Dataset/op-txt/' 
file_names = sorted(os.listdir(folder_path))
for file_name in file_names:
    if file_name.endswith(".txt"): 
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            print(file_name)
            a = file.read()
            if file_name.startswith("id"): 
                iden.append(a)
            else:
                code.append(a)
df = pd.DataFrame({'code': code, 'identifier': iden})

0_1.txt
0_2.txt
0_3.txt
100_1.txt
100_2.txt
100_3.txt
101_1.txt
101_2.txt
101_3.txt
102_1.txt
102_2.txt
103_1.txt
103_2.txt
103_3.txt
104_1.txt
105_1.txt
105_2.txt
106_1.txt
107_1.txt
108_1.txt
108_2.txt
108_3.txt
109_1.txt
10_1.txt
10_2.txt
10_3.txt
110_1.txt
110_2.txt
110_3.txt
111_1.txt
112_1.txt
113_1.txt
113_2.txt
114_1.txt
114_2.txt
115_1.txt
115_2.txt
115_3.txt
116_1.txt
116_2.txt
117_1.txt
118_1.txt
119_1.txt
119_2.txt
119_3.txt
11_1.txt
11_2.txt
120_1.txt
121_1.txt
121_2.txt
122_1.txt
123_1.txt
123_2.txt
124_1.txt
125_1.txt
126_1.txt
126_2.txt
127_1.txt
127_2.txt
128_1.txt
128_2.txt
128_3.txt
129_1.txt
12_1.txt
12_2.txt
12_3.txt
130_1.txt
130_2.txt
131_1.txt
131_2.txt
132_1.txt
132_2.txt
132_3.txt
133_1.txt
134_1.txt
134_2.txt
134_3.txt
135_1.txt
136_1.txt
137_1.txt
138_1.txt
138_2.txt
139_1.txt
13_1.txt
140_1.txt
140_2.txt
140_3.txt
141_1.txt
142_1.txt
142_2.txt
142_3.txt
143_1.txt
143_2.txt
144_1.txt
145_1.txt
145_2.txt
146_1.txt
146_2.txt
147_1.txt
148_1.txt
149_1.txt
14_1.

In [8]:
# new_df = pd.read_csv('dat.csv')
# concatenated_df = pd.concat([df, new_df])
df.to_csv('dat.csv',index=False)