|
use File::Slurp;
|
|
|
|
use utf8;
|
|
|
|
# Usage guard: the script expects exactly two command-line arguments
# (input text file, output text file). With any other argument count the
# two usage lines are printed to STDOUT and the script stops.
# NOTE(review): the exit status is 0 even though the invocation was wrong,
# so a calling script cannot detect the misuse from the status alone.
unless (@ARGV == 2) {
    print "\tArgv1 --> input text file\n",
          "\tArgv2 --> output text file\n";
    exit(0);
}
|
|
# Open the output file (second argument) for the first pass. The bareword
# handle `file` is kept because the character loop further down prints to it.
# A failed open is fatal: otherwise every later print would be lost silently.
open(file, ">", $ARGV[1])
    or die "Cannot open output file '$ARGV[1]' for writing: $!";

# Reference lists, one entry per line of the respective file.
@english     = split("\n", read_file("lists/english"));
@spl_chr     = split("\n", read_file("lists/spl_chr"));
@tamil       = split("\n", read_file("lists/tamil"));
@number_file = split("\n", read_file("lists/number"));

# The input text (first argument), split into single characters.
@text = split("", read_file($ARGV[0]));

# Per-category character counters filled by the counting loop.
$eng_cnt   = 0;    # characters found in lists/english
$spl_cnt   = 0;    # special characters
$space     = 0;    # space, tab, newline
$tamil_cnt = 0;    # everything that matched no other category
$num_cnt   = 0;    # characters found in lists/number
$i         = 0;    # characters that were run through the category checks
|
|
# Counting pass: classify every character of @text and bump the matching
# counter. $eng_cnt and $tamil_cnt are later used to pick the output language.
#
# Categories, checked in this order:
#   1. regex metacharacters        -> $spl_cnt (and $i is NOT incremented)
#   2. "$"                         -> $spl_cnt
#   3. space, tab, newline         -> $space
#   4. entry of lists/english      -> $eng_cnt
#   5. entry of lists/spl_chr, "-" or "/" -> $spl_cnt
#   6. entry of lists/number       -> $num_cnt
#   7. anything else               -> $tamil_cnt (lists/tamil is not consulted)
{
    # Exact-match lookup tables, built once instead of scanning each list
    # with a regex for every character of the input.
    my %in_english = map { $_ => 1 } @english;
    my %in_special = map { $_ => 1 } @spl_chr;
    my %in_number  = map { $_ => 1 } @number_file;

    my %is_meta = map { $_ => 1 }
        ("(", ")", "\\", ".", "^", "*", "+", "?", "{", "}", "[", "]", "|");

    Foreach1: foreach my $ch (@text)
    {
        if ($is_meta{$ch})
        {
            $spl_cnt++;
            next Foreach1;
        }

        if ($ch eq "\$")
        {
            $spl_cnt++;
        }
        elsif ($ch eq " " || $ch eq "\t" || $ch eq "\n")
        {
            $space++;
        }
        elsif ($in_english{$ch})
        {
            $eng_cnt++;
        }
        elsif ($in_special{$ch} || $ch eq "-" || $ch eq "/")
        {
            $spl_cnt++;
        }
        elsif ($in_number{$ch})
        {
            $num_cnt++;
        }
        else
        {
            $tamil_cnt++;
        }

        $i++;
    }
}
|
|
|
|
# Pass 1: character-by-character rewrite of @text into the `file` handle.
#
# Each character is looked at together with its left and right neighbour:
#   "$" / "+"                 -> the word for dollar / plus
#   ( ) \ ^ * ? { } [ ] |     -> one space
#   space, tab, newline       -> one space
#   "."                       -> unchanged
#   ","                       -> dropped between two digits, else unchanged
#   digit (not next to / - .) -> unchanged, with a space added before or
#                                after it depending on the neighbours
#   entry of lists/english    -> unchanged
#   entry of lists/spl_chr_map-> the mapped field(s) of that line
#   anything else             -> unchanged
#
# Output language: "tamil" when the counting pass saw any unclassified
# character ($tamil_cnt > 0), otherwise "english" when it saw English
# characters, otherwise "none".
@spl_chr_map = split("\n", read_file("lists/spl_chr_map"));

{
    my %is_english   = map { $_ => 1 } @english;
    my %prints_blank = map { $_ => 1 }
        ("(", ")", "\\", "^", "*", "?", "{", "}", "[", "]", "|",
         " ", "\t", "\n");
    my %joins_numbers = map { $_ => 1 } ("/", "-", ".");
    my $digit         = qr/^\d+?$/;

    my $mode = $tamil_cnt > 0 ? "tamil"
             : $eng_cnt   > 0 ? "english"
             :                  "none";

    my $dollar_word = $mode eq "english" ? " dollar " : " டாலர் ";
    my $plus_word   = $mode eq "english" ? " plus "   : " ப்ளஸ் ";

    # Text to emit for a matched spl_chr_map line, given its
    # whitespace-separated fields. Field 0 is the symbol itself.
    # NOTE(review): presumably the following fields are the English words
    # and then the Tamil words for the symbol - confirm against the list file.
    # Lines with more than five fields produce no output.
    my $symbol_words = sub {
        my ($lang, @field) = @_;
        my $n = scalar(@field);

        return ""  if $n < 1 || $n > 5;
        return " " if $n == 1;

        if ($lang eq "tamil")
        {
            return $field[1]     if $n == 2;
            return " $field[2] " if $n == 3;
            return " $field[3] " if $n == 4;
            return " $field[3] $field[4] ";
        }
        if ($lang eq "english")
        {
            return $field[1]     if $n == 2;
            return " $field[1] " if $n == 3;
            return " $field[1] $field[2] ";    # four or five fields
        }
        return $field[0] if $n == 2;
        return " $field[0] ";
    };

    for (my $j = 0; $j < scalar(@text); $j++)
    {
        my $txt = $text[$j];

        # Neighbours; empty at the text boundaries. (Index -1 would wrap
        # around to the last character of the text, so it is guarded.)
        my $prev = $j > 0                  ? $text[$j - 1] : "";
        my $nxt  = $j + 1 < scalar(@text)  ? $text[$j + 1] : "";

        if ($txt eq "\$")
        {
            print file $dollar_word;
        }
        elsif ($txt eq "+")
        {
            print file $plus_word;
        }
        elsif ($prints_blank{$txt})
        {
            print file " ";
        }
        elsif ($txt eq ".")
        {
            print file $txt;
        }
        elsif ($txt eq ",")
        {
            # A comma between two digits is removed; any other comma is kept.
            print file $txt unless ($prev =~ $digit && $nxt =~ $digit);
        }
        elsif ($txt =~ $digit && !$joins_numbers{$prev} && !$joins_numbers{$nxt})
        {
            my $prev_is_digit = ($prev =~ $digit) ? 1 : 0;
            my $nxt_is_digit  = ($nxt  =~ $digit) ? 1 : 0;
            my $prev_is_sep   = ($prev eq "." || $prev eq ",") ? 1 : 0;
            my $nxt_is_sep    = ($nxt  eq "." || $nxt  eq ",") ? 1 : 0;

            if ($nxt_is_sep && !$prev_is_digit)
            {
                # Lone digit directly before "." or ",": space in front.
                print file " $txt";
            }
            elsif (!$nxt_is_digit && !$nxt_is_sep && !$prev_is_sep)
            {
                # Last digit of a run that is not followed by "." or ",":
                # space behind.
                print file "$txt ";
            }
            else
            {
                print file $txt;
            }
        }
        elsif ($is_english{$txt})
        {
            print file $txt;
        }
        else
        {
            # A map line belongs to this character when it starts with the
            # character followed by one space. Compared literally, so no
            # input character can be misread as regex syntax.
            my @matches = grep { index($_, "$txt ") == 0 } @spl_chr_map;

            if (@matches)
            {
                my @map   = split(/\s+/, $matches[0]);
                my $words = $symbol_words->($mode, @map);
                print file $words;

                # After a symbol mapped to "rupees" the next two input
                # characters are skipped.
                # NOTE(review): presumably the rest of an abbreviation - confirm.
                if (defined($map[1]) && $map[1] eq "rupees")
                {
                    $j = $j + 2;
                }
            }
            else
            {
                print file $txt;
            }
        }
    }
}

close(file)
    or die "Cannot close output file '$ARGV[1]' after the character pass: $!";
|
|
|
|
|
|
# Pass 2: token-level clean-up of the output file.
#
# The file written so far is read back, split on whitespace and rewritten
# with every token followed by one space:
#   * tokens starting with a digit lose all commas; if the token ended in a
#     comma, ", " is appended again
#   * d<sep>d<sep>d with separators - / . or space:
#       both leading numbers > 12 -> "a b c"   (not a date)
#       second number > 12        -> "b/a/c"   (first two numbers swapped)
#       otherwise                 -> "a/b/c"
#   * d-d                         -> "a - b"
#   * d.d and tokens starting with "." -> unchanged
#   * other tokens containing digits: every occurrence of the first digit
#     run is wrapped in spaces
#   * everything else             -> unchanged
{
    my $path   = $ARGV[1];
    my @tokens = split(/\s+/, read_file($path));

    open(my $out, ">", $path)
        or die "Cannot reopen '$path' for writing: $!";

    # Leading whitespace in the file yields an empty first field; skip it.
    shift(@tokens) if (@tokens && $tokens[0] eq "");

    foreach my $token (@tokens)
    {
        my $word       = $token;
        my $first_char = substr($token, 0, 1);

        if ($first_char =~ /^\d+$/)
        {
            my $ends_with_comma = (substr($token, -1) eq ",");
            $word =~ tr/,//d;
            $word .= ", " if $ends_with_comma;
        }

        if (my ($a, $b, $c) = $word =~ m!^(\d+)[- /.](\d+)[- /.](\d+)$!)
        {
            # Numeric comparison: a string comparison would rank "5" above "12".
            if ($a > 12 && $b > 12)
            {
                print {$out} "$a $b $c ";
            }
            elsif ($b > 12)
            {
                print {$out} "$b/$a/$c ";
            }
            else
            {
                print {$out} "$a/$b/$c ";
            }
        }
        elsif (my ($from, $to) = $word =~ m!^(\d+)[-](\d+)$!)
        {
            print {$out} "$from - $to ";
        }
        elsif ($word =~ m!^(\d+)[.](\d+)$!)
        {
            print {$out} "$word ";
        }
        elsif ($word =~ m/(\d+)/ && $first_char eq ".")
        {
            print {$out} "$word ";
        }
        elsif ($word =~ m/(\d+)/)
        {
            my $used = $1;
            $word =~ s/$used/ $used /g;
            print {$out} "$word ";
        }
        else
        {
            print {$out} "$word ";
        }
    }

    close($out)
        or die "Cannot close '$path' after the second pass: $!";
}
|
|
|
|
# Surround every "-" in the output file with spaces. Done in-process so the
# file name is never handed to a shell (names with spaces or shell
# metacharacters stay safe) and a failure is reported instead of ignored.
{
    my $path    = $ARGV[1];
    my $content = read_file($path);

    $content =~ s/-/ - /g;

    open(my $out, ">", $path)
        or die "Cannot reopen '$path' for writing: $!";
    print {$out} $content;
    close($out)
        or die "Cannot close '$path' after spacing out dashes: $!";
}
|
|
|
|
|
|
|
|
|
|
# Pass 3: second token-level clean-up of the output file.
#
# The file is read back, split on whitespace and rewritten with every token
# followed by one space:
#   * tokens starting with a digit lose all commas; if the token ended in a
#     comma, ", " is appended again
#   * d<sep>d<sep>d with separators - / . or space:
#       both leading numbers > 12 -> "a b c"   (not a date)
#       second number > 12        -> "b/a/c"   (first two numbers swapped)
#       otherwise                 -> "a/b/c"
#   * d-d                         -> "a b"     (the dash is dropped)
#   * d.d and tokens starting with "." -> unchanged
#   * other tokens containing digits: every occurrence of the first digit
#     run is wrapped in spaces
#   * everything else             -> unchanged
{
    my $path   = $ARGV[1];
    my @tokens = split(/\s+/, read_file($path));

    open(my $out, ">", $path)
        or die "Cannot reopen '$path' for writing: $!";

    # Leading whitespace in the file yields an empty first field; skip it.
    shift(@tokens) if (@tokens && $tokens[0] eq "");

    foreach my $token (@tokens)
    {
        my $word       = $token;
        my $first_char = substr($token, 0, 1);

        if ($first_char =~ /^\d+$/)
        {
            my $ends_with_comma = (substr($token, -1) eq ",");
            $word =~ tr/,//d;
            $word .= ", " if $ends_with_comma;
        }

        if (my ($a, $b, $c) = $word =~ m!^(\d+)[- /.](\d+)[- /.](\d+)$!)
        {
            # Numeric comparison: a string comparison would rank "5" above "12".
            if ($a > 12 && $b > 12)
            {
                print {$out} "$a $b $c ";
            }
            elsif ($b > 12)
            {
                print {$out} "$b/$a/$c ";
            }
            else
            {
                print {$out} "$a/$b/$c ";
            }
        }
        elsif (my ($from, $to) = $word =~ m!^(\d+)[-](\d+)$!)
        {
            print {$out} "$from $to ";
        }
        elsif ($word =~ m!^(\d+)[.](\d+)$!)
        {
            print {$out} "$word ";
        }
        elsif ($word =~ m/(\d+)/ && $first_char eq ".")
        {
            print {$out} "$word ";
        }
        elsif ($word =~ m/(\d+)/)
        {
            my $used = $1;
            $word =~ s/$used/ $used /g;
            print {$out} "$word ";
        }
        else
        {
            print {$out} "$word ";
        }
    }

    close($out)
        or die "Cannot close '$path' after the third pass: $!";
}
|
|
|
|
# Whitespace normalisation: read the output file back, split it on runs of
# whitespace and rewrite it with every token followed by exactly one space.
{
    my $path   = $ARGV[1];
    my @tokens = split(/\s+/, read_file($path));

    open(my $out, ">", $path)
        or die "Cannot reopen '$path' for writing: $!";

    # Leading whitespace in the file yields an empty first field; skip it.
    shift(@tokens) if (@tokens && $tokens[0] eq "");

    foreach my $token (@tokens)
    {
        print {$out} "$token ";
    }

    close($out)
        or die "Cannot close '$path' after whitespace normalisation: $!";
}
|
|
# Final step: run scripts/replace_dot_by_sil.pl on the output file.
# The command is started with a list-form pipe open, so the file name is
# passed as a single argument and never interpreted by a shell. Whatever the
# script prints on STDOUT is read and discarded. A failure to start or a
# non-zero exit status is reported as a warning.
{
    my @command = ("perl", "scripts/replace_dot_by_sil.pl", $ARGV[1]);

    if (open(my $child, "-|", @command))
    {
        while (defined(my $discarded = <$child>))
        {
            # Output of the helper script is intentionally ignored.
        }
        close($child)
            or warn "'@command' did not finish cleanly (wait status $?)\n";
    }
    else
    {
        warn "Cannot run '@command': $!\n";
    }
}
|
|
|