diff --git a/images/tum-logo.png b/images/tum-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..0b39ac8f7127130306314286e8f317e08516ac4c Binary files /dev/null and b/images/tum-logo.png differ diff --git a/images/tum-logo2.png b/images/tum-logo2.png new file mode 100644 index 0000000000000000000000000000000000000000..0d0cee5907fbf25e755e6291ab332dee4ffef942 Binary files /dev/null and b/images/tum-logo2.png differ diff --git a/nltk_data/corpora/stopwords.zip b/nltk_data/corpora/stopwords.zip new file mode 100644 index 0000000000000000000000000000000000000000..10b599bfb769c064115e42cf156b1a6f293e7232 --- /dev/null +++ b/nltk_data/corpora/stopwords.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c94179887425ca1bedc265608cab9f27d650211f709bb929e320990a4b01d1 +size 34276 diff --git a/nltk_data/corpora/stopwords/README b/nltk_data/corpora/stopwords/README new file mode 100644 index 0000000000000000000000000000000000000000..debf14c8793a76a0ff71c568b23b3ac0c33e361d --- /dev/null +++ b/nltk_data/corpora/stopwords/README @@ -0,0 +1,32 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. 
+ +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + +The stop words for the Romanian language were obtained from: +http://arlc.ro/resources/ + +The English list has been augmented +https://github.com/nltk/nltk_data/issues/22 + +The German list has been corrected +https://github.com/nltk/nltk_data/pull/49 + +A Kazakh list has been added +https://github.com/nltk/nltk_data/pull/52 + +A Nepali list has been added +https://github.com/nltk/nltk_data/pull/83 + +An Azerbaijani list has been added +https://github.com/nltk/nltk_data/pull/100 + +A Greek list has been added +https://github.com/nltk/nltk_data/pull/103 + +An Indonesian list has been added +https://github.com/nltk/nltk_data/pull/112 diff --git a/nltk_data/corpora/stopwords/arabic b/nltk_data/corpora/stopwords/arabic new file mode 100644 index 0000000000000000000000000000000000000000..ee3a26e793f8ab69a44b30332708cc122b258b51 --- /dev/null +++ b/nltk_data/corpora/stopwords/arabic @@ -0,0 +1,754 @@ +إذ +إذا +إذما +إذن +أف +أقل +أكثر +ألا +إلا +التي +الذي +الذين +اللاتي +اللائي +اللتان +اللتيا +اللتين +اللذان +اللذين +اللواتي +إلى +إليك +إليكم +إليكما +إليكن +أم +أما +أما +إما +أن +إن +إنا +أنا +أنت +أنتم +أنتما +أنتن +إنما +إنه +أنى +أنى +آه +آها +أو +أولاء +أولئك +أوه +آي +أي +أيها +إي +أين +أين +أينما +إيه +بخ +بس +بعد +بعض +بك +بكم +بكم +بكما +بكن +بل +بلى +بما +بماذا +بمن +بنا +به +بها +بهم +بهما +بهن +بي +بين +بيد +تلك +تلكم +تلكما +ته +تي +تين +تينك +ثم +ثمة +حاشا +حبذا +حتى +حيث +حيثما +حين +خلا +دون +ذا +ذات +ذاك +ذان +ذانك +ذلك +ذلكم +ذلكما +ذلكن +ذه +ذو +ذوا +ذواتا +ذواتي +ذي +ذين +ذينك +ريث +سوف +سوى +شتان +عدا +عسى +عل +على +عليك +عليه +عما +عن +عند +غير +فإذا +فإن +فلا +فمن +في +فيم +فيما +فيه +فيها +قد +كأن +كأنما +كأي +كأين +كذا +كذلك +كل +كلا +كلاهما +كلتا +كلما +كليكما +كليهما +كم +كم +كما +كي +كيت +كيف +كيفما +لا +لاسيما +لدى +لست +لستم +لستما +لستن +لسن +لسنا +لعل +لك +لكم +لكما +لكن +لكنما +لكي +لكيلا +لم +لما +لن +لنا +له +لها +لهم +لهما 
+لهن +لو +لولا +لوما +لي +لئن +ليت +ليس +ليسا +ليست +ليستا +ليسوا +ما +ماذا +متى +مذ +مع +مما +ممن +من +منه +منها +منذ +مه +مهما +نحن +نحو +نعم +ها +هاتان +هاته +هاتي +هاتين +هاك +هاهنا +هذا +هذان +هذه +هذي +هذين +هكذا +هل +هلا +هم +هما +هن +هنا +هناك +هنالك +هو +هؤلاء +هي +هيا +هيت +هيهات +والذي +والذين +وإذ +وإذا +وإن +ولا +ولكن +ولو +وما +ومن +وهو +يا +أبٌ +أخٌ +حمٌ +فو +أنتِ +يناير +فبراير +مارس +أبريل +مايو +يونيو +يوليو +أغسطس +سبتمبر +أكتوبر +نوفمبر +ديسمبر +جانفي +فيفري +مارس +أفريل +ماي +جوان +جويلية +أوت +كانون +شباط +آذار +نيسان +أيار +حزيران +تموز +آب +أيلول +تشرين +دولار +دينار +ريال +درهم +ليرة +جنيه +قرش +مليم +فلس +هللة +سنتيم +يورو +ين +يوان +شيكل +واحد +اثنان +ثلاثة +أربعة +خمسة +ستة +سبعة +ثمانية +تسعة +عشرة +أحد +اثنا +اثني +إحدى +ثلاث +أربع +خمس +ست +سبع +ثماني +تسع +عشر +ثمان +سبت +أحد +اثنين +ثلاثاء +أربعاء +خميس +جمعة +أول +ثان +ثاني +ثالث +رابع +خامس +سادس +سابع +ثامن +تاسع +عاشر +حادي +أ +ب +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ي +ء +ى +آ +ؤ +ئ +أ +ة +ألف +باء +تاء +ثاء +جيم +حاء +خاء +دال +ذال +راء +زاي +سين +شين +صاد +ضاد +طاء +ظاء +عين +غين +فاء +قاف +كاف +لام +ميم +نون +هاء +واو +ياء +همزة +ي +نا +ك +كن +ه +إياه +إياها +إياهما +إياهم +إياهن +إياك +إياكما +إياكم +إياك +إياكن +إياي +إيانا +أولالك +تانِ +تانِك +تِه +تِي +تَيْنِ +ثمّ +ثمّة +ذانِ +ذِه +ذِي +ذَيْنِ +هَؤلاء +هَاتانِ +هَاتِه +هَاتِي +هَاتَيْنِ +هَذا +هَذانِ +هَذِه +هَذِي +هَذَيْنِ +الألى +الألاء +أل +أنّى +أيّ +ّأيّان +أنّى +أيّ +ّأيّان +ذيت +كأيّ +كأيّن +بضع +فلان +وا +آمينَ +آهِ +آهٍ +آهاً +أُفٍّ +أُفٍّ +أفٍّ +أمامك +أمامكَ +أوّهْ +إلَيْكَ +إلَيْكَ +إليكَ +إليكنّ +إيهٍ +بخٍ +بسّ +بَسْ +بطآن +بَلْهَ +حاي +حَذارِ +حيَّ +حيَّ +دونك +رويدك +سرعان +شتانَ +شَتَّانَ +صهْ +صهٍ +طاق +طَق +عَدَسْ +كِخ +مكانَك +مكانَك +مكانَك +مكانكم +مكانكما +مكانكنّ +نَخْ +هاكَ +هَجْ +هلم +هيّا +هَيْهات +وا +واهاً +وراءَك +وُشْكَانَ +وَيْ +يفعلان +تفعلان +يفعلون +تفعلون +تفعلين +اتخذ +ألفى +تخذ +ترك +تعلَّم +جعل +حجا +حبيب +خال +حسب +خال +درى +رأى +زعم +صبر +ظنَّ 
+عدَّ +علم +غادر +ذهب +وجد +ورد +وهب +أسكن +أطعم +أعطى +رزق +زود +سقى +كسا +أخبر +أرى +أعلم +أنبأ +حدَث +خبَّر +نبَّا +أفعل به +ما أفعله +بئس +ساء +طالما +قلما +لات +لكنَّ +ءَ +أجل +إذاً +أمّا +إمّا +إنَّ +أنًّ +أى +إى +أيا +ب +ثمَّ +جلل +جير +رُبَّ +س +علًّ +ف +كأنّ +كلَّا +كى +ل +لات +لعلَّ +لكنَّ +لكنَّ +م +نَّ +هلّا +وا +أل +إلّا +ت +ك +لمّا +ن +ه +و +ا +ي +تجاه +تلقاء +جميع +حسب +سبحان +شبه +لعمر +مثل +معاذ +أبو +أخو +حمو +فو +مئة +مئتان +ثلاثمئة +أربعمئة +خمسمئة +ستمئة +سبعمئة +ثمنمئة +تسعمئة +مائة +ثلاثمائة +أربعمائة +خمسمائة +ستمائة +سبعمائة +ثمانمئة +تسعمائة +عشرون +ثلاثون +اربعون +خمسون +ستون +سبعون +ثمانون +تسعون +عشرين +ثلاثين +اربعين +خمسين +ستين +سبعين +ثمانين +تسعين +بضع +نيف +أجمع +جميع +عامة +عين +نفس +لا سيما +أصلا +أهلا +أيضا +بؤسا +بعدا +بغتة +تعسا +حقا +حمدا +خلافا +خاصة +دواليك +سحقا +سرا +سمعا +صبرا +صدقا +صراحة +طرا +عجبا +عيانا +غالبا +فرادى +فضلا +قاطبة +كثيرا +لبيك +معاذ +أبدا +إزاء +أصلا +الآن +أمد +أمس +آنفا +آناء +أنّى +أول +أيّان +تارة +ثمّ +ثمّة +حقا +صباح +مساء +ضحوة +عوض +غدا +غداة +قطّ +كلّما +لدن +لمّا +مرّة +قبل +خلف +أمام +فوق +تحت +يمين +شمال +ارتدّ +استحال +أصبح +أضحى +آض +أمسى +انقلب +بات +تبدّل +تحوّل +حار +رجع +راح +صار +ظلّ +عاد +غدا +كان +ما انفك +ما برح +مادام +مازال +مافتئ +ابتدأ +أخذ +اخلولق +أقبل +انبرى +أنشأ +أوشك +جعل +حرى +شرع +طفق +علق +قام +كرب +كاد +هبّ \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/azerbaijani b/nltk_data/corpora/stopwords/azerbaijani new file mode 100644 index 0000000000000000000000000000000000000000..27bf2940fe608d8a0698a7b896cd1fd22ad64ac9 --- /dev/null +++ b/nltk_data/corpora/stopwords/azerbaijani @@ -0,0 +1,165 @@ +a +ad +altı +altmış +amma +arasında +artıq +ay +az +bax +belə +bəli +bəlkə +beş +bəy +bəzən +bəzi +bilər +bir +biraz +biri +birşey +biz +bizim +bizlər +bu +buna +bundan +bunların +bunu +bunun +buradan +bütün +ci +cı +çox +cu +cü +çünki +da +daha +də +dedi +dək +dən +dəqiqə +deyil +dir +doqquz +doqsan +dörd +düz +ə +edən +edir +əgər +əlbəttə +elə +əlli +ən 
+əslində +et +etdi +etmə +etmək +faiz +gilə +görə +ha +haqqında +harada +hə +heç +həm +həmin +həmişə +hər +ı +idi +iki +il +ildə +ilə +ilk +in +indi +isə +istifadə +iyirmi +ki +kim +kimə +kimi +lakin +lap +məhz +mən +mənə +mirşey +nə +nəhayət +niyə +o +obirisi +of +olan +olar +olaraq +oldu +olduğu +olmadı +olmaz +olmuşdur +olsun +olur +on +ona +ondan +onlar +onlardan +onların +onsuzda +onu +onun +oradan +otuz +öz +özü +qarşı +qədər +qırx +saat +sadəcə +saniyə +səhv +səkkiz +səksən +sən +sənə +sənin +siz +sizin +sizlər +sonra +təəssüf +ü +üç +üçün +var +və +xan +xanım +xeyr +ya +yalnız +yaxşı +yeddi +yenə +yəni +yetmiş +yox +yoxdur +yoxsa +yüz +zaman \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/basque b/nltk_data/corpora/stopwords/basque new file mode 100644 index 0000000000000000000000000000000000000000..3b84c32419b346f0606212affb50f08f3d226194 --- /dev/null +++ b/nltk_data/corpora/stopwords/basque @@ -0,0 +1,326 @@ +ahala +aitzitik +al +ala +alabadere +alabaina +alabaina +aldiz +alta +amaitu +amaitzeko +anitz +antzina +arabera +arabera +arabera +argi +arratsaldero +arte +artean +asko +aspaldiko +aurrera +aurrera +azkenez +azkenik +azkenik +ba +bada +bada +bada +bada +badarik +badarik +badarik +badere +bai +baina +baina +baina +baino +baino +baino +baino +baita +baizik +baldin +baldin +barren +bat +batean +batean +batean +batean +batek +baten +batera +batez +bati +batzuei +batzuek +batzuetan +batzuk +bazen +bederen +bederik +beharrez +behiala +behin +behin +behin +behin +behinik +behinola +behintzat +bera +beraiek +beranduago +berau +berauek +beraz +beraz +bere +berean +berebat +berehala +berori +beroriek +berriro +berriz +bertzalde +bertzenaz +bestalde +beste +bestela +besterik +bezain +bezala +bide +bien +bigarrenez +bigarrenik +bitartean +bitartean +bizkitartean +bukaeran +bukatzeko +da +dago +dago +dela +dela +dela +delarik +den +dena +dena +dezadan +dira +ditu +du +dute +edo +edo +edota +egin +egin +egun +egun +egunean +emateko +era 
+erdi +ere +ere +ere +ere +ere +esan +esan +esanak +esandakoaren +eta +eta +eta +eta +eta +eta +eurak +ez +ez +ez +eze +ezen +ezer +ezezik +ezik +ezpabere +ezpada +ezpere +ezperen +ezta +funtsean +gabe +gain +gainera +gainera +gainerontzean +gaur +gero +gero +gero +geroago +gisa +gu +gutxi +guzti +guztia +guztiz +haatik +haiei +haiek +haietan +hain +hainbeste +hainbestez +hala +hala +hala +halaber +halako +halatan +han +handik +hango +hara +hargatik +hari +hark +hartan +hartan +hasi +hasi +hasiera +hasieran +hasteaz +hasteko +hasteko +hau +hau +hau +hau +hau +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honebestez +honek +honela +honela +honela +honen +honen +honetan +honetaz +honi +hor +hori +hori +hori +horiei +horiek +horietan +horko +horra +horratik +horregatik +horregatik +horrek +horrela +horrela +horrela +horren +horrenbestez +horretan +horri +hortaz +hortaz +hortik +hura +ikusi +ikusi +izan +izan +izan +jarraituz +kariaz +kasuaz +kontuan +laburbilduz +laburki +laster +laster +lehen +lehen +lehen +lehen +lehenengo +lehenengo +lehenik +lehen-lehenik +litzateke +medio +mendean +mundura +nahiz +ni +noiz +nola +non +nondik +nongo +nor +nora +on +ondoren +ondorio +ondorioz +ondorioz +orain +ordea +orduan +orduan +orduan +orduko +ordura +orobat +ostean +ostera +osterantzean +pentsatuz +ustez +ze +zein +zein +zen +zen +zenbait +zenbat +zer +zeren +zergatik +zergatik +ziren +zituen +zu +zuek +zuen +zuten +zuzen diff --git a/nltk_data/corpora/stopwords/bengali b/nltk_data/corpora/stopwords/bengali new file mode 100644 index 0000000000000000000000000000000000000000..9dc1bfcb2a87c09952a3e61c4c365612c4adccd2 --- /dev/null +++ b/nltk_data/corpora/stopwords/bengali @@ -0,0 +1,398 @@ +অতএব +অথচ +অথবা +অনুযায়ী +অনেক +অনেকে +অনেকেই +অন্তত +অন্য +অবধি +অবশ্য +অর্থাত +আই +আগামী +আগে +আগেই +আছে +আজ +আদ্যভাগে +আপনার +আপনি +আবার +আমরা +আমাকে +আমাদের +আমার +আমি +আর +আরও +ই +ইত্যাদি +ইহা +উচিত +উত্তর +উনি +উপর +উপরে +এ +এঁদের +এঁরা +এই +একই +একটি +একবার +একে 
+এক্ +এখন +এখনও +এখানে +এখানেই +এটা +এটাই +এটি +এত +এতটাই +এতে +এদের +এব +এবং +এবার +এমন +এমনকী +এমনি +এর +এরা +এল +এস +এসে +ঐ +ও +ওঁদের +ওঁর +ওঁরা +ওই +ওকে +ওখানে +ওদের +ওর +ওরা +কখনও +কত +কবে +কমনে +কয়েক +কয়েকটি +করছে +করছেন +করতে +করবে +করবেন +করলে +করলেন +করা +করাই +করায় +করার +করি +করিতে +করিয়া +করিয়ে +করে +করেই +করেছিলেন +করেছে +করেছেন +করেন +কাউকে +কাছ +কাছে +কাজ +কাজে +কারও +কারণ +কি +কিংবা +কিছু +কিছুই +কিন্তু +কী +কে +কেউ +কেউই +কেখা +কেন +কোটি +কোন +কোনও +কোনো +ক্ষেত্রে +কয়েক +খুব +গিয়ে +গিয়েছে +গিয়ে +গুলি +গেছে +গেল +গেলে +গোটা +চলে +চান +চায় +চার +চালু +চেয়ে +চেষ্টা +ছাড়া +ছাড়াও +ছিল +ছিলেন +জন +জনকে +জনের +জন্য +জন্যওজে +জানতে +জানা +জানানো +জানায় +জানিয়ে +জানিয়েছে +জে +জ্নজন +টি +ঠিক +তখন +তত +তথা +তবু +তবে +তা +তাঁকে +তাঁদের +তাঁর +তাঁরা +তাঁাহারা +তাই +তাও +তাকে +তাতে +তাদের +তার +তারপর +তারা +তারৈ +তাহলে +তাহা +তাহাতে +তাহার +তিনঐ +তিনি +তিনিও +তুমি +তুলে +তেমন +তো +তোমার +থাকবে +থাকবেন +থাকা +থাকায় +থাকে +থাকেন +থেকে +থেকেই +থেকেও +দিকে +দিতে +দিন +দিয়ে +দিয়েছে +দিয়েছেন +দিলেন +দু +দুই +দুটি +দুটো +দেওয়া +দেওয়ার +দেওয়া +দেখতে +দেখা +দেখে +দেন +দেয় +দ্বারা +ধরা +ধরে +ধামার +নতুন +নয় +না +নাই +নাকি +নাগাদ +নানা +নিজে +নিজেই +নিজেদের +নিজের +নিতে +নিয়ে +নিয়ে +নেই +নেওয়া +নেওয়ার +নেওয়া +নয় +পক্ষে +পর +পরে +পরেই +পরেও +পর্যন্ত +পাওয়া +পাচ +পারি +পারে +পারেন +পি +পেয়ে +পেয়্র্ +প্রতি +প্রথম +প্রভৃতি +প্রযন্ত +প্রাথমিক +প্রায় +প্রায় +ফলে +ফিরে +ফের +বক্তব্য +বদলে +বন +বরং +বলতে +বলল +বললেন +বলা +বলে +বলেছেন +বলেন +বসে +বহু +বা +বাদে +বার +বি +বিনা +বিভিন্ন +বিশেষ +বিষয়টি +বেশ +বেশি +ব্যবহার +ব্যাপারে +ভাবে +ভাবেই +মতো +মতোই +মধ্যভাগে +মধ্যে +মধ্যেই +মধ্যেও +মনে +মাত্র +মাধ্যমে +মোট +মোটেই +যখন +যত +যতটা +যথেষ্ট +যদি +যদিও +যা +যাঁর +যাঁরা +যাওয়া +যাওয়ার +যাওয়া +যাকে +যাচ্ছে +যাতে +যাদের +যান +যাবে +যায় +যার +যারা +যিনি +যে +যেখানে +যেতে +যেন +যেমন +র +রকম +রয়েছে +রাখা +রেখে +লক্ষ +শুধু +শুরু +সঙ্গে +সঙ্গেও +সব +সবার +সমস্ত +সম্প্রতি +সহ +সহিত +সাধারণ +সামনে +সি +সুতরাং +সে +সেই +সেখান +সেখানে +সেটা +সেটাই +সেটাও 
+সেটি +স্পষ্ট +স্বয়ং +হইতে +হইবে +হইয়া +হওয়া +হওয়ায় +হওয়ার +হচ্ছে +হত +হতে +হতেই +হন +হবে +হবেন +হয় +হয়তো +হয়নি +হয়ে +হয়েই +হয়েছিল +হয়েছে +হয়েছেন +হল +হলে +হলেই +হলেও +হলো +হাজার +হিসাবে +হৈলে +হোক +হয় \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/catalan b/nltk_data/corpora/stopwords/catalan new file mode 100644 index 0000000000000000000000000000000000000000..cdba332cb94448bf8b93d41e5d58e2b6f447211f --- /dev/null +++ b/nltk_data/corpora/stopwords/catalan @@ -0,0 +1,278 @@ +a +abans +ací +ah +així +això +al +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +als +altra +altre +altres +amb +ambdues +ambdós +anar +ans +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +bastant +bé +cada +cadascuna +cadascunes +cadascuns +cadascú +com +consegueixo +conseguim +conseguir +consigueix +consigueixen +consigueixes +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +des de +després +dins +dintre +donat +doncs +durant +e +eh +el +elles +ells +els +em +en +encara +ens +entre +era +erem +eren +eres +es +esta +estan +estat +estava +estaven +estem +esteu +estic +està +estàvem +estàveu +et +etc +ets +fa +faig +fan +fas +fem +fer +feu +fi +fins +fora +gairebé +ha +han +has +haver +havia +he +hem +heu +hi +ho +i +igual +iguals +inclòs +ja +jo +l'hi +la +les +li +li'n +llarg +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +meu +meus +meva +meves +mode +molt +molta +moltes +molts +mon +mons +més +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +per que +perquè +però +poc +poca +pocs +podem +poden +poder +podeu +poques +potser +primer +propi +puc +qual +quals +quan +quant +que +quelcom +qui +quin +quina +quines +quins +què +s'ha +s'han +sa +sabem +saben +saber +sabeu +sap +saps +semblant +semblants +sense +ser +ses +seu +seus +seva +seves +si +sobre +sobretot +soc +solament +sols 
+som +son +sons +sota +sou +sóc +són +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +te +tene +tenim +tenir +teniu +teu +teus +teva +teves +tinc +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres +érem +éreu +és +éssent +últim +ús \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/chinese b/nltk_data/corpora/stopwords/chinese new file mode 100644 index 0000000000000000000000000000000000000000..0873a9033a24948ae253085187c882418b5b3aaa --- /dev/null +++ b/nltk_data/corpora/stopwords/chinese @@ -0,0 +1,841 @@ +一 +一下 +一些 +一切 +一则 +一天 +一定 +一方面 +一旦 +一时 +一来 +一样 +一次 +一片 +一直 +一致 +一般 +一起 +一边 +一面 +万一 +上下 +上升 +上去 +上来 +上述 +上面 +下列 +下去 +下来 +下面 +不一 +不久 +不仅 +不会 +不但 +不光 +不单 +不变 +不只 +不可 +不同 +不够 +不如 +不得 +不怕 +不惟 +不成 +不拘 +不敢 +不断 +不是 +不比 +不然 +不特 +不独 +不管 +不能 +不要 +不论 +不足 +不过 +不问 +与 +与其 +与否 +与此同时 +专门 +且 +两者 +严格 +严重 +个 +个人 +个别 +中小 +中间 +丰富 +临 +为 +为主 +为了 +为什么 +为什麽 +为何 +为着 +主张 +主要 +举行 +乃 +乃至 +么 +之 +之一 +之前 +之后 +之後 +之所以 +之类 +乌乎 +乎 +乘 +也 +也好 +也是 +也罢 +了 +了解 +争取 +于 +于是 +于是乎 +云云 +互相 +产生 +人们 +人家 +什么 +什么样 +什麽 +今后 +今天 +今年 +今後 +仍然 +从 +从事 +从而 +他 +他人 +他们 +他的 +代替 +以 +以上 +以下 +以为 +以便 +以免 +以前 +以及 +以后 +以外 +以後 +以来 +以至 +以至于 +以致 +们 +任 +任何 +任凭 +任务 +企图 +伟大 +似乎 +似的 +但 +但是 +何 +何况 +何处 +何时 +作为 +你 +你们 +你的 +使得 +使用 +例如 +依 +依照 +依靠 +促进 +保持 +俺 +俺们 +倘 +倘使 +倘或 +倘然 +倘若 +假使 +假如 +假若 +做到 +像 +允许 +充分 +先后 +先後 +先生 +全部 +全面 +兮 +共同 +关于 +其 +其一 +其中 +其二 +其他 +其余 +其它 +其实 +其次 +具体 +具体地说 +具体说来 +具有 +再者 +再说 +冒 +冲 +决定 +况且 +准备 +几 +几乎 +几时 +凭 +凭借 +出去 +出来 +出现 +分别 +则 +别 +别的 +别说 +到 +前后 +前者 +前进 +前面 +加之 +加以 +加入 +加强 +十分 +即 +即令 +即使 +即便 +即或 +即若 +却不 +原来 +又 +及 +及其 +及时 +及至 +双方 +反之 +反应 +反映 +反过来 +反过来说 +取得 +受到 +变成 +另 +另一方面 +另外 +只是 +只有 +只要 +只限 +叫 +叫做 +召开 +叮咚 +可 +可以 +可是 +可能 +可见 +各 +各个 +各人 +各位 +各地 +各种 +各级 +各自 +合理 +同 +同一 +同时 +同样 +后来 +后面 +向 +向着 +吓 +吗 +否则 +吧 +吧哒 +吱 +呀 +呃 +呕 +呗 +呜 +呜呼 +呢 +周围 +呵 +呸 +呼哧 +咋 +和 +咚 +咦 +咱 +咱们 +咳 +哇 +哈 +哈哈 +哉 +哎 +哎呀 +哎哟 +哗 +哟 +哦 +哩 +哪 +哪个 +哪些 +哪儿 +哪天 +哪年 +哪怕 +哪样 +哪边 +哪里 +哼 +哼唷 +唉 +啊 +啐 +啥 +啦 +啪达 +喂 +喏 +喔唷 +嗡嗡 +嗬 +嗯 +嗳 +嘎 +嘎登 +嘘 +嘛 +嘻 
+嘿 +因 +因为 +因此 +因而 +固然 +在 +在下 +地 +坚决 +坚持 +基本 +处理 +复杂 +多 +多少 +多数 +多次 +大力 +大多数 +大大 +大家 +大批 +大约 +大量 +失去 +她 +她们 +她的 +好的 +好象 +如 +如上所述 +如下 +如何 +如其 +如果 +如此 +如若 +存在 +宁 +宁可 +宁愿 +宁肯 +它 +它们 +它们的 +它的 +安全 +完全 +完成 +实现 +实际 +宣布 +容易 +密切 +对 +对于 +对应 +将 +少数 +尔后 +尚且 +尤其 +就 +就是 +就是说 +尽 +尽管 +属于 +岂但 +左右 +巨大 +巩固 +己 +已经 +帮助 +常常 +并 +并不 +并不是 +并且 +并没有 +广大 +广泛 +应当 +应用 +应该 +开外 +开始 +开展 +引起 +强烈 +强调 +归 +当 +当前 +当时 +当然 +当着 +形成 +彻底 +彼 +彼此 +往 +往往 +待 +後来 +後面 +得 +得出 +得到 +心里 +必然 +必要 +必须 +怎 +怎么 +怎么办 +怎么样 +怎样 +怎麽 +总之 +总是 +总的来看 +总的来说 +总的说来 +总结 +总而言之 +恰恰相反 +您 +意思 +愿意 +慢说 +成为 +我 +我们 +我的 +或 +或是 +或者 +战斗 +所 +所以 +所有 +所谓 +打 +扩大 +把 +抑或 +拿 +按 +按照 +换句话说 +换言之 +据 +掌握 +接着 +接著 +故 +故此 +整个 +方便 +方面 +旁人 +无宁 +无法 +无论 +既 +既是 +既然 +时候 +明显 +明确 +是 +是否 +是的 +显然 +显著 +普通 +普遍 +更加 +曾经 +替 +最后 +最大 +最好 +最後 +最近 +最高 +有 +有些 +有关 +有利 +有力 +有所 +有效 +有时 +有点 +有的 +有着 +有著 +望 +朝 +朝着 +本 +本着 +来 +来着 +极了 +构成 +果然 +果真 +某 +某个 +某些 +根据 +根本 +欢迎 +正在 +正如 +正常 +此 +此外 +此时 +此间 +毋宁 +每 +每个 +每天 +每年 +每当 +比 +比如 +比方 +比较 +毫不 +没有 +沿 +沿着 +注意 +深入 +清楚 +满足 +漫说 +焉 +然则 +然后 +然後 +然而 +照 +照着 +特别是 +特殊 +特点 +现代 +现在 +甚么 +甚而 +甚至 +用 +由 +由于 +由此可见 +的 +的话 +目前 +直到 +直接 +相似 +相信 +相反 +相同 +相对 +相对而言 +相应 +相当 +相等 +省得 +看出 +看到 +看来 +看看 +看见 +真是 +真正 +着 +着呢 +矣 +知道 +确定 +离 +积极 +移动 +突出 +突然 +立即 +第 +等 +等等 +管 +紧接着 +纵 +纵令 +纵使 +纵然 +练习 +组成 +经 +经常 +经过 +结合 +结果 +给 +绝对 +继续 +继而 +维持 +综上所述 +罢了 +考虑 +者 +而 +而且 +而况 +而外 +而已 +而是 +而言 +联系 +能 +能否 +能够 +腾 +自 +自个儿 +自从 +自各儿 +自家 +自己 +自身 +至 +至于 +良好 +若 +若是 +若非 +范围 +莫若 +获得 +虽 +虽则 +虽然 +虽说 +行为 +行动 +表明 +表示 +被 +要 +要不 +要不是 +要不然 +要么 +要是 +要求 +规定 +觉得 +认为 +认真 +认识 +让 +许多 +论 +设使 +设若 +该 +说明 +诸位 +谁 +谁知 +赶 +起 +起来 +起见 +趁 +趁着 +越是 +跟 +转动 +转变 +转贴 +较 +较之 +边 +达到 +迅速 +过 +过去 +过来 +运用 +还是 +还有 +这 +这个 +这么 +这么些 +这么样 +这么点儿 +这些 +这会儿 +这儿 +这就是说 +这时 +这样 +这点 +这种 +这边 +这里 +这麽 +进入 +进步 +进而 +进行 +连 +连同 +适应 +适当 +适用 +逐步 +逐渐 +通常 +通过 +造成 +遇到 +遭到 +避免 +那 +那个 +那么 +那么些 +那么样 +那些 +那会儿 +那儿 +那时 +那样 +那边 +那里 +那麽 +部分 +鄙人 +采取 +里面 +重大 +重新 +重要 +鉴于 +问题 +防止 +阿 +附近 +限制 +除 +除了 +除此之外 +除非 +随 +随着 +随著 +集中 +需要 +非但 +非常 +非徒 +靠 +顺 +顺着 +首先 +高兴 +是不是 diff --git a/nltk_data/corpora/stopwords/danish b/nltk_data/corpora/stopwords/danish new file mode 100644 index 
0000000000000000000000000000000000000000..d3edc6757912e3d83acfa241859c4a1bdb6005ad --- /dev/null +++ b/nltk_data/corpora/stopwords/danish @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/nltk_data/corpora/stopwords/dutch b/nltk_data/corpora/stopwords/dutch new file mode 100644 index 0000000000000000000000000000000000000000..cafa0324b53763f7efadda5b0f3d0321ffa7ab38 --- /dev/null +++ b/nltk_data/corpora/stopwords/dutch @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/nltk_data/corpora/stopwords/english b/nltk_data/corpora/stopwords/english new file mode 100644 index 0000000000000000000000000000000000000000..1280aa3b08d64b846272b2d23314377200db36e9 --- /dev/null +++ b/nltk_data/corpora/stopwords/english @@ -0,0 +1,179 @@ +i +me +my +myself +we +our +ours +ourselves +you +you're +you've +you'll +you'd +your +yours +yourself +yourselves +he +him +his +himself +she +she's +her +hers +herself +it +it's +its +itself +they +them +their +theirs +themselves +what 
+which +who +whom +this +that +that'll +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +s +t +can +will +just +don +don't +should +should've +now +d +ll +m +o +re +ve +y +ain +aren +aren't +couldn +couldn't +didn +didn't +doesn +doesn't +hadn +hadn't +hasn +hasn't +haven +haven't +isn +isn't +ma +mightn +mightn't +mustn +mustn't +needn +needn't +shan +shan't +shouldn +shouldn't +wasn +wasn't +weren +weren't +won +won't +wouldn +wouldn't diff --git a/nltk_data/corpora/stopwords/finnish b/nltk_data/corpora/stopwords/finnish new file mode 100644 index 0000000000000000000000000000000000000000..47ee200f6781ddfdb31af37e317e1f05d10a04ff --- /dev/null +++ b/nltk_data/corpora/stopwords/finnish @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä 
+siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/nltk_data/corpora/stopwords/french b/nltk_data/corpora/stopwords/french new file mode 100644 index 0000000000000000000000000000000000000000..00af58791541ed63f33c98207199f6b32727f571 --- /dev/null +++ b/nltk_data/corpora/stopwords/french @@ -0,0 +1,157 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +ils +je +la +le +les +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons 
+ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/nltk_data/corpora/stopwords/german b/nltk_data/corpora/stopwords/german new file mode 100644 index 0000000000000000000000000000000000000000..c2241d0ee7bbc192abe3b56efffb75f29a29e185 --- /dev/null +++ b/nltk_data/corpora/stopwords/german @@ -0,0 +1,232 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +dass +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unsere +unserem +unseren +unser +unseres +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/nltk_data/corpora/stopwords/greek b/nltk_data/corpora/stopwords/greek 
new file mode 100644 index 0000000000000000000000000000000000000000..9d08b144659c4ae804bb84a624e1248bb2b90e40 --- /dev/null +++ b/nltk_data/corpora/stopwords/greek @@ -0,0 +1,265 @@ +αλλα +αν +αντι +απο +αυτα +αυτεσ +αυτη +αυτο +αυτοι +αυτοσ +αυτουσ +αυτων +αἱ +αἳ +αἵ +αὐτόσ +αὐτὸς +αὖ +γάρ +γα +γα^ +γε +για +γοῦν +γὰρ +δ' +δέ +δή +δαί +δαίσ +δαὶ +δαὶς +δε +δεν +δι' +διά +διὰ +δὲ +δὴ +δ’ +εαν +ειμαι +ειμαστε +ειναι +εισαι +ειστε +εκεινα +εκεινεσ +εκεινη +εκεινο +εκεινοι +εκεινοσ +εκεινουσ +εκεινων +ενω +επ +επι +εἰ +εἰμί +εἰμὶ +εἰς +εἰσ +εἴ +εἴμι +εἴτε +η +θα +ισωσ +κ +καί +καίτοι +καθ +και +κατ +κατά +κατα +κατὰ +καὶ +κι +κἀν +κἂν +μέν +μή +μήτε +μα +με +μεθ +μετ +μετά +μετα +μετὰ +μη +μην +μἐν +μὲν +μὴ +μὴν +να +ο +οι +ομωσ +οπωσ +οσο +οτι +οἱ +οἳ +οἷς +οὐ +οὐδ +οὐδέ +οὐδείσ +οὐδεὶς +οὐδὲ +οὐδὲν +οὐκ +οὐχ +οὐχὶ +οὓς +οὔτε +οὕτω +οὕτως +οὕτωσ +οὖν +οὗ +οὗτος +οὗτοσ +παρ +παρά +παρα +παρὰ +περί +περὶ +ποια +ποιεσ +ποιο +ποιοι +ποιοσ +ποιουσ +ποιων +ποτε +που +ποῦ +προ +προσ +πρόσ +πρὸ +πρὸς +πως +πωσ +σε +στη +στην +στο +στον +σόσ +σύ +σύν +σὸς +σὺ +σὺν +τά +τήν +τί +τίς +τίσ +τα +ταῖς +τε +την +τησ +τι +τινα +τις +τισ +το +τοί +τοι +τοιοῦτος +τοιοῦτοσ +τον +τοτε +του +τούσ +τοὺς +τοῖς +τοῦ +των +τό +τόν +τότε +τὰ +τὰς +τὴν +τὸ +τὸν +τῆς +τῆσ +τῇ +τῶν +τῷ +ωσ +ἀλλ' +ἀλλά +ἀλλὰ +ἀλλ’ +ἀπ +ἀπό +ἀπὸ +ἀφ +ἂν +ἃ +ἄλλος +ἄλλοσ +ἄν +ἄρα +ἅμα +ἐάν +ἐγώ +ἐγὼ +ἐκ +ἐμόσ +ἐμὸς +ἐν +ἐξ +ἐπί +ἐπεὶ +ἐπὶ +ἐστι +ἐφ +ἐὰν +ἑαυτοῦ +ἔτι +ἡ +ἢ +ἣ +ἤ +ἥ +ἧς +ἵνα +ὁ +ὃ +ὃν +ὃς +ὅ +ὅδε +ὅθεν +ὅπερ +ὅς +ὅσ +ὅστις +ὅστισ +ὅτε +ὅτι +ὑμόσ +ὑπ +ὑπέρ +ὑπό +ὑπὲρ +ὑπὸ +ὡς +ὡσ +ὥς +ὥστε +ὦ +ᾧ diff --git a/nltk_data/corpora/stopwords/hebrew b/nltk_data/corpora/stopwords/hebrew new file mode 100644 index 0000000000000000000000000000000000000000..8ac778585f3d9b71c5a6d1e4fc9afb83eb4e691a --- /dev/null +++ b/nltk_data/corpora/stopwords/hebrew @@ -0,0 +1,221 @@ +אני +את +אתה +אנחנו +אתן +אתם +הם +הן +היא +הוא +שלי +שלו +שלך +שלה +שלנו +שלכם +שלכן +שלהם +שלהן +לי +לו +לה +לנו +לכם +לכן +להם +להן +אותה 
+אותו +זה +זאת +אלה +אלו +תחת +מתחת +מעל +בין +עם +עד +נגר +על +אל +מול +של +אצל +כמו +אחר +אותו +בלי +לפני +אחרי +מאחורי +עלי +עליו +עליה +עליך +עלינו +עליכם +לעיכן +עליהם +עליהן +כל +כולם +כולן +כך +ככה +כזה +זה +זות +אותי +אותה +אותם +אותך +אותו +אותן +אותנו +ואת +את +אתכם +אתכן +איתי +איתו +איתך +איתה +איתם +איתן +איתנו +איתכם +איתכן +יהיה +תהיה +היתי +היתה +היה +להיות +עצמי +עצמו +עצמה +עצמם +עצמן +עצמנו +עצמהם +עצמהן +מי +מה +איפה +היכן +במקום שבו +אם +לאן +למקום שבו +מקום בו +איזה +מהיכן +איך +כיצד +באיזו מידה +מתי +בשעה ש +כאשר +כש +למרות +לפני +אחרי +מאיזו סיבה +הסיבה שבגללה +למה +מדוע +לאיזו תכלית +כי +יש +אין +אך +מנין +מאין +מאיפה +יכל +יכלה +יכלו +יכול +יכולה +יכולים +יכולות +יוכלו +יוכל +מסוגל +לא +רק +אולי +אין +לאו +אי +כלל +נגד +אם +עם +אל +אלה +אלו +אף +על +מעל +מתחת +מצד +בשביל +לבין +באמצע +בתוך +דרך +מבעד +באמצעות +למעלה +למטה +מחוץ +מן +לעבר +מכאן +כאן +הנה +הרי +פה +שם +אך +ברם +שוב +אבל +מבלי +בלי +מלבד +רק +בגלל +מכיוון +עד +אשר +ואילו +למרות +אס +כמו +כפי +אז +אחרי +כן +לכן +לפיכך +מאד +עז +מעט +מעטים +במידה +שוב +יותר +מדי +גם +כן +נו +אחר +אחרת +אחרים +אחרות +אשר +או \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/hinglish b/nltk_data/corpora/stopwords/hinglish new file mode 100644 index 0000000000000000000000000000000000000000..de0654e211e01952c3f38c78d370739e7c71d2e2 --- /dev/null +++ b/nltk_data/corpora/stopwords/hinglish @@ -0,0 +1,1036 @@ +a +aadi +aaj +aap +aapne +aata +aati +aaya +aaye +ab +abbe +abbey +abe +abhi +able +about +above +accha +according +accordingly +acha +achcha +across +actually +after +afterwards +again +against +agar +ain +aint +ain't +aisa +aise +aisi +alag +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +andar +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +ap +apan +apart +apna +apnaa +apne +apni +appear +are +aren +arent +aren't +around +arre +as +aside +ask +asking +at +aur +avum +aya +aye +baad +baar 
+bad +bahut +bana +banae +banai +banao +banaya +banaye +banayi +banda +bande +bandi +bane +bani +bas +bata +batao +bc +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +best +better +between +beyond +bhai +bheetar +bhi +bhitar +bht +bilkul +bohot +bol +bola +bole +boli +bolo +bolta +bolte +bolti +both +brief +bro +btw +but +by +came +can +cannot +cant +can't +cause +causes +certain +certainly +chahiye +chaiye +chal +chalega +chhaiye +clearly +c'mon +com +come +comes +could +couldn +couldnt +couldn't +d +de +dede +dega +degi +dekh +dekha +dekhe +dekhi +dekho +denge +dhang +di +did +didn +didnt +didn't +dijiye +diya +diyaa +diye +diyo +do +does +doesn +doesnt +doesn't +doing +done +dono +dont +don't +doosra +doosre +down +downwards +dude +dunga +dungi +during +dusra +dusre +dusri +dvaara +dvara +dwaara +dwara +each +edu +eg +eight +either +ek +else +elsewhere +enough +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +far +few +fifth +fir +first +five +followed +following +follows +for +forth +four +from +further +furthermore +gaya +gaye +gayi +get +gets +getting +ghar +given +gives +go +goes +going +gone +good +got +gotten +greetings +haan +had +hadd +hadn +hadnt +hadn't +hai +hain +hamara +hamare +hamari +hamne +han +happens +har +hardly +has +hasn +hasnt +hasn't +have +haven +havent +haven't +having +he +hello +help +hence +her +here +hereafter +hereby +herein +here's +hereupon +hers +herself +he's +hi +him +himself +his +hither +hm +hmm +ho +hoga +hoge +hogi +hona +honaa +hone +honge +hongi +honi +hopefully +hota +hotaa +hote +hoti +how +howbeit +however +hoyenge +hoyengi +hu +hua +hue +huh +hui +hum +humein +humne +hun +huye +huyi +i +i'd +idk +ie +if +i'll +i'm +imo +in +inasmuch +inc +inhe +inhi +inho +inka +inkaa +inke +inki +inn +inner +inse +insofar +into +inward +is +ise +isi +iska +iskaa +iske +iski +isme +isn +isne +isnt +isn't +iss +isse +issi 
+isski +it +it'd +it'll +itna +itne +itni +itno +its +it's +itself +ityaadi +ityadi +i've +ja +jaa +jab +jabh +jaha +jahaan +jahan +jaisa +jaise +jaisi +jata +jayega +jidhar +jin +jinhe +jinhi +jinho +jinhone +jinka +jinke +jinki +jinn +jis +jise +jiska +jiske +jiski +jisme +jiss +jisse +jitna +jitne +jitni +jo +just +jyaada +jyada +k +ka +kaafi +kab +kabhi +kafi +kaha +kahaa +kahaan +kahan +kahi +kahin +kahte +kaisa +kaise +kaisi +kal +kam +kar +kara +kare +karega +karegi +karen +karenge +kari +karke +karna +karne +karni +karo +karta +karte +karti +karu +karun +karunga +karungi +kaun +kaunsa +kayi +kch +ke +keep +keeps +keh +kehte +kept +khud +ki +kin +kine +kinhe +kinho +kinka +kinke +kinki +kinko +kinn +kino +kis +kise +kisi +kiska +kiske +kiski +kisko +kisliye +kisne +kitna +kitne +kitni +kitno +kiya +kiye +know +known +knows +ko +koi +kon +konsa +koyi +krna +krne +kuch +kuchch +kuchh +kul +kull +kya +kyaa +kyu +kyuki +kyun +kyunki +lagta +lagte +lagti +last +lately +later +le +least +lekar +lekin +less +lest +let +let's +li +like +liked +likely +little +liya +liye +ll +lo +log +logon +lol +look +looking +looks +ltd +lunga +m +maan +maana +maane +maani +maano +magar +mai +main +maine +mainly +mana +mane +mani +mano +many +mat +may +maybe +me +mean +meanwhile +mein +mera +mere +merely +meri +might +mightn +mightnt +mightn't +mil +mjhe +more +moreover +most +mostly +much +mujhe +must +mustn +mustnt +mustn't +my +myself +na +naa +naah +nahi +nahin +nai +name +namely +nd +ne +near +nearly +necessary +neeche +need +needn +neednt +needn't +needs +neither +never +nevertheless +new +next +nhi +nine +no +nobody +non +none +noone +nope +nor +normally +not +nothing +novel +now +nowhere +o +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +par +pata +pe +pehla +pehle +pehli +people +per +perhaps +phla +phle +phli +placed +please +plus +poora +poori 
+provides +pura +puri +q +que +quite +raha +rahaa +rahe +rahi +rakh +rakha +rakhe +rakhen +rakhi +rakho +rather +re +really +reasonably +regarding +regardless +regards +rehte +rha +rhaa +rhe +rhi +ri +right +s +sa +saara +saare +saath +sab +sabhi +sabse +sahi +said +sakta +saktaa +sakte +sakti +same +sang +sara +sath +saw +say +saying +says +se +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +shan +shant +shan't +she +she's +should +shouldn +shouldnt +shouldn't +should've +si +since +six +so +soch +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +still +sub +such +sup +sure +t +tab +tabh +tak +take +taken +tarah +teen +teeno +teesra +teesre +teesri +tell +tends +tera +tere +teri +th +tha +than +thank +thanks +thanx +that +that'll +thats +that's +the +theek +their +theirs +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +theres +there's +thereupon +these +they +they'd +they'll +they're +they've +thi +thik +thing +think +thinking +third +this +tho +thoda +thodi +thorough +thoroughly +those +though +thought +three +through +throughout +thru +thus +tjhe +to +together +toh +too +took +toward +towards +tried +tries +true +truly +try +trying +tu +tujhe +tum +tumhara +tumhare +tumhari +tune +twice +two +um +umm +un +under +unhe +unhi +unho +unhone +unka +unkaa +unke +unki +unko +unless +unlikely +unn +unse +until +unto +up +upar +upon +us +use +used +useful +uses +usi +using +uska +uske +usne +uss +usse +ussi +usually +vaala +vaale +vaali +vahaan +vahan +vahi +vahin +vaisa +vaise +vaisi +vala +vale +vali +various +ve +very +via +viz +vo +waala +waale +waali +wagaira +wagairah +wagerah +waha +wahaan +wahan +wahi +wahin +waisa +waise +waisi +wala +wale +wali +want +wants +was +wasn +wasnt +wasn't +way +we +we'd +well +we'll +went +were +we're +weren +werent +weren't +we've +what +whatever +what's +when +whence 
+whenever +where +whereafter +whereas +whereby +wherein +where's +whereupon +wherever +whether +which +while +who +whoever +whole +whom +who's +whose +why +will +willing +with +within +without +wo +woh +wohi +won +wont +won't +would +wouldn +wouldnt +wouldn't +y +ya +yadi +yah +yaha +yahaan +yahan +yahi +yahin +ye +yeah +yeh +yehi +yes +yet +you +you'd +you'll +your +you're +yours +yourself +yourselves +you've +yup diff --git a/nltk_data/corpora/stopwords/hungarian b/nltk_data/corpora/stopwords/hungarian new file mode 100644 index 0000000000000000000000000000000000000000..94e9f9a0b07a68aeff30605fc445db71f63be630 --- /dev/null +++ b/nltk_data/corpora/stopwords/hungarian @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. 
+ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/nltk_data/corpora/stopwords/indonesian b/nltk_data/corpora/stopwords/indonesian new file mode 100644 index 0000000000000000000000000000000000000000..bf88a458130df5ab852fb1656a8157a00ec355e9 --- /dev/null +++ b/nltk_data/corpora/stopwords/indonesian @@ -0,0 +1,758 @@ +ada +adalah +adanya +adapun +agak +agaknya +agar +akan +akankah +akhir +akhiri +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +antara +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +artinya +asal +asalkan +atas +atau +ataukah +ataupun +awal +awalnya +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagi +bagian +bahkan +bahwa +bahwasanya +baik +bakal +bakalan +balik +banyak +bapak +baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekerja +belakang +belakangan +belum +belumlah +benar +benarkah +benarlah +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berawal +berbagai +berdatangan +beri +berikan +berikut +berikutnya +berjumlah +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermaksud +bermula +bersama 
+bersama-sama +bersiap +bersiap-siap +bertanya +bertanya-tanya +berturut +berturut-turut +bertutur +berujar +berupa +besar +betul +betulkah +biasa +biasanya +bila +bilakah +bisa +bisakah +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +bulan +bung +cara +caranya +cukup +cukupkah +cukuplah +cuma +dahulu +dalam +dan +dapat +dari +daripada +datang +dekat +demi +demikian +demikianlah +dengan +depan +di +dia +diakhiri +diakhirinya +dialah +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +didapat +didatangkan +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirinya +disampaikan +disebut +disebutkan +disebutkannya +disini +disinilah +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +dong +dua +dulu +empat +enggak +enggaknya +entah +entahlah +guna +gunakan +hal +hampir +hanya +hanyalah +hari +harus +haruslah +harusnya +hendak +hendaklah +hendaknya +hingga +ia +ialah +ibarat +ibaratkan +ibaratnya +ibu +ikut +ingat +ingat-ingat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jadi +jadilah +jadinya +jangan +jangankan +janganlah +jauh +jawab +jawaban +jawabnya +jelas +jelaskan +jelaslah +jelasnya +jika +jikalau +juga +jumlah +jumlahnya +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +karena +karenanya +kasus +kata +katakan +katakanlah 
+katanya +ke +keadaan +kebetulan +kecil +kedua +keduanya +keinginan +kelamaan +kelihatan +kelihatannya +kelima +keluar +kembali +kemudian +kemungkinan +kemungkinannya +kenapa +kepada +kepadanya +kesampaian +keseluruhan +keseluruhannya +keterlaluan +ketika +khususnya +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +kok +kurang +lagi +lagian +lah +lain +lainnya +lalu +lama +lamanya +lanjut +lanjutnya +lebih +lewat +lima +luar +macam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +mau +maupun +melainkan +melakukan +melalui +melihat +melihatnya +memang +memastikan +memberi +memberikan +membuat +memerlukan +memihak +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambahkan +menandaskan +menanti +menanti-nanti +menantikan +menanya +menanyai +menanyakan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +mengakhiri +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggunakan +menghendaki +mengibaratkan +mengibaratkannya +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengungkapkan +menjadi +menjawab +menjelaskan +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyampaikan +menyangkut +menyatakan +menyebutkan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merupakan +meski +meskipun +meyakini +meyakinkan +minta +mirip +misal +misalkan +misalnya +mula +mulai +mulailah +mulanya +mungkin +mungkinkah +nah +naik +namun +nanti +nantinya +nyaris +nyatanya +oleh +olehnya +pada +padahal +padanya +pak +paling +panjang +pantas +para +pasti +pastilah +penting +pentingnya +per +percuma +perlu +perlukah +perlunya +pernah +persoalan +pertama +pertama-tama +pertanyaan +pertanyakan +pihak +pihaknya +pukul +pula +pun +punya +rasa +rasanya +rata 
+rupanya +saat +saatnya +saja +sajalah +saling +sama +sama-sama +sambil +sampai +sampai-sampai +sampaikan +sana +sangat +sangatlah +satu +saya +sayalah +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebaik +sebaik-baiknya +sebaiknya +sebaliknya +sebanyak +sebegini +sebegitu +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +seharusnya +sehingga +seingat +sejak +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekarang +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekurang-kurangnya +sekurangnya +sela +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +seluruh +seluruhnya +semacam +semakin +semampu +semampunya +semasa +semasih +semata +semata-mata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sendiri +sendirian +sendirinya +seolah +seolah-olah +seorang +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiba +setibanya +setidak-tidaknya +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +sini +sinilah +soal +soalnya +suatu +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tahu +tahun +tak +tambah +tambahnya +tampak +tampaknya +tandas +tandasnya +tanpa +tanya +tanyakan +tanyanya +tapi +tegas +tegasnya +telah +tempat +tengah +tentang +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbanyak +terdahulu +terdapat +terdiri +terhadap +terhadapnya +teringat +teringat-ingat +terjadi +terjadilah +terjadinya +terkira +terlalu +terlebih +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama 
+tetap +tetapi +tiap +tiba +tiba-tiba +tidak +tidakkah +tidaklah +tiga +tinggi +toh +tunjuk +turut +tutur +tuturnya +ucap +ucapnya +ujar +ujarnya +umum +umumnya +ungkap +ungkapnya +untuk +usah +usai +waduh +wah +wahai +waktu +waktunya +walau +walaupun +wong +yaitu +yakin +yakni +yang \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/italian b/nltk_data/corpora/stopwords/italian new file mode 100644 index 0000000000000000000000000000000000000000..6ee02b51fb1711625c1de28595615cdecba4de7c --- /dev/null +++ b/nltk_data/corpora/stopwords/italian @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno 
+farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/nltk_data/corpora/stopwords/kazakh b/nltk_data/corpora/stopwords/kazakh new file mode 100644 index 0000000000000000000000000000000000000000..ebb9fc1bdc99fa06919ad03c1f5ac6a44695e111 --- /dev/null +++ b/nltk_data/corpora/stopwords/kazakh @@ -0,0 +1,380 @@ +ах +ох +эх +ай +эй +ой +тағы +тағыда +әрине +жоқ +сондай +осындай +осылай +солай +мұндай +бұндай +мен +сен +ол +біз +біздер +олар +сіз +сіздер +маған +оған +саған +біздің +сіздің +оның +бізге +сізге +оларға +біздерге +сіздерге +оларға +менімен +сенімен +онымен +бізбен +сізбен +олармен +біздермен +сіздермен +менің +сенің +біздің +сіздің +оның +біздердің +сіздердің +олардың +маған +саған +оған +менен +сенен +одан +бізден +сізден +олардан +біздерден +сіздерден +олардан +айтпақшы +сонымен +сондықтан +бұл +осы +сол +анау +мынау +сонау +осынау +ана +мына +сона +әні +міне +өй +үйт +бүйт +біреу +кейбіреу +кейбір +қайсыбір +әрбір +бірнеше +бірдеме +бірнеше +әркім +әрне +әрқайсы +әрқалай +әлдекім +әлдене +әлдеқайдан +әлденеше +әлдеқалай +әлдеқашан +алдақашан +еш +ешкім +ешбір +ештеме +дәнеңе +ешқашан +ешқандай +ешқайсы +емес +бәрі +барлық +барша +бар +күллі +бүкіл +түгел +өз +өзім +өзің +өзінің +өзіме +өзіне +өзімнің +өзі +өзге +менде +сенде +онда +менен +сенен онан +одан +ау +па +ей +әй +е +уа +уау +уай +я +пай +ә +о +оһо +ой +ие +аһа +ау +беу +мәссаған +бәрекелді +әттегенай +жаракімалла +масқарай +астапыралла +япырмай +ойпырмай +кәне +кәнеки +ал +әйда +кәні +міне +әні +сорап +қош-қош +пфша +пішә +құрау-құрау 
+шәйт +шек +моһ +тәк +құрау +құр +кә +кәһ +күшім +күшім +мышы +пырс +әукім +алақай +паһ-паһ +бәрекелді +ура +әттең +әттеген-ай +қап +түге +пішту +шіркін +алатау +пай-пай +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +арс + +гүрс + +дүрс + +қорс + +тарс + +тырс + +ырс + +барқ + +борт + +күрт + +кірт + +морт + +сарт + +шырт + +дүңк + +күңк + +қыңқ + +мыңқ + +маңқ + +саңқ + +шаңқ + +шіңк + +сыңқ + +таңқ + +тыңқ + +ыңқ + +болп + +былп + +жалп + +желп + +қолп + +ірк + +ырқ + +сарт-сұрт + +тарс-тұрс + +арс-ұрс + +жалт-жалт + +жалт-жұлт + +қалт-қалт + +қалт-құлт + +қаңқ-қаңқ + +қаңқ-құңқ + +шаңқ-шаңқ + +шаңқ-шұңқ + +арбаң-арбаң + +бүгжең-бүгжең + +арсалаң-арсалаң + +ербелең-ербелең + +батыр-бұтыр + +далаң-далаң + +тарбаң-тарбаң + +қызараң-қызараң + +қаңғыр-күңгір + +қайқаң-құйқаң + +митың-митың + +салаң-сұлаң + +ыржың-тыржың +бірақ +алайда +дегенмен +әйтпесе +әйткенмен +себебі +өйткені +сондықтан +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +арнайы +осындай +ғана +қана +тек +әншейін diff --git a/nltk_data/corpora/stopwords/nepali b/nltk_data/corpora/stopwords/nepali new file mode 100644 index 0000000000000000000000000000000000000000..b2e4d34709f5573aa7a19009dc8732db43f9fd6f --- /dev/null +++ b/nltk_data/corpora/stopwords/nepali @@ -0,0 +1,255 @@ +छ +र +पनि +छन् +लागि +भएको +गरेको +भने +गर्न +गर्ने +हो +तथा +यो +रहेको +उनले +थियो +हुने +गरेका +थिए +गर्दै +तर +नै +को +मा +हुन् +भन्ने +हुन +गरी +त +हुन्छ +अब +के +रहेका +गरेर +छैन +दिए +भए +यस +ले +गर्नु +औं +सो +त्यो +कि +जुन +यी +का +गरि +ती +न +छु +छौं +लाई +नि +उप +अक्सर +आदि +कसरी +क्रमशः +चाले +अगाडी +अझै +अनुसार +अन्तर्गत +अन्य +अन्यत्र +अन्यथा +अरु +अरुलाई +अर्को +अर्थात +अर्थात् +अलग +आए +आजको +ओठ +आत्म +आफू 
+आफूलाई +आफ्नै +आफ्नो +आयो +उदाहरण +उनको +उहालाई +एउटै +एक +एकदम +कतै +कम से कम +कसै +कसैले +कहाँबाट +कहिलेकाहीं +का +किन +किनभने +कुनै +कुरा +कृपया +केही +कोही +गए +गरौं +गर्छ +गर्छु +गर्नुपर्छ +गयौ +गैर +चार +चाहनुहुन्छ +चाहन्छु +चाहिए +छू +जताततै +जब +जबकि +जसको +जसबाट +जसमा +जसलाई +जसले +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाहिर +जे +जो +ठीक +तत्काल +तदनुसार +तपाईको +तपाई +पर्याप्त +पहिले +पहिलो +पहिल्यै +पाँच +पाँचौं +तल +तापनी +तिनी +तिनीहरू +तिनीहरुको +तिनिहरुलाई +तिमी +तिर +तीन +तुरुन्तै +तेस्रो +तेस्कारण +पूर्व +प्रति +प्रतेक +प्लस +फेरी +बने +त्सपछि +त्सैले +त्यहाँ +थिएन +दिनुभएको +दिनुहुन्छ +दुई +देखि +बरु +बारे +बाहिर +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोस्रो +धेरै +नजिकै +नत्र +नयाँ +निम्ति +बाहेक +बीच +बीचमा +भन +निम्न +निम्नानुसार +निर्दिष्ट +नौ +पक्का +पक्कै +पछि +पछिल्लो +पटक +पर्छ +पर्थ्यो +भन्छन् +भन् +भन्छु +भन्दा +भन्नुभयो +भर +भित्र +भित्री +म +मलाई +मात्र +माथि +मुख्य +मेरो +यति +यथोचित +यदि +यद्यपि +यसको +यसपछि +यसबाहेक +यसरी +यसो +यस्तो +यहाँ +यहाँसम्म +या +रही +राखे +राख्छ +राम्रो +रूप +लगभग +वरीपरी +वास्तवमा +बिरुद्ध +बिशेष +सायद +शायद +संग +संगै +सक्छ +सट्टा +सधै +सबै +सबैलाई +समय +सम्भव +सम्म +सही +साँच्चै +सात +साथ +साथै +सारा +सोही +स्पष्ट +हरे +हरेक \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/norwegian b/nltk_data/corpora/stopwords/norwegian new file mode 100644 index 0000000000000000000000000000000000000000..9ac1abbb6cba1fb3159caa6d88b25543b9080031 --- /dev/null +++ b/nltk_data/corpora/stopwords/norwegian @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din 
+ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/nltk_data/corpora/stopwords/portuguese b/nltk_data/corpora/stopwords/portuguese new file mode 100644 index 0000000000000000000000000000000000000000..eb53a8fc2544b16d2a849d98e26c6e34bf07bdb0 --- /dev/null +++ b/nltk_data/corpora/stopwords/portuguese @@ -0,0 +1,207 @@ +a +à +ao +aos +aquela +aquelas +aquele +aqueles +aquilo +as +às +até +com +como +da +das +de +dela +delas +dele +deles +depois +do +dos +e +é +ela +elas +ele +eles +em +entre +era +eram +éramos +essa +essas +esse +esses +esta +está +estamos +estão +estar +estas +estava +estavam +estávamos +este +esteja +estejam +estejamos +estes +esteve +estive +estivemos +estiver +estivera +estiveram +estivéramos +estiverem +estivermos +estivesse +estivessem +estivéssemos +estou +eu +foi +fomos +for +fora +foram +fôramos +forem +formos +fosse +fossem +fôssemos +fui +há +haja +hajam +hajamos +hão +havemos +haver +hei +houve +houvemos +houver +houvera +houverá +houveram +houvéramos +houverão +houverei +houverem +houveremos +houveria +houveriam +houveríamos +houvermos +houvesse +houvessem +houvéssemos +isso +isto +já +lhe +lhes +mais +mas +me +mesmo +meu +meus +minha +minhas +muito +na +não +nas +nem +no +nos +nós +nossa +nossas +nosso +nossos +num +numa +o +os +ou +para +pela +pelas +pelo +pelos +por +qual +quando +que +quem +são +se +seja +sejam +sejamos +sem +ser +será +serão +serei +seremos +seria +seriam +seríamos +seu +seus +só +somos +sou +sua +suas +também +te +tem +tém +temos +tenha +tenham 
+tenhamos +tenho +terá +terão +terei +teremos +teria +teriam +teríamos +teu +teus +teve +tinha +tinham +tínhamos +tive +tivemos +tiver +tivera +tiveram +tivéramos +tiverem +tivermos +tivesse +tivessem +tivéssemos +tu +tua +tuas +um +uma +você +vocês +vos diff --git a/nltk_data/corpora/stopwords/romanian b/nltk_data/corpora/stopwords/romanian new file mode 100644 index 0000000000000000000000000000000000000000..45651c90ec7ea5b26c51ea9a11fddafb00652190 --- /dev/null +++ b/nltk_data/corpora/stopwords/romanian @@ -0,0 +1,356 @@ +a +abia +acea +aceasta +această +aceea +aceeasi +acei +aceia +acel +acela +acelasi +acele +acelea +acest +acesta +aceste +acestea +acestei +acestia +acestui +aceşti +aceştia +adica +ai +aia +aibă +aici +al +ala +ale +alea +alt +alta +altceva +altcineva +alte +altfel +alti +altii +altul +am +anume +apoi +ar +are +as +asa +asta +astea +astfel +asupra +atare +atat +atata +atatea +atatia +ati +atit +atita +atitea +atitia +atunci +au +avea +avem +aveţi +avut +aş +aţi +ba +ca +cam +cand +care +careia +carora +caruia +cat +catre +ce +cea +ceea +cei +ceilalti +cel +cele +celor +ceva +chiar +ci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +cu +cui +cum +cumva +cât +câte +câtva +câţi +cînd +cît +cîte +cîtva +cîţi +că +căci +cărei +căror +cărui +către +da +daca +dacă +dar +dat +dată +dau +de +deasupra +deci +decit +deja +desi +despre +deşi +din +dintr +dintr- +dintre +doar +doi +doilea +două +drept +dupa +după +dă +e +ea +ei +el +ele +era +eram +este +eu +eşti +face +fara +fata +fel +fi +fie +fiecare +fii +fim +fiu +fiţi +foarte +fost +fără +i +ia +iar +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +la +le +li +lor +lui +lângă +lîngă +m +ma +mai +mea +mei +mele +mereu +meu +mi +mie +mine +mod +mult +multa +multe +multi +multă +mulţi +mâine +mîine +mă +ne +ni +nici +nimeni +nimic +niste +nişte +noastre +noastră +noi +nostri +nostru +nou +noua +nouă +noştri +nu +numai +o +or +ori +oricare +orice +oricine +oricum 
+oricând +oricât +oricînd +oricît +oriunde +pai +parca +patra +patru +pe +pentru +peste +pic +pina +poate +pot +prea +prima +primul +prin +printr- +putini +puţin +puţina +puţină +până +pînă +sa +sa-mi +sa-ti +sai +sale +sau +se +si +sint +sintem +spate +spre +sub +sunt +suntem +sunteţi +sus +să +săi +său +t +ta +tale +te +ti +tine +toata +toate +toată +tocmai +tot +toti +totul +totusi +totuşi +toţi +trei +treia +treilea +tu +tuturor +tăi +tău +u +ul +ului +un +una +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unui +unuia +unul +v +va +vi +voastre +voastră +voi +vom +vor +vostru +vouă +voştri +vreo +vreun +vă +zi +zice +îi +îl +îmi +în +îţi +ăla +ălea +ăsta +ăstea +ăştia +şi +ţi +ţie \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/russian b/nltk_data/corpora/stopwords/russian new file mode 100644 index 0000000000000000000000000000000000000000..ecb83d4a7f393b67b054b44e337d683b2ff2d973 --- /dev/null +++ b/nltk_data/corpora/stopwords/russian @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/nltk_data/corpora/stopwords/slovene b/nltk_data/corpora/stopwords/slovene new file mode 100644 index 
0000000000000000000000000000000000000000..eb4d1bcf28f31cb48ffa1e1d7b8fddc60f0fbd32 --- /dev/null +++ b/nltk_data/corpora/stopwords/slovene @@ -0,0 +1,1784 @@ +ali +ampak +bodisi +in +kajti +marveč +namreč +ne +niti +oziroma +pa +saj +sicer +temveč +ter +toda +torej +vendar +vendarle +zakaj +če +čeprav +čeravno +četudi +čim +da +kadar +kakor +ker +ki +ko +kot +naj +najsi +odkar +preden +dve +dvema +dveh +šest +šestdeset +šestindvajset +šestintrideset +šestnajst +šeststo +štiri +štirideset +štiriindvajset +štirinajst +štiristo +deset +devet +devetdeset +devetintrideset +devetnajst +devetsto +dvainšestdeset +dvaindvajset +dvajset +dvanajst +dvesto +enaindvajset +enaintrideset +enajst +nič +osem +osemdeset +oseminštirideset +osemindevetdeset +osemnajst +pet +petdeset +petinštirideset +petindevetdeset +petindvajset +petinosemdeset +petinpetdeset +petinsedemdeset +petintrideset +petnajst +petsto +sedem +sedemdeset +sedeminšestdeset +sedemindvajset +sedeminpetdeset +sedemnajst +sedemsto +sto +tisoč +tri +trideset +triinšestdeset +triindvajset +triinpetdeset +trinajst +tristo +šestdesetim +šestim +šestindvajsetim +šestintridesetim +šestnajstim +šeststotim +štiridesetim +štiriindvajsetim +štirim +štirinajstim +štiristotim +desetim +devetdesetim +devetim +devetintridesetim +devetnajstim +devetstotim +dvainšestdesetim +dvaindvajsetim +dvajsetim +dvanajstim +dvestotim +enaindvajsetim +enaintridesetim +enajstim +osemdesetim +oseminštiridesetim +osemindevetdesetim +osemnajstim +osmim +petdesetim +petim +petinštiridesetim +petindevetdesetim +petindvajsetim +petinosemdesetim +petinpetdesetim +petinsedemdesetim +petintridesetim +petnajstim +petstotim +sedemdesetim +sedeminšestdesetim +sedemindvajsetim +sedeminpetdesetim +sedemnajstim +sedemstotim +sedmim +stotim +tisočim +trem +tridesetim +triinšestdesetim +triindvajsetim +triinpetdesetim +trinajstim +tristotim +šestdesetih +šestih +šestindvajsetih +šestintridesetih +šestnajstih +šeststotih +štiridesetih +štirih +štiriindvajsetih 
+štirinajstih +štiristotih +desetih +devetdesetih +devetih +devetintridesetih +devetnajstih +devetstotih +dvainšestdesetih +dvaindvajsetih +dvajsetih +dvanajstih +dvestotih +enaindvajsetih +enaintridesetih +enajstih +osemdesetih +oseminštiridesetih +osemindevetdesetih +osemnajstih +osmih +petdesetih +petih +petinštiridesetih +petindevetdesetih +petindvajsetih +petinosemdesetih +petinpetdesetih +petinsedemdesetih +petintridesetih +petnajstih +petstotih +sedemdesetih +sedeminšestdesetih +sedemindvajsetih +sedeminpetdesetih +sedemnajstih +sedemstotih +sedmih +stotih +tisočih +treh +tridesetih +triinšestdesetih +triindvajsetih +triinpetdesetih +trinajstih +tristotih +šestdesetimi +šestimi +šestindvajsetimi +šestintridesetimi +šestnajstimi +šeststotimi +štiridesetimi +štiriindvajsetimi +štirimi +štirinajstimi +štiristotimi +desetimi +devetdesetimi +devetimi +devetintridesetimi +devetnajstimi +devetstotimi +dvainšestdesetimi +dvaindvajsetimi +dvajsetimi +dvanajstimi +dvestotimi +enaindvajsetimi +enaintridesetimi +enajstimi +osemdesetimi +oseminštiridesetimi +osemindevetdesetimi +osemnajstimi +osmimi +petdesetimi +petimi +petinštiridesetimi +petindevetdesetimi +petindvajsetimi +petinosemdesetimi +petinpetdesetimi +petinsedemdesetimi +petintridesetimi +petnajstimi +petstotimi +sedemdesetimi +sedeminšestdesetimi +sedemindvajsetimi +sedeminpetdesetimi +sedemnajstimi +sedemstotimi +sedmimi +stotimi +tisočimi +tremi +tridesetimi +triinšestdesetimi +triindvajsetimi +triinpetdesetimi +trinajstimi +tristotimi +eno +eni +ene +ena +dva +štirje +trije +en +enega +enemu +enim +enem +eden +dvojni +trojni +dvojnima +trojnima +dvojnih +trojnih +dvojne +trojne +dvojnim +trojnim +dvojnimi +trojnimi +dvojno +trojno +dvojna +trojna +dvojnega +trojnega +dvojen +trojen +dvojnemu +trojnemu +dvojnem +trojnem +četrti +šestdeseti +šesti +šestnajsti +štirideseti +štiriindvajseti +štirinajsti +deseti +devetdeseti +deveti +devetnajsti +drugi +dvaindevetdeseti +dvajseti +dvanajsti +dvestoti 
+enaindvajseti +enajsti +osemdeseti +osemnajsti +osmi +petdeseti +peti +petinštirideseti +petindvajseti +petinosemdeseti +petintrideseti +petnajsti +prvi +sedemdeseti +sedemindvajseti +sedemnajsti +sedmi +stoti +tisoči +tretji +trideseti +triindvajseti +triintrideseti +trinajsti +tristoti +četrtima +šestdesetima +šestima +šestnajstima +štiridesetima +štiriindvajsetima +štirinajstima +desetima +devetdesetima +devetima +devetnajstima +drugima +dvaindevetdesetima +dvajsetima +dvanajstima +dvestotima +enaindvajsetima +enajstima +osemdesetima +osemnajstima +osmima +petdesetima +petima +petinštiridesetima +petindvajsetima +petinosemdesetima +petintridesetima +petnajstima +prvima +sedemdesetima +sedemindvajsetima +sedemnajstima +sedmima +stotima +tisočima +tretjima +tridesetima +triindvajsetima +triintridesetima +trinajstima +tristotima +četrtih +drugih +dvaindevetdesetih +prvih +tretjih +triintridesetih +četrte +šestdesete +šeste +šestnajste +štiridesete +štiriindvajsete +štirinajste +desete +devetdesete +devete +devetnajste +druge +dvaindevetdesete +dvajsete +dvanajste +dvestote +enaindvajsete +enajste +osemdesete +osemnajste +osme +petdesete +pete +petinštiridesete +petindvajsete +petinosemdesete +petintridesete +petnajste +prve +sedemdesete +sedemindvajsete +sedemnajste +sedme +stote +tisoče +tretje +tridesete +triindvajsete +triintridesete +trinajste +tristote +četrtim +drugim +dvaindevetdesetim +prvim +tretjim +triintridesetim +četrtimi +drugimi +dvaindevetdesetimi +prvimi +tretjimi +triintridesetimi +četrto +šestdeseto +šestnajsto +šesto +štirideseto +štiriindvajseto +štirinajsto +deseto +devetdeseto +devetnajsto +deveto +drugo +dvaindevetdeseto +dvajseto +dvanajsto +dvestoto +enaindvajseto +enajsto +osemdeseto +osemnajsto +osmo +petdeseto +petinštirideseto +petindvajseto +petinosemdeseto +petintrideseto +petnajsto +peto +prvo +sedemdeseto +sedemindvajseto +sedemnajsto +sedmo +stoto +tisočo +tretjo +trideseto +triindvajseto +triintrideseto +trinajsto +tristoto 
+četrta +šesta +šestdeseta +šestnajsta +štirideseta +štiriindvajseta +štirinajsta +deseta +deveta +devetdeseta +devetnajsta +druga +dvaindevetdeseta +dvajseta +dvanajsta +dvestota +enaindvajseta +enajsta +osemdeseta +osemnajsta +osma +peta +petdeseta +petinštirideseta +petindvajseta +petinosemdeseta +petintrideseta +petnajsta +prva +sedemdeseta +sedemindvajseta +sedemnajsta +sedma +stota +tisoča +tretja +trideseta +triindvajseta +triintrideseta +trinajsta +tristota +četrtega +šestdesetega +šestega +šestnajstega +štiridesetega +štiriindvajsetega +štirinajstega +desetega +devetdesetega +devetega +devetnajstega +drugega +dvaindevetdesetega +dvajsetega +dvanajstega +dvestotega +enaindvajsetega +enajstega +osemdesetega +osemnajstega +osmega +petdesetega +petega +petinštiridesetega +petindvajsetega +petinosemdesetega +petintridesetega +petnajstega +prvega +sedemdesetega +sedemindvajsetega +sedemnajstega +sedmega +stotega +tisočega +tretjega +tridesetega +triindvajsetega +triintridesetega +trinajstega +tristotega +četrtemu +šestdesetemu +šestemu +šestnajstemu +štiridesetemu +štiriindvajsetemu +štirinajstemu +desetemu +devetdesetemu +devetemu +devetnajstemu +drugemu +dvaindevetdesetemu +dvajsetemu +dvanajstemu +dvestotemu +enaindvajsetemu +enajstemu +osemdesetemu +osemnajstemu +osmemu +petdesetemu +petemu +petinštiridesetemu +petindvajsetemu +petinosemdesetemu +petintridesetemu +petnajstemu +prvemu +sedemdesetemu +sedemindvajsetemu +sedemnajstemu +sedmemu +stotemu +tisočemu +tretjemu +tridesetemu +triindvajsetemu +triintridesetemu +trinajstemu +tristotemu +četrtem +šestdesetem +šestem +šestnajstem +štiridesetem +štiriindvajsetem +štirinajstem +desetem +devetdesetem +devetem +devetnajstem +drugem +dvaindevetdesetem +dvajsetem +dvanajstem +dvestotem +enaindvajsetem +enajstem +osemdesetem +osemnajstem +osmem +petdesetem +petem +petinštiridesetem +petindvajsetem +petinosemdesetem +petintridesetem +petnajstem +prvem +sedemdesetem +sedemindvajsetem +sedemnajstem +sedmem +stotem 
+tisočem +tretjem +tridesetem +triindvajsetem +triintridesetem +trinajstem +tristotem +deseteri +dvakratni +dvoji +enkratni +peteri +stoteri +tisočeri +trikratni +troji +deseterima +dvakratnima +dvojima +enkratnima +peterima +stoterima +tisočerima +trikratnima +trojima +deseterih +dvakratnih +dvojih +enkratnih +peterih +stoterih +tisočerih +trikratnih +trojih +desetere +dvakratne +dvoje +enkratne +petere +stotere +tisočere +trikratne +troje +deseterim +dvakratnim +dvojim +enkratnim +peterim +stoterim +tisočerim +trikratnim +trojim +deseterimi +dvakratnimi +dvojimi +enkratnimi +peterimi +stoterimi +tisočerimi +trikratnimi +trojimi +desetero +dvakratno +dvojo +enkratno +petero +stotero +tisočero +trikratno +trojo +desetera +dvakratna +dvoja +enkratna +petera +stotera +tisočera +trikratna +troja +deseterega +dvakratnega +dvojega +enkratnega +peterega +stoterega +tisočerega +trikratnega +trojega +deseter +dvakraten +dvoj +enkraten +peter +stoter +tisočer +trikraten +troj +deseteremu +dvakratnemu +dvojemu +enkratnemu +peteremu +stoteremu +tisočeremu +trikratnemu +trojemu +deseterem +dvakratnem +dvojem +enkratnem +peterem +stoterem +tisočerem +trikratnem +trojem +le-onega +le-tega +le-tistega +le-toliko +onega +tega +tistega +toliko +le-oni +le-takšni +le-taki +le-te +le-ti +le-tisti +oni +takšni +taki +te +ti +tisti +le-onima +le-takšnima +le-takima +le-tema +le-tistima +onima +takšnima +takima +tema +tistima +le-onih +le-takšnih +le-takih +le-teh +le-tistih +onih +takšnih +takih +teh +tistih +le-one +le-takšne +le-take +le-tiste +one +takšne +take +tiste +le-onim +le-takšnim +le-takim +le-tem +le-tistim +onim +takšnim +takim +tem +tistim +le-onimi +le-takšnimi +le-takimi +le-temi +le-tistimi +onimi +takšnimi +takimi +temi +tistimi +le-ono +le-takšno +le-tako +le-tisto +le-to +ono +takšno +tako +tisto +to +le-tej +tej +le-ona +le-ta +le-takšna +le-taka +le-tista +ona +ta +takšna +taka +tista +le-tak +le-takšen +tak +takšen +le-takšnega +le-takega +takšnega +takega 
+le-onemu +le-takšnemu +le-takemu +le-temu +le-tistemu +onemu +takšnemu +takemu +temu +temuintemu +tistemu +le-onem +le-takšnem +le-takem +le-tistem +onem +takšnem +takem +tistem +vsakogar +vsakomur +vsakomer +vsakdo +obe +vsaki +vsakršni +vsi +obema +vsakima +vsakršnima +vsema +obeh +vsakih +vsakršnih +vseh +vsake +vsakršne +vse +vsakim +vsakršnim +vsem +vsakimi +vsakršnimi +vsemi +vsako +vsakršno +vso +vsej +vsa +vsaka +vsakršna +oba +ves +vsak +vsakršen +vsakega +vsakršnega +vsega +vsakemu +vsakršnemu +vsemu +vsakem +vsakršnem +enako +istega +koliko +mnogo +nekoga +nekoliko +precej +kaj +koga +marsikaj +marsikoga +nekaj +čemu +komu +marsičemu +marsikomu +nečemu +nekomu +česa +marsičesa +nečesa +kom +marsičim +marsikom +nečim +nekom +čem +marsičem +nečem +kdo +marsikdo +nekdo +čigavi +drugačni +enaki +isti +kakšni +kaki +kakršnikoli +kateri +katerikoli +kolikšni +koliki +marsikateri +nekakšni +nekaki +nekateri +neki +takile +tele +tile +tolikšni +toliki +čigavima +drugačnima +enakima +enima +istima +kakšnima +kakima +kakršnimakoli +katerima +katerimakoli +kolikšnima +kolikima +marsikaterima +nekakšnima +nekakima +nekaterima +nekima +takimale +temale +tolikšnima +tolikima +čigavih +drugačnih +enakih +enih +istih +kakšnih +kakih +kakršnihkoli +katerih +katerihkoli +kolikšnih +kolikih +marsikaterih +nekakšnih +nekakih +nekaterih +nekih +takihle +tehle +tolikšnih +tolikih +čigave +drugačne +enake +iste +kakšne +kake +kakršnekoli +katere +katerekoli +kolikšne +kolike +marsikatere +nekakšne +nekake +nekatere +neke +takele +tolikšne +tolike +čigavim +drugačnim +enakim +istim +kakšnim +kakim +kakršnimkoli +katerim +katerimkoli +kolikšnim +kolikim +marsikaterim +nekakšnim +nekakim +nekaterim +nekim +takimle +temle +tolikšnim +tolikim +čigavimi +drugačnimi +enakimi +enimi +istimi +kakšnimi +kakimi +kakršnimikoli +katerimi +katerimikoli +kolikšnimi +kolikimi +marsikaterimi +nekakšnimi +nekakimi +nekaterimi +nekimi +takimile +temile +tolikšnimi +tolikimi +čigavo +drugačno 
+isto +kakšno +kako +kakršnokoli +katero +katerokoli +kolikšno +marsikatero +nekakšno +nekako +nekatero +neko +takole +tole +tolikšno +tejle +čigava +drugačna +enaka +ista +kakšna +kaka +kakršnakoli +katera +katerakoli +kolikšna +kolika +marsikatera +neka +nekakšna +nekaka +nekatera +takale +tale +tolikšna +tolika +čigav +drug +drugačen +enak +kak +kakšen +kakršenkoli +kakršnegakoli +kateregakoli +kolik +kolikšen +nek +nekak +nekakšen +takegale +takle +tegale +tolik +tolikšen +čigavega +drugačnega +enakega +kakšnega +kakega +katerega +kolikšnega +kolikega +marsikaterega +nekakšnega +nekakega +nekaterega +nekega +tolikšnega +tolikega +čigavemu +drugačnemu +enakemu +istemu +kakšnemu +kakemu +kakršnemukoli +kateremu +kateremukoli +kolikšnemu +kolikemu +marsikateremu +nekakšnemu +nekakemu +nekateremu +nekemu +takemule +temule +tolikšnemu +tolikemu +čigavem +drugačnem +enakem +istem +kakšnem +kakem +kakršnemkoli +katerem +kateremkoli +kolikšnem +kolikem +marsikaterem +nekakšnem +nekakem +nekaterem +nekem +takemle +tolikšnem +tolikem +naju +nama +midva +nas +nam +nami +mi +mene +me +meni +mano +menoj +jaz +vaju +vama +vidva +vas +vam +vami +vi +tebe +tebi +tabo +teboj +njiju +jih +ju +njima +jima +onedve +onidve +nje +njih +njim +jim +njimi +njo +jo +njej +nji +ji +je +onadva +njega +ga +njemu +mu +njem +on +čigar +kolikor +kar +karkoli +kogar +kogarkoli +čemur +čemurkoli +komur +komurkoli +česar +česarkoli +čimer +čimerkoli +komer +komerkoli +čemer +čemerkoli +kdor +kdorkoli +kakršni +kakršnima +kakršnih +kakršne +kakršnim +kakršnimi +kakršno +kakršna +kakršen +kakršnega +kakršnemu +kakršnem +najini +naši +moji +najinima +našima +mojima +najinih +naših +mojih +najine +naše +moje +najinim +našim +mojim +najinimi +našimi +mojimi +najino +našo +mojo +najina +naša +moja +najin +najinega +naš +našega +moj +mojega +najinemu +našemu +mojemu +najinem +našem +mojem +vajini +vaši +tvoji +vajinima +vašima +tvojima +vajinih +vaših +tvojih +vajine +vaše +tvoje +vajinim +vašim 
+tvojim +vajinimi +vašimi +tvojimi +vajino +vašo +tvojo +vajina +vaša +tvoja +vajin +vajinega +vaš +vašega +tvoj +tvojega +vajinemu +vašemu +tvojemu +vajinem +vašem +tvojem +njuni +njihovi +njeni +njegovi +njunima +njihovima +njenima +njegovima +njunih +njihovih +njenih +njegovih +njune +njihove +njene +njegove +njunim +njihovim +njenim +njegovim +njunimi +njihovimi +njenimi +njegovimi +njuno +njihovo +njeno +njegovo +njuna +njihova +njena +njegova +njun +njunega +njihov +njihovega +njen +njenega +njegov +njegovega +njunemu +njihovemu +njenemu +njegovemu +njunem +njihovem +njenem +njegovem +se +si +sebe +sebi +sabo +seboj +svoji +svojima +svojih +svoje +svojim +svojimi +svojo +svoja +svoj +svojega +svojemu +svojem +nikogar +noben +ničemur +nikomur +ničesar +ničimer +nikomer +ničemer +nihče +nikakršni +nobeni +nikakršnima +nobenima +nikakršnih +nobenih +nikakršne +nobene +nikakršnim +nobenim +nikakršnimi +nobenimi +nikakršno +nobeno +nikakršna +nobena +nikakršen +nikakršnega +nobenega +nikakršnemu +nobenemu +nikakršnem +nobenem +še +šele +žal +že +baje +bojda +bržčas +bržkone +celo +dobesedno +domala +edinole +gotovo +itak +ja +kajne +kajpada +kajpak +koli +komaj +le +malone +mar +menda +morda +morebiti +nadvse +najbrž +nemara +nerad +neradi +nikar +pač +pogodu +prav +pravzaprav +predvsem +preprosto +rad +rada +rade +radi +ravno +res +resda +samo +seveda +skoraj +skorajda +spet +sploh +tudi +všeč +verjetno +vnovič +vred +vsaj +zadosti +zapored +zares +zgolj +zlasti +zopet +čezenj +čeznje +mednje +mednju +medse +nadenj +nadme +nadnje +name +nanj +nanje +nanjo +nanju +nase +nate +obenj +podnjo +pome +ponj +ponje +ponjo +pote +predenj +predme +prednje +predse +skozenj +skoznje +skoznjo +skozte +vame +vanj +vanje +vanjo +vanju +vase +vate +zame +zanj +zanje +zanjo +zanju +zase +zate +čez +med +na +nad +ob +po +pod +pred +raz +skoz +skozi +v +za +zoper +h +k +kljub +nasproti +navkljub +navzlic +proti +ž +blizu +brez +dno +do +iz +izmed +iznad +izpod +izpred +izven +izza 
+krog +mimo +namesto +naokoli +naproti +od +okoli +okrog +onkraj +onstran +poleg +povrh +povrhu +prek +preko +razen +s +spod +spričo +sredi +vštric +vpričo +vrh +vrhu +vzdolž +z +zaradi +zavoljo +zraven +zunaj +o +pri +bi +bova +bomo +bom +bosta +boste +boš +bodo +bojo +bo +sva +nisva +smo +nismo +sem +nisem +sta +nista +ste +niste +nisi +so +niso +ni +bodiva +bodimo +bodita +bodite +bodi +biti +bili +bila +bile +bil +bilo +želiva +dovoliva +hočeva +marava +morava +moreva +smeva +zmoreva +nočeva +želimo +dovolimo +hočemo +maramo +moramo +moremo +smemo +zmoremo +nočemo +želim +dovolim +hočem +maram +moram +morem +smem +zmorem +nočem +želita +dovolita +hočeta +marata +morata +moreta +smeta +zmoreta +nočeta +želite +dovolite +hočete +marate +morate +morete +smete +zmorete +nočete +želiš +dovoliš +hočeš +maraš +moraš +moreš +smeš +zmoreš +nočeš +želijo +dovolijo +hočejo +marajo +morajo +morejo +smejo +zmorejo +nočejo +želi +dovoli +hoče +mara +mora +more +sme +zmore +noče +hotiva +marajva +hotimo +marajmo +hotita +marajta +hotite +marajte +hoti +maraj +želeti +dovoliti +hoteti +marati +moči +morati +smeti +zmoči +želeni +dovoljeni +želena +dovoljena +želene +dovoljene +želen +dovoljen +želeno +dovoljeno +želeli +dovolili +hoteli +marali +mogli +morali +smeli +zmogli +želela +dovolila +hotela +marala +mogla +morala +smela +zmogla +želele +dovolile +hotele +marale +mogle +morale +smele +zmogle +želel +dovolil +hotel +maral +mogel +moral +smel +zmogel +želelo +dovolilo +hotelo +maralo +moglo +moralo +smelo +zmoglo diff --git a/nltk_data/corpora/stopwords/spanish b/nltk_data/corpora/stopwords/spanish new file mode 100644 index 0000000000000000000000000000000000000000..6a7d50cc633609c8bc2da53c3dac9a015c248b62 --- /dev/null +++ b/nltk_data/corpora/stopwords/spanish @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta
+hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosotros +vosotras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais 
+tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/nltk_data/corpora/stopwords/swedish b/nltk_data/corpora/stopwords/swedish new file mode 100644 index 0000000000000000000000000000000000000000..742bb6263b99fb2ea6e1e3624a65eb2799a1e24c --- /dev/null +++ b/nltk_data/corpora/stopwords/swedish @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/nltk_data/corpora/stopwords/tajik b/nltk_data/corpora/stopwords/tajik new file mode 100644 index 0000000000000000000000000000000000000000..898614a0210a2f4cd74deed295978c04a19bbf37 --- /dev/null +++ b/nltk_data/corpora/stopwords/tajik @@ -0,0 +1,163 @@ +аз +дар +ба +бо +барои +бе +то +ҷуз +пеши +назди +рӯйи +болои +паси +ғайри +ҳамон +ҳамоно +инҷониб +замон +замоно +эътиборан +пеш +қабл +дида +сар карда +агар +агар ки +валекин +ки +лекин +аммо +вале +балки +ва +ҳарчанд +чунки +зеро +зеро ки +вақте ки +то вақте ки +барои он ки +бо нияти он ки +лекин ва ҳол он ки +ё +ё ин ки +бе он ки +дар ҳолате ки +то даме ки +баъд аз он ки +даме ки +ба тразе ки +аз баҳри он ки +гар +ар +ба шарте +азбаски +модоме ки +агар чи +гарчанде ки +бо вуҷуди он ки +гӯё +аз-баски +чун-ки +агар-чанд +агар-чи +гар-чи +то ки +чунон ки +то даме ки +ҳар қадар ки +магар +оё 
+наход +ҳатто +ҳам +бале +оре +хуб +хуш +хайр +не +на +мана +э +фақат +танҳо +кошки +мабодо +эҳтимол +ана ҳамин +наход ки +ҳатто ки +аз афташ +майлаш куя +ана +ҳа +канӣ +гӯё ки +ҳо ана +на ин ки +ваҳ +ҳой +и +а +о +эҳ +ҳе +ҳу +аҳа +оҳе +уҳа +ҳм +нм +оббо +ӯббо +ҳой-ҳой +вой-вой +ту-ту +ҳмм +эҳа +тавба +ӯҳӯ +аҷабо +ало +аё +ой +ӯим +ором +хомӯш +ҳай-ҳай +бай-бай +аз +он +баъд +азбаски +ӯ +ҳангоми +чӣ +кадом +ин +ҷо +ҳам +ё ки +бояд +аст +чанд +ҳар +бар +чаро ки +агар +то кӣ +бинобар +бинобар ин +ҳаргиз +асло +нахот +нахот ки +кошкӣ +шояд +шояд ки +охир +аз рӯи +аз рӯйи +рӯ \ No newline at end of file diff --git a/nltk_data/corpora/stopwords/turkish b/nltk_data/corpora/stopwords/turkish new file mode 100644 index 0000000000000000000000000000000000000000..5a48ccce0737bb38cb99cafa1f8d3d0ce34e7488 --- /dev/null +++ b/nltk_data/corpora/stopwords/turkish @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani diff --git a/nltk_data/corpora/wordnet.zip b/nltk_data/corpora/wordnet.zip new file mode 100644 index 0000000000000000000000000000000000000000..777df8872a4b420dc0324d7656f5b04270c54110 --- /dev/null +++ b/nltk_data/corpora/wordnet.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59 +size 10775600 diff --git a/nltk_data/tokenizers/punkt.zip b/nltk_data/tokenizers/punkt.zip new file mode 100644 index 0000000000000000000000000000000000000000..37a8056e82eb03e73fc0a7419132d2156a27048c --- /dev/null +++ b/nltk_data/tokenizers/punkt.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec +size 13905355 diff --git a/nltk_data/tokenizers/punkt/.DS_Store
b/nltk_data/tokenizers/punkt/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3d15671132023443e5cabd15df83766b8fb71f38 Binary files /dev/null and b/nltk_data/tokenizers/punkt/.DS_Store differ diff --git a/nltk_data/tokenizers/punkt/PY3/README b/nltk_data/tokenizers/punkt/PY3/README new file mode 100644 index 0000000000000000000000000000000000000000..49a637cb19b758348071ed19edb29d33ab435b33 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/README @@ -0,0 +1,98 @@ +Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected) + +Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have +been contributed by various people using NLTK for sentence boundary detection. + +For information about how to use these models, please confer the tokenization HOWTO: +http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html +and chapter 3.8 of the NLTK book: +http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation + +There are pretrained tokenizers for the following languages: + +File Language Source Contents Size of training corpus(in tokens) Model contributed by +======================================================================================================================================================================= +czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss + Literarni Noviny +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 
1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss + (Berlingske Avisdata, Copenhagen) Weekend Avisen +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss + (American) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss + Text Bank (Suomen Kielen newspapers + Tekstipankki) + Finnish Center for IT Science + (CSC) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss + (European) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss + (Switzerland) CD-ROM + (Uses "ss" + instead of "ß") 
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss + (Bokmål and Information Technologies, + Nynorsk) Bergen +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +polish.pickle Polish Polish National Corpus Literature, newspapers, etc. 
~1,000,000 Krzysztof Langner + (http://www.nkjp.pl/) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss + (Brazilian) (Linguateca) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss + Slovene Academy for Arts + and Sciences +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss + (European) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss + (and some other texts) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss + (Türkçe Derlem Projesi) + University of Ankara +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to +Unicode using the codecs module. + +Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. +Computational Linguistics 32: 485-525. 
+ +---- Training Code ---- + +# import punkt +import nltk.tokenize.punkt + +# Make a new Tokenizer +tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() + +# Read in training corpus (one example: Slovene) +import codecs +text = codecs.open("slovene.plain","r","iso-8859-2").read() + +# Train tokenizer +tokenizer.train(text) + +# Dump pickled tokenizer +import pickle +out = open("slovene.pickle","wb") +pickle.dump(tokenizer, out) +out.close() + +--------- diff --git a/nltk_data/tokenizers/punkt/PY3/czech.pickle b/nltk_data/tokenizers/punkt/PY3/czech.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e62c4b16167fc65f3d73e464f68f59568663dd7d --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/czech.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50 +size 1119050 diff --git a/nltk_data/tokenizers/punkt/PY3/danish.pickle b/nltk_data/tokenizers/punkt/PY3/danish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..597395f54052d39d89fd9e5dcfce778a0633f7af --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/danish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3 +size 1191710 diff --git a/nltk_data/tokenizers/punkt/PY3/dutch.pickle b/nltk_data/tokenizers/punkt/PY3/dutch.pickle new file mode 100644 index 0000000000000000000000000000000000000000..fc63f96dd51e24d9689bfc6a31873e8b1e799a71 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/dutch.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c +size 693759 diff --git a/nltk_data/tokenizers/punkt/PY3/english.pickle b/nltk_data/tokenizers/punkt/PY3/english.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f80cfde61f08d459fcf7d0392a17c29796b7a471 --- /dev/null +++
b/nltk_data/tokenizers/punkt/PY3/english.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c +size 406697 diff --git a/nltk_data/tokenizers/punkt/PY3/estonian.pickle b/nltk_data/tokenizers/punkt/PY3/estonian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..34114d79fc9059688ae2aff00c852c7c0280008c --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/estonian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f +size 1499502 diff --git a/nltk_data/tokenizers/punkt/PY3/finnish.pickle b/nltk_data/tokenizers/punkt/PY3/finnish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..32d3cab7cee7f42af839526d7640c41e8c6e712b --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/finnish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833 +size 1852226 diff --git a/nltk_data/tokenizers/punkt/PY3/french.pickle b/nltk_data/tokenizers/punkt/PY3/french.pickle new file mode 100644 index 0000000000000000000000000000000000000000..15eb9f852e0735235abe876a705315096aa7062f --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/french.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29 +size 553575 diff --git a/nltk_data/tokenizers/punkt/PY3/german.pickle b/nltk_data/tokenizers/punkt/PY3/german.pickle new file mode 100644 index 0000000000000000000000000000000000000000..28180cd0382e4bf0627dafbce40ffe640c9551c2 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/german.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c +size 1463575 diff --git a/nltk_data/tokenizers/punkt/PY3/greek.pickle 
b/nltk_data/tokenizers/punkt/PY3/greek.pickle new file mode 100644 index 0000000000000000000000000000000000000000..bda22e88d55836afee8f64ca9c18727cacc9d7c9 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/greek.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd +size 876006 diff --git a/nltk_data/tokenizers/punkt/PY3/italian.pickle b/nltk_data/tokenizers/punkt/PY3/italian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..78cee51480c1e7a7c6ed33a63e73b3f90ea983bf --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/italian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947 +size 615089 diff --git a/nltk_data/tokenizers/punkt/PY3/malayalam.pickle b/nltk_data/tokenizers/punkt/PY3/malayalam.pickle new file mode 100644 index 0000000000000000000000000000000000000000..780c35e0ffb8b8390c60c3766d657410a8a0861c --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/malayalam.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d +size 221207 diff --git a/nltk_data/tokenizers/punkt/PY3/norwegian.pickle b/nltk_data/tokenizers/punkt/PY3/norwegian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..8c82d791ed82719d0afd2fe35cd2ba96e50b117d --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/norwegian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d +size 1181271 diff --git a/nltk_data/tokenizers/punkt/PY3/polish.pickle b/nltk_data/tokenizers/punkt/PY3/polish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..6da78bc0b08152010057e7c0a5c64bd9253180f9 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/polish.pickle @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718 +size 1738386 diff --git a/nltk_data/tokenizers/punkt/PY3/portuguese.pickle b/nltk_data/tokenizers/punkt/PY3/portuguese.pickle new file mode 100644 index 0000000000000000000000000000000000000000..ec190248fd4c46bc29904463f0d94e6a88ff8137 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/portuguese.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1 +size 611919 diff --git a/nltk_data/tokenizers/punkt/PY3/russian.pickle b/nltk_data/tokenizers/punkt/PY3/russian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..ca30476d51b8cd2d4d3d0bf2dbefb5c98240849d --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/russian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e +size 33020 diff --git a/nltk_data/tokenizers/punkt/PY3/slovene.pickle b/nltk_data/tokenizers/punkt/PY3/slovene.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b95a018444f3d9dfa5b0c5e9ac157aa352fac415 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/slovene.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864 +size 734444 diff --git a/nltk_data/tokenizers/punkt/PY3/spanish.pickle b/nltk_data/tokenizers/punkt/PY3/spanish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e322355abaedb8e4005f0d9fad6ec4ef7db38354 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/spanish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984 +size 562337 diff --git a/nltk_data/tokenizers/punkt/PY3/swedish.pickle b/nltk_data/tokenizers/punkt/PY3/swedish.pickle new file mode 100644 index 
0000000000000000000000000000000000000000..75975c08f17a0274985c418e4e38d06f51deb9f5 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/swedish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318 +size 979681 diff --git a/nltk_data/tokenizers/punkt/PY3/turkish.pickle b/nltk_data/tokenizers/punkt/PY3/turkish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..9c15a8c22a7cc7bc8de50b1e63922153189dcbd4 --- /dev/null +++ b/nltk_data/tokenizers/punkt/PY3/turkish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b +size 1017038 diff --git a/nltk_data/tokenizers/punkt/README b/nltk_data/tokenizers/punkt/README new file mode 100644 index 0000000000000000000000000000000000000000..49a637cb19b758348071ed19edb29d33ab435b33 --- /dev/null +++ b/nltk_data/tokenizers/punkt/README @@ -0,0 +1,98 @@ +Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected) + +Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have +been contributed by various people using NLTK for sentence boundary detection. 
+ +For information about how to use these models, please confer the tokenization HOWTO: +http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html +and chapter 3.8 of the NLTK book: +http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation + +There are pretrained tokenizers for the following languages: + +File Language Source Contents Size of training corpus(in tokens) Model contributed by +======================================================================================================================================================================= +czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss + Literarni Noviny +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss + (Berlingske Avisdata, Copenhagen) Weekend Avisen +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss + (American) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss 
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss + Text Bank (Suomen Kielen newspapers + Tekstipankki) + Finnish Center for IT Science + (CSC) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss + (European) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss + (Switzerland) CD-ROM + (Uses "ss" + instead of "ß") +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss + (Bokmål and Information Technologies, + Nynorsk) Bergen 
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner + (http://www.nkjp.pl/) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss + (Brazilian) (Linguateca) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss + Slovene Academy for Arts + and Sciences +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss + (European) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss + (and some other texts) +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- +turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss + (Türkçe Derlem Projesi) + University of Ankara +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +The corpora contained about 400,000 tokens on average and mostly consisted of 
newspaper text converted to +Unicode using the codecs module. + +Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. +Computational Linguistics 32: 485-525. + +---- Training Code ---- + +# import punkt +import nltk.tokenize.punkt + +# Make a new Tokenizer +tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() + +# Read in training corpus (one example: Slovene) +import codecs +text = codecs.open("slovene.plain","Ur","iso-8859-2").read() + +# Train tokenizer +tokenizer.train(text) + +# Dump pickled tokenizer +import pickle +out = open("slovene.pickle","wb") +pickle.dump(tokenizer, out) +out.close() + +--------- diff --git a/nltk_data/tokenizers/punkt/czech.pickle b/nltk_data/tokenizers/punkt/czech.pickle new file mode 100644 index 0000000000000000000000000000000000000000..9e25fd5807fb774c1ca5a8afb1505ad81dc85a94 --- /dev/null +++ b/nltk_data/tokenizers/punkt/czech.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690 +size 1265552 diff --git a/nltk_data/tokenizers/punkt/danish.pickle b/nltk_data/tokenizers/punkt/danish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..13990c08b98862d02ea83a64d136dda6fcedb209 --- /dev/null +++ b/nltk_data/tokenizers/punkt/danish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20 +size 1264725 diff --git a/nltk_data/tokenizers/punkt/dutch.pickle b/nltk_data/tokenizers/punkt/dutch.pickle new file mode 100644 index 0000000000000000000000000000000000000000..c8655aef0708b9ad98ef2a83a59576bfa9c95962 --- /dev/null +++ b/nltk_data/tokenizers/punkt/dutch.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c +size 742624 diff --git a/nltk_data/tokenizers/punkt/english.pickle 
b/nltk_data/tokenizers/punkt/english.pickle new file mode 100644 index 0000000000000000000000000000000000000000..39546a0de451a0eaecb08347948463154e91dde7 --- /dev/null +++ b/nltk_data/tokenizers/punkt/english.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa +size 433305 diff --git a/nltk_data/tokenizers/punkt/estonian.pickle b/nltk_data/tokenizers/punkt/estonian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..5c97b292ed3e7251289dc98a617a5e3326dc90c0 --- /dev/null +++ b/nltk_data/tokenizers/punkt/estonian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d +size 1596714 diff --git a/nltk_data/tokenizers/punkt/finnish.pickle b/nltk_data/tokenizers/punkt/finnish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e2738f8ecdeda7e7a8ecbb8d71fb90313af5ffa0 --- /dev/null +++ b/nltk_data/tokenizers/punkt/finnish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835 +size 1951656 diff --git a/nltk_data/tokenizers/punkt/french.pickle b/nltk_data/tokenizers/punkt/french.pickle new file mode 100644 index 0000000000000000000000000000000000000000..ebd9e5f5d79b19c97994d7f1b47f2c0942794f15 --- /dev/null +++ b/nltk_data/tokenizers/punkt/french.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee +size 583482 diff --git a/nltk_data/tokenizers/punkt/german.pickle b/nltk_data/tokenizers/punkt/german.pickle new file mode 100644 index 0000000000000000000000000000000000000000..d99b7a27c1c55dc08155828bb4893f431bec3f60 --- /dev/null +++ b/nltk_data/tokenizers/punkt/german.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c +size 1526714 diff --git a/nltk_data/tokenizers/punkt/greek.pickle b/nltk_data/tokenizers/punkt/greek.pickle new file mode 100644 index 0000000000000000000000000000000000000000..dea8b87119c8a991e97e7e78a9a74599d5e6811e --- /dev/null +++ b/nltk_data/tokenizers/punkt/greek.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c +size 1953106 diff --git a/nltk_data/tokenizers/punkt/italian.pickle b/nltk_data/tokenizers/punkt/italian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e68bcb795da9894847f1d22659c3e8d3c5f946bc --- /dev/null +++ b/nltk_data/tokenizers/punkt/italian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805 +size 658331 diff --git a/nltk_data/tokenizers/punkt/malayalam.pickle b/nltk_data/tokenizers/punkt/malayalam.pickle new file mode 100644 index 0000000000000000000000000000000000000000..780c35e0ffb8b8390c60c3766d657410a8a0861c --- /dev/null +++ b/nltk_data/tokenizers/punkt/malayalam.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d +size 221207 diff --git a/nltk_data/tokenizers/punkt/norwegian.pickle b/nltk_data/tokenizers/punkt/norwegian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..1cffbc5f1c94bac455aaf08c73c585ac0d50c111 --- /dev/null +++ b/nltk_data/tokenizers/punkt/norwegian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf +size 1259779 diff --git a/nltk_data/tokenizers/punkt/polish.pickle b/nltk_data/tokenizers/punkt/polish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..dd12f3ee60dbe99fec4c5691d303c00961a7256c --- /dev/null 
+++ b/nltk_data/tokenizers/punkt/polish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92 +size 2042451 diff --git a/nltk_data/tokenizers/punkt/portuguese.pickle b/nltk_data/tokenizers/punkt/portuguese.pickle new file mode 100644 index 0000000000000000000000000000000000000000..52f8b2f3c39deb945d112ea76c92cf390e0f3564 --- /dev/null +++ b/nltk_data/tokenizers/punkt/portuguese.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0 +size 649051 diff --git a/nltk_data/tokenizers/punkt/russian.pickle b/nltk_data/tokenizers/punkt/russian.pickle new file mode 100644 index 0000000000000000000000000000000000000000..1d13c7c860beecd80034bee8de1dd20ef60be149 --- /dev/null +++ b/nltk_data/tokenizers/punkt/russian.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909 +size 33027 diff --git a/nltk_data/tokenizers/punkt/slovene.pickle b/nltk_data/tokenizers/punkt/slovene.pickle new file mode 100644 index 0000000000000000000000000000000000000000..ef82e2942a7abc45ac19c3b78c0b54c3787c46ee --- /dev/null +++ b/nltk_data/tokenizers/punkt/slovene.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea +size 832867 diff --git a/nltk_data/tokenizers/punkt/spanish.pickle b/nltk_data/tokenizers/punkt/spanish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..4188b8fb222c1995cd6015999e3b52efdccb40f7 --- /dev/null +++ b/nltk_data/tokenizers/punkt/spanish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3 +size 597831 diff --git a/nltk_data/tokenizers/punkt/swedish.pickle b/nltk_data/tokenizers/punkt/swedish.pickle new file 
mode 100644 index 0000000000000000000000000000000000000000..21b92a99ae58168cd6c2fd2d1bc91ab100c005c3 --- /dev/null +++ b/nltk_data/tokenizers/punkt/swedish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d50ebdad6caa87715f2e300b1217ec92c42de205a543cc4a56903bd2c9acfa +size 1034496 diff --git a/nltk_data/tokenizers/punkt/turkish.pickle b/nltk_data/tokenizers/punkt/turkish.pickle new file mode 100644 index 0000000000000000000000000000000000000000..0dd0c80b1ad0b06b3bfaf20ab28c7e613cfe852d --- /dev/null +++ b/nltk_data/tokenizers/punkt/turkish.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ae47d76501d027698809d12e75292c9c392910488543342802f95db9765ccc +size 1225013 diff --git a/src/__pycache__/nltk_utilities.cpython-311.pyc b/src/__pycache__/nltk_utilities.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3499a2117fbe595c6ea856df1352bb1c4f6545bc Binary files /dev/null and b/src/__pycache__/nltk_utilities.cpython-311.pyc differ diff --git a/src/__pycache__/preprocessing.cpython-311.pyc b/src/__pycache__/preprocessing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26ee5692761df7fd3ba76ddad381c7bff143504b Binary files /dev/null and b/src/__pycache__/preprocessing.cpython-311.pyc differ diff --git a/src/__pycache__/sentence_transformer_utilities.cpython-311.pyc b/src/__pycache__/sentence_transformer_utilities.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1946e8d1413d2298f757059d98d086a5364e0671 Binary files /dev/null and b/src/__pycache__/sentence_transformer_utilities.cpython-311.pyc differ diff --git a/src/__pycache__/spacy_utilities.cpython-311.pyc b/src/__pycache__/spacy_utilities.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bfbd35cbb74cc66b581010507955459f5245544 Binary files /dev/null and b/src/__pycache__/spacy_utilities.cpython-311.pyc differ diff 
class NltkSegmentizer:
    """Sentence segmenter backed by NLTK's ``sent_tokenize``."""

    def __init__(self):
        """Make sure the punkt tokenizer data is available.

        The data is downloaded only when NLTK cannot find it on its search
        path, so constructing the object does not hit the network when a
        local copy of ``tokenizers/punkt`` is already installed.
        """
        print("Initializing NltkSegmentizer object")
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def segment_into_sentences(self, src_text="", _format=""):
        """Split text into sentences.

        Args:
            src_text: A string, or a list of strings that are each segmented
                in order and concatenated into one flat result.
            _format: Accepted for signature parity with the other
                segmentizers; it is not used by this implementation.

        Returns:
            A list of sentence strings, or ``None`` when ``src_text`` is
            neither a ``str`` nor a ``list``.
        """
        if isinstance(src_text, str):
            return sent_tokenize(src_text)

        if isinstance(src_text, list):
            sentences = []
            for chunk in src_text:
                sentences.extend(sent_tokenize(chunk))
            return sentences

        # Unsupported input types keep the historical "no result" value.
        return None
def remove_patterns(text):
    """Remove line breaks, e-mail addresses, hashtags and punctuation.

    Args:
        text: A string, or an object exposing its raw string as ``.text``
            (for example a spaCy ``Span``).

    Returns:
        The cleaned string. Newlines become spaces; e-mail-like tokens
        (together with one trailing whitespace character), hashtags and
        every character that is neither a word character nor whitespace
        are deleted.
    """
    if not isinstance(text, str):
        # Unwrap wrapper objects such as spaCy spans to their plain string.
        text = text.text
    # Replace newline characters with spaces
    text = re.sub(r'\n', ' ', text)
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text


def extract_patterns(text):
    """Extract punctuation, e-mail addresses and hashtags from ``text``.

    Returns:
        A tuple ``(punctuation, emails, hashtags)`` of lists, each in order
        of appearance. ``punctuation`` holds single characters and includes
        the ``@``, ``.`` and ``#`` characters found inside e-mails/hashtags.
    """
    emails = re.findall(r'\S+@\S+', text)
    hashtags = re.findall(r'#\w+', text)
    punctuation = re.findall(r'[^\w\s]', text)

    return punctuation, emails, hashtags


def remove_punct_nltk(text):
    """Remove punctuation from ``text`` using NLTK's ``RegexpTokenizer``.

    The text is split into runs of word characters and the runs are joined
    back with single spaces, so punctuation is dropped and whitespace is
    normalised. Previously the tokenizer output was discarded and the input
    was returned unchanged.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return " ".join(tokenizer.tokenize(text))
class SentTransfUtilities:
    """Thin convenience wrapper around a ``SentenceTransformer`` model.

    Model names mentioned by the original author as options:
    ``all-MiniLM-L6-v2``, ``nq-distilbert-base-v1`` and
    ``paraphrase-multilingual-MiniLM-L12-v2``.
    """

    # Loaded SentenceTransformer instance (None until __init__ runs).
    model = None
    # Name passed to SentenceTransformer(...) when loading the model.
    __model_name = None

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformer model called ``model_name``."""
        self.__model_name = model_name
        if self.model is None:
            print("Initializing the Sentence Transformer model")
            self.model = SentenceTransformer(self.__model_name)

    def get_embeddings(self, src_data, device='cpu'):
        """Encode ``src_data`` into tensor embeddings.

        Args:
            src_data: Input forwarded unchanged to ``model.encode``.
            device: Device forwarded to ``model.encode``. Defaults to
                ``'cpu'``, which was previously hard-coded.
        """
        return self.model.encode(src_data, convert_to_tensor=True, device=device)

    def compute_cosine_similarity(self, query_embeddings, passage_embeddings):
        """Return ``util.cos_sim`` of the two embedding sets."""
        return util.cos_sim(query_embeddings, passage_embeddings)

    def compute_dot_similarity(self, query_embeddings, passage_embeddings):
        """Return ``util.dot_score`` of the two embedding sets."""
        return util.dot_score(query_embeddings, passage_embeddings)

    def compute_semantic_search(self, query_embeddings, corpus_embeddings):
        """Return ``util.semantic_search`` of the queries against the corpus."""
        return util.semantic_search(query_embeddings, corpus_embeddings)

    def compute_sentences_similarity(self, sentence_1, sentence_2, sim_func="cosine"):
        """Embed both inputs and score them against each other.

        Args:
            sentence_1: First input, passed to ``get_embeddings``.
            sentence_2: Second input, passed to ``get_embeddings``.
            sim_func: ``"cosine"`` or ``"dot"``.

        Returns:
            The similarity scores, or ``None`` when ``sim_func`` is not one
            of the supported names.
        """
        embeddings_1 = self.get_embeddings(sentence_1)
        embeddings_2 = self.get_embeddings(sentence_2)

        if sim_func == "cosine":
            return self.compute_cosine_similarity(embeddings_1, embeddings_2)
        if sim_func == "dot":
            return self.compute_dot_similarity(embeddings_1, embeddings_2)
        return None
class SpacySegmentizer:
    """Sentence segmenter built on a blank spaCy English pipeline."""

    # spaCy pipeline; stays None if initialisation failed.
    __nlp_SpaCy = None

    def __init__(self):
        """Create the pipeline unless one is already set."""
        if self.__nlp_SpaCy is None:
            print("Initializing spacy")
            self.initialize_spacy()

    def initialize_spacy(self):
        """Build an ``English()`` pipeline with the ``sentencizer`` component.

        Returns:
            The pipeline, or ``None`` if construction raised; the traceback
            is printed in that case instead of propagating the exception.
        """
        try:
            self.__nlp_SpaCy = English()
            self.__nlp_SpaCy.add_pipe("sentencizer")
        except Exception:
            print(f"An error happens in initialize_spacy(...) {traceback.format_exc()}.")
            self.__nlp_SpaCy = None
        return self.__nlp_SpaCy

    def segment_into_sentences(self, src_text="", _format=""):
        """Split text into sentences.

        Args:
            src_text: A string, or a list of strings that are each segmented
                in order and concatenated into one flat result.
            _format: ``"str"`` returns plain strings, built by joining the
                items yielded by each sentence span with single spaces; any
                other value returns the sentence objects produced by the
                pipeline.

        Returns:
            A list of sentences, or ``None`` when ``src_text`` is neither a
            ``str`` nor a ``list`` (previously ``_format="str"`` raised a
            ``TypeError`` in that case).
        """
        pipeline = self.__nlp_SpaCy

        if isinstance(src_text, str):
            spans = list(pipeline(src_text).sents)
        elif isinstance(src_text, list):
            spans = []
            for chunk in src_text:
                spans.extend(pipeline(chunk).sents)
        else:
            return None

        if _format == "str":
            return [" ".join(str(piece) for piece in span) for span in spans]
        return spans
class StanzaSegmentizer:
    """Sentence segmenter built on a Stanza English pipeline."""

    # Stanza pipeline; stays None if initialisation failed.
    __nlp_stanza = None

    def __init__(self):
        """Create the pipeline unless one is already set; never raises."""
        try:
            if self.__nlp_stanza is None:
                print("Initializing stanza")
                self.initialize_stanza()
        except Exception as exc_msg:
            print(exc_msg)

    def initialize_stanza(self):
        """Build ``stanza.Pipeline('en')``.

        Returns:
            The pipeline, or ``None`` if construction raised; the traceback
            is printed in that case instead of propagating the exception.
        """
        try:
            self.__nlp_stanza = stanza.Pipeline('en')
        except Exception:
            # The message used to name initialize_spacy (copy-paste slip).
            print(f"An error happens in initialize_stanza(...) {traceback.format_exc()}.")
            self.__nlp_stanza = None
        return self.__nlp_stanza

    def segment_into_sentences(self, src_text="", _format="str"):
        """Split text into sentences.

        Args:
            src_text: A string, or a list of strings that are each segmented
                in order and concatenated into one flat result.
            _format: ``"str"`` (default) returns each sentence's ``.text``;
                any other value returns the sentence objects themselves.

        Returns:
            A list of sentences, or ``None`` when ``src_text`` is neither a
            ``str`` nor a ``list`` (previously the default format raised a
            ``TypeError`` in that case).
        """
        pipeline = self.__nlp_stanza

        if isinstance(src_text, str):
            sentences = list(pipeline(src_text).sentences)
        elif isinstance(src_text, list):
            sentences = []
            for chunk in src_text:
                sentences.extend(pipeline(chunk).sentences)
        else:
            return None

        if _format == "str":
            return [sentence.text for sentence in sentences]
        return sentences
class SummarizationUtilities:
    """Pegasus-based abstractive summarizer."""

    # Identifier kept for reference; weights are loaded from model_path.
    model_name = None
    # Device the model and the tokenized batches are moved to.
    device = None
    tokenizer = None
    model = None

    def __init__(self, model_name="google/pegasus-xsum", device=None, model_path=config.pegasus_model_path):
        """Load tokenizer and model from ``model_path``.

        Args:
            model_name: Stored on the instance; not used for loading.
            device: Target device. When ``None`` the device is auto-detected
                ("cuda" if available, else "cpu").
            model_path: Location passed to ``from_pretrained``.
        """
        self.model_name = model_name
        if device is None:
            self.device = self.detect_available_cuda_device()
        else:
            self.device = device

        self.tokenizer = PegasusTokenizer.from_pretrained(model_path)
        # Move the model to the resolved device, not the raw (possibly None)
        # argument, so it lives where tokenize() sends its batches.
        self.model = PegasusForConditionalGeneration.from_pretrained(model_path).to(self.device)

    def detect_available_cuda_device(self):
        """Set ``self.device`` to "cuda" or "cpu" and return that value.

        The value is returned as well as stored so that callers assigning
        the result no longer overwrite ``self.device`` with ``None``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        return self.device

    def tokenize(self, src_text, truncation=True, padding="longest", return_tensors="pt"):
        """Tokenize ``src_text`` and move the batch to ``self.device``."""
        return self.tokenizer(
            src_text,
            truncation=truncation,
            padding=padding,
            return_tensors=return_tensors,
        ).to(self.device)

    def generate(self, batch):
        """Run ``model.generate`` on a tokenized batch."""
        return self.model.generate(**batch)

    def decode_generated_text(self, generated_text, skip_special_tokens=True):
        """Decode generated token ids into a list of strings."""
        return self.tokenizer.batch_decode(generated_text, skip_special_tokens=skip_special_tokens)

    def get_summary(self, src_text):
        """Alias of :meth:`summarize`, kept for existing callers."""
        return self.summarize(src_text)

    def summarize(self, src_text):
        """Tokenize, generate and decode; returns the decoded summaries."""
        batch = self.tokenize(src_text)
        generated_text = self.generate(batch)
        return self.decode_generated_text(generated_text)
BartForConditionalGeneration.from_pretrained(model_path) + + def summarize(self, text): + inputs = self.tokenizer([text], truncation=True, padding="longest", return_tensors="pt").to(self.device) + summary_ids = self.model.generate(inputs["input_ids"], num_beams=4, max_length=200, early_stopping=True) + summary = self.tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) + return summary + + +class T5Summarizer: + def __init__(self, device=None, model_path=config.t5_model_path): + self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu") + # self.tokenizer = T5Tokenizer.from_pretrained("t5-base") + # self.model = T5ForConditionalGeneration.from_pretrained("t5-base").to(self.device) + self.tokenizer = T5Tokenizer.from_pretrained(model_path) + self.model = T5ForConditionalGeneration.from_pretrained(model_path).to(self.device) + + def summarize(self, text): + inputs = self.tokenizer.encode_plus(text, return_tensors="pt", truncation=True, padding="longest").to(self.device) + summary_ids = self.model.generate(inputs.input_ids) + summary = self.tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) + return summary + + +class ProphetNetSummarizer: + def __init__(self, device=None, model_path=config.prophetnet_model_path): + self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu") + # self.tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased") + # self.model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased").to(self.device) + self.tokenizer = ProphetNetTokenizer.from_pretrained(model_path) + self.model = ProphetNetForConditionalGeneration.from_pretrained(model_path).to(self.device) + + def summarize(self, text): + inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="longest").to(self.device) + summary_ids = self.model.generate(inputs.input_ids) + summary = 
self.tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) + return summary \ No newline at end of file