Table of Stop Words
Three sources, hyperlinked in the column names.¹
Translations by Google, except where noted.
idx | words | 01a_stop² | 02_ranks_nl | 03_dutch-stop-words | translation | BD count |
---|---|---|---|---|---|---|
0 | aan | 1 | 1 | 1 | to | 213 |
1 | af | 0 | 1 | 1 | down | 19 |
2 | al | 1 | 1 | 1 | already | 82 |
3 | alles | 1 | 0 | 1 | all | 9 |
4 | als | 1 | 1 | 1 | as | 97 |
5 | altijd | 1 | 0 | 1 | always | 10 |
6 | andere | 1 | 0 | 1 | other | 37 |
7 | ben | 1 | 0 | 1 | am | 14 |
8 | bij | 1 | 1 | 1 | at | 175 |
9 | daar | 1 | 0 | 1 | over there | 43 |
10 | dan | 1 | 1 | 1 | than | 64 |
11 | dat | 1 | 1 | 1 | it | 403 |
12 | de | 1 | 0 | 1 | the | 2481 |
13 | der | 1 | 0 | 1 | of | 29 |
14 | deze | 1 | 0 | 1 | this | 66 |
15 | die | 1 | 1 | 1 | that | 245 |
16 | dit | 1 | 1 | 1 | this | 68 |
17 | doch | 1 | 0 | 1 | but | |
18 | doen | 1 | 0 | 1 | do | 25 |
19 | door | 1 | 0 | 1 | by | 125 |
20 | dus | 1 | 0 | 1 | so | 20 |
21 | een | 1 | 1 | 1 | a | 923 |
22 | eens | 1 | 0 | 1 | once | 14 |
23 | en | 1 | 1 | 1 | and | 615 |
24 | er | 1 | 1 | 1 | there | 239 |
25 | ge | 1 | 0 | 1 | ‘thou’³ | |
26 | geen | 1 | 0 | 1 | no | 69 |
27 | geweest | 1 | 0 | 1 | been | 7 |
28 | haar | 1 | 0 | 1 | its | 35 |
29 | had | 1 | 1 | 1 | had | 43 |
30 | heb | 1 | 1 | 1 | have | 10 |
31 | hebben | 1 | 0 | 1 | have | 107 |
32 | heeft | 1 | 0 | 1 | has | 149 |
33 | hem | 1 | 1 | 1 | him | 31 |
34 | het | 1 | 1 | 1 | the | 1063 |
35 | hier | 1 | 0 | 1 | here | 15 |
36 | hij | 1 | 1 | 1 | he | 141 |
37 | hoe | 1 | 1 | 1 | how | 21 |
38 | hun | 1 | 1 | 1 | their | 40 |
39 | iemand | 1 | 0 | 1 | someone | 2 |
40 | iets | 1 | 0 | 1 | something | 13 |
41 | ik | 1 | 1 | 1 | I | 66 |
42 | in | 1 | 1 | 1 | in | 861 |
43 | is | 1 | 1 | 1 | is | 435 |
44 | ja | 1 | 0 | 1 | yes | 3 |
45 | je | 1 | 1 | 1 | you | 57 |
46 | kan | 1 | 1 | 1 | can | 59 |
47 | kon | 1 | 0 | 1 | could | 20 |
48 | kunnen | 1 | 0 | 1 | can | 47 |
49 | maar | 1 | 0 | 1 | but | 150 |
50 | me | 1 | 1 | 1 | me | 10 |
51 | meer | 1 | 0 | 1 | (1) more | 101 |
52 | men | 1 | 1 | 1 | one | |
53 | met | 1 | 1 | 1 | with | 316 |
54 | mij | 1 | 1 | 1 | me | 8 |
55 | mijn | 1 | 0 | 1 | my | 16 |
56 | moet | 1 | 0 | 1 | must | 42 |
57 | na | 1 | 0 | 1 | after | 62 |
58 | naar | 1 | 0 | 1 | to | 153 |
59 | niet | 1 | 0 | 1 | not | 226 |
60 | niets | 1 | 0 | 1 | nothing | 6 |
61 | nog | 1 | 1 | 1 | yet | 156 |
62 | nu | 1 | 1 | 1 | now | 43 |
63 | of | 1 | 1 | 1 | whether | 61 |
64 | om | 1 | 0 | 1 | to | 218 |
65 | omdat | 1 | 0 | 1 | because | 27 |
66 | onder | 1 | 0 | 0 | below | 67 |
67 | ons | 1 | 1 | 1 | us / our | 16 |
68 | ook | 1 | 1 | 1 | also | 149 |
69 | op | 1 | 0 | 1 | on | 544 |
70 | over | 1 | 0 | 1 | about | 140 |
71 | reeds | 1 | 0 | 1 | already | |
72 | te | 1 | 1 | 1 | too | 324 |
73 | tegen | 1 | 0 | 1 | against | 46 |
74 | toch | 1 | 0 | 1 | nevertheless | 17 |
75 | toen | 1 | 0 | 1 | when | 38 |
76 | tot | 1 | 1 | 1 | until | 85 |
77 | u | 1 | 0 | 1 | you | 9 |
78 | uit | 1 | 1 | 1 | from | 158 |
79 | uw | 1 | 0 | 1 | your | 2 |
80 | van | 1 | 1 | 1 | from | 1078 |
81 | veel | 1 | 0 | 1 | many | 42 |
82 | voor | 1 | 0 | 1 | in front of | 278 |
83 | want | 1 | 0 | 1 | because | 13 |
84 | waren | 1 | 0 | 1 | goods | 53 |
85 | was | 1 | 1 | 1 | was | 133 |
86 | wat | 1 | 1 | 1 | what | 46 |
87 | we | 0 | 1 | 1 | we | 39 |
88 | wel | 0 | 1 | 1 | well | 59 |
89 | werd | 1 | 0 | 1 | became | 99 |
90 | wezen | 1 | 0 | 1 | being | |
91 | wie | 1 | 0 | 1 | who | 13 |
92 | wij | 0 | 1 | 1 | we | 19 |
93 | wil | 1 | 0 | 1 | want | 33 |
94 | worden | 1 | 0 | 1 | be | 94 |
95 | wordt | 1 | 0 | 0 | is | 82 |
96 | zal | 1 | 1 | 1 | shall | 28 |
97 | ze | 1 | 1 | 1 | she | 90 |
98 | zei | 0 | 1 | 1 | said | 13 |
99 | zelf | 1 | 0 | 1 | self | 12 |
100 | zich | 1 | 0 | 1 | himself | 73 |
101 | zij | 1 | 1 | 1 | they | 40 |
102 | zijn | 1 | 0 | 1 | his | 304 |
103 | zo | 1 | 1 | 1 | so | 45 |
104 | zonder | 1 | 0 | 1 | without | 14 |
105 | zou | 1 | 1 | 1 | would | 67 |
Code
import pandas as pd
import os
# Only these three word lists feed the table; the names are the ones shown
# in the recorded output of this cell.
source_names = ['01a_stop.txt', '02_ranks_nl.txt', '03_dutch-stop-words.txt']

# os.listdir() returns entries in arbitrary order, so dropping "whatever is
# at index 1" is fragile. A re-run would also pick up the
# stopwords_table.csv that this notebook later writes into the same
# directory. Select the inputs by name and sort them for a stable order.
stop_files = sorted(
    name for name in os.listdir('stopwords/') if name in source_names
)
stop_files
Out:
['01a_stop.txt', '02_ranks_nl.txt', '03_dutch-stop-words.txt']
# Read every source file once, keeping two views of the data:
#   stop_list - one {'file': <file name>, 'words': [...]} record per file
#   stopwords - every word from every file, duplicates included
stop_list = []
stopwords = []
for stop in stop_files:
    with open('stopwords/' + stop, 'r') as f:
        # Each line, stripped of surrounding whitespace, is taken as a word.
        words = [line.strip() for line in f]
    stopwords.extend(words)
    stop_list.append({'file': stop, 'words': words})

# Words per file, total words read, and number of distinct words.
# Single-argument print(...) prints the same on Python 2 and Python 3.
print([len(d['words']) for d in stop_list])
print(len(stopwords))
print(len(set(stopwords)))
Out:
[101, 48, 104]
253
106
# One row per distinct stop word, in a single 'words' column. Row order
# follows set iteration order and is therefore not meaningful.
# Naming the column in the constructor (rather than assigning .columns
# afterwards) also works when the word list is empty.
stopwords_idx = pd.DataFrame(list(set(stopwords)), columns=['words'])
print(stopwords_idx.shape)
stopwords_idx.head()
Out:
(106, 1)
words | |
---|---|
0 | andere |
1 | deze |
2 | over |
3 | zei |
4 | zal |
# Build the membership table: one row per distinct word and one 0/1 column
# per source file saying whether that file contains the word.
stopwords_full = stopwords_idx
print(stopwords_full.shape)
for entry in stop_list:
    df = pd.DataFrame(entry['words'], columns=['words'])
    # Flag column named after the source file (minus '.txt'); every word
    # read from that file gets a 1.
    df[entry['file'].replace('.txt', '')] = 1
    stopwords_temp = pd.merge(stopwords_idx, df, on='words')
    # The outer merge keeps words that are missing from this file; their
    # flag is NaN until the fillna() below.
    stopwords_full = pd.merge(stopwords_full, stopwords_temp,
                              on='words', how='outer')
print(stopwords_full.shape)

# A missing flag means the word is absent from that file.
stopwords_full = stopwords_full.fillna(0)
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
# is its replacement.
stopwords_full = stopwords_full.sort_values(by='words', axis=0)
stopwords_full = stopwords_full.reset_index(drop=True)
stopwords_full.head()
Out:
(106, 1)
(106, 4)
words | 01a_stop | 02_ranks_nl | 03_dutch-stop-words | |
---|---|---|---|---|
0 | aan | 1 | 1 | 1 |
1 | af | 0 | 1 | 1 |
2 | al | 1 | 1 | 1 |
3 | alles | 1 | 0 | 1 |
4 | als | 1 | 1 | 1 |
# Persist the merged word/source table as UTF-8 CSV next to the input lists.
stopwords_full.to_csv(
    path_or_buf='stopwords/stopwords_table.csv',
    encoding='utf8',
)
Notes
1 The three sources are also listed below, with identified contacts:
- http://snowball.tartarus.org/algorithms/dutch/stop.txt; http://www.patrickmileswriter.co.uk/; mail@patrickmiles.co.uk
- http://www.ranks.nl/stopwords/dutch; damian@ranks.nl
- http://www.damienvanholten.com/blog/dutch-stop-words/; http://twitter.com/damienvanholten ↩
2 Modified to remove provided translations. ↩
3 According to http://snowball.tartarus.org/algorithms/dutch/stop.txt: “‘thou’, still used in Belgium and south Netherlands” ↩