Files
gists/codegrab/ctf/filtertext.py
2022-05-13 12:50:20 +02:00

14 lines
397 B
Python

import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Characters trimmed from both ends of every token before it is printed.
STRIP_CHARS = ' \n,.=!_\''


def clean_word(word):
    """Return *word* trimmed of surrounding punctuation, with dots as underscores.

    Leading/trailing characters from ``STRIP_CHARS`` are removed first, so only
    dots in the interior of the token are turned into ``_``.
    """
    word = word.strip(STRIP_CHARS)
    # str.replace returns a new string; the result must be kept.
    return word.replace(".", "_")


def main():
    """Print the significant words of the text file named in ``sys.argv[1]``.

    The file is tokenized with NLTK; tokens of at most three characters and
    English stop words are dropped, and each remaining token is cleaned with
    ``clean_word`` and printed on its own line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <textfile>".format(sys.argv[0]))

    # argv[0] is the script's own path; the input file is the first argument.
    with open(sys.argv[1], 'r') as f:
        text = " ".join(f.readlines())

    stop_words = set(stopwords.words('english'))
    for token in word_tokenize(text):
        # The length/stop-word filter is applied to the raw token, before
        # punctuation is stripped.
        if len(token) > 3 and token not in stop_words:
            print(clean_word(token))


if __name__ == "__main__":
    main()