Tokenisation is a fundamental step in natural language processing (NLP) that involves breaking text down into smaller units called tokens. Depending on the level of tokenisation, these tokens can be words, sentences, or subwords.

Word tokenisation

```python
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")  # tokeniser models; only needed once

# word_tokenize keeps punctuation as separate tokens
text_word_tokens_nltk = word_tokenize(text_original)
print(text_word_tokens_nltk)
```

Sentence tokenisation

```python
from nltk.tokenize import sent_tokenize

text_sentence_tokens_nltk = sent_tokenize(text_original)
print(text_sentence_tokens_nltk)
```

Basic tokenisation

```python
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")  # stopword list; only needed once

temp = text_original.lower()
temp = re.sub(r"\[[0-9]+\]", "", temp)     # remove citation markers such as [1], before punctuation is stripped
temp = re.sub(r"[^a-zA-Z0-9]", " ", temp)  # keep just letters and numbers
temp = word_tokenize(temp)                 # break the text up into a word list

stop_words = set(stopwords.words("english"))  # a set makes membership tests fast
tokens_no_stopwords = [token for token in temp if token not in stop_words]  # remove common words
print(tokens_no_stopwords)
```
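The snippets above assume `text_original` has already been defined earlier. If it has not, any hypothetical stand-in with a couple of sentences, punctuation, and a citation marker will do:

```python
# Hypothetical sample input (an assumption, not from the original text)
text_original = "Tokenisation breaks text into tokens [1]. It is a fundamental step in NLP."
```

Running the three snippets on this sample shows the differences: `word_tokenize` keeps the full stops and brackets as separate tokens, `sent_tokenize` returns the two sentences as strings, and the basic pipeline returns only lowercase content words.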
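For completeness on the subword level mentioned in the introduction: NLTK does not ship a subword tokeniser, but a minimal sketch using a pretrained WordPiece tokeniser from the Hugging Face `transformers` package (an extra dependency, assumed installed; not part of the NLTK workflow above) looks like this:

```python
# Minimal subword tokenisation sketch, assuming the "transformers" package is installed
from transformers import AutoTokenizer

subword_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # WordPiece vocabulary
print(subword_tokenizer.tokenize(text_original))
# Words outside the vocabulary are split into smaller pieces marked with "##"
```

Unlike word tokenisation, this never produces out-of-vocabulary tokens: a rare word is simply broken into smaller known pieces.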