Word Frequency in 5 Programming Languages (Java, Scala, Go, C++, R)

Let the code speak for itself.

Java 8

/**
 * Prints the 20 most frequent words of {@code samples.txt} that are longer
 * than six characters, one {@code WORD=count} entry per line.
 */
public class Application {

    /**
     * Counts the words of {@code samples.txt} and prints the top 20.
     *
     * @param args ignored
     * @throws IOException if {@code samples.txt} cannot be opened or read
     */
    public static void main(String[] args) throws IOException {

        Map<String, Integer> wordCounts;

        // Files.lines keeps the file open until the stream is closed, so the
        // stream is scoped to a try-with-resources block.
        try (java.util.stream.Stream<String> lines = Files.lines(new File("samples.txt").toPath())) {
            wordCounts = lines
                .parallel()
                // Splitting on single non-word characters yields empty strings
                // between adjacent separators; the length filter drops them.
                .flatMap(Pattern.compile("\\W")::splitAsStream)
                .filter(word -> word.length() > 6)
                .map(String::toUpperCase)
                // Concurrent map so the parallel stream can merge counts safely.
                .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
        }

        wordCounts.entrySet().stream()
            .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
            .limit(20)
            .forEach(System.out::println);
    }
}

Golang

// WordOccourences pairs a word with the number of times it was counted.
type WordOccourences struct {
  word         string // the counted word
  occourcenres int    // how often the word was seen
}

// main prints the most frequent long words of samples.txt. Any I/O failure is
// reported on stderr and the process exits with status 1.
func main() {
  if err := printTopWords("samples.txt"); err != nil {
    fmt.Fprintln(os.Stderr, err)
    os.Exit(1)
  }
}

// printTopWords counts the words of the given file and prints, most frequent
// first, at most 20 words that have six or more letters and occur more than
// 20 times, one "WORD=count" line each.
//
// It returns the error from opening or scanning the file, if any.
func printTopWords(filename string) error {
  expression := regexp.MustCompile("[^a-zA-ZüäößÜÄÖ]")

  data, err := os.Open(filename)
  if err != nil {
    return err
  }
  // Register the close only once the open is known to have succeeded.
  defer data.Close()

  wordCounts := make(map[string]int)

  scanner := bufio.NewScanner(data)
  scanner.Split(bufio.ScanWords)

  for scanner.Scan() {
    // Strip punctuation and digits first so the length check sees the word
    // that is actually counted; measure runes so umlauts count as one letter.
    sanitized := expression.ReplaceAllString(scanner.Text(), "")
    if len([]rune(sanitized)) > 5 {
      wordCounts[strings.ToUpper(sanitized)]++
    }
  }
  if err := scanner.Err(); err != nil {
    return err
  }

  // Collect only the words that pass the frequency threshold.
  wordlist := make([]WordOccourences, 0, len(wordCounts))
  for k, v := range wordCounts {
    if v > 20 {
      wordlist = append(wordlist, WordOccourences{k, v})
    }
  }

  sort.Slice(wordlist, func(i, j int) bool { return wordlist[i].occourcenres > wordlist[j].occourcenres })

  // Print the top 20 entries (indices 0..19).
  for idx := 0; idx < len(wordlist) && idx < 20; idx++ {
    fmt.Printf("%s=%d\n", wordlist[idx].word, wordlist[idx].occourcenres)
  }

  return nil
}

Scala

/** Counts word frequencies in a text file. */
class CountingWords {

  /** Returns the 20 most frequent words of `filename` that are longer than
    * six characters, upper-cased, paired with their counts and ordered from
    * most to least frequent.
    *
    * @param filename path of the text file to read
    * @return up to 20 `(WORD, count)` pairs, highest count first
    */
  def countFile(filename : String): List[(String, Int)] = {
    val source = scala.io.Source.fromFile(filename)
    try {
      // getLines() is lazy; the chain below is fully evaluated (foldLeft,
      // toList) before the finally block closes the file.
      source.getLines()
        .flatMap(_.split("\\W+"))
        .filter(_.length > 6)
        .map(_.toUpperCase)
        .foldLeft(Map.empty[String, Int]) {
          (count, word) => count + (word -> (count.getOrElse(word, 0) + 1))
        }
        .toList
        .sortBy(-_._2)
        .take(20)
    } finally {
      source.close()
    }
  }
}

R

# Read the whitespace-separated tokens of samples.txt; `what` is given a
# character value, so scan() returns a character vector.
tokens <- scan("samples.txt", what="list")

# Remove every non-letter character, then upper-case the result.
stripped <- gsub("[^[:alpha:]]", "", tokens)
words <- toupper(stripped)

# Keep only words of six or more characters.
is_long <- nchar(words) > 5
words <- words[is_long]

# Tabulate the words and order them from most to least frequent.
words <- table(words)
occourences <- sort(words, decreasing = TRUE)

# Show the words that occur more than ten times.
occourences[occourences > 10]

C++

namespace wordfrequency {

    /// Maps a word to the number of times it was counted.
    using wordFrequencyMap = std::unordered_map<std::string, int>;
    /// A single (word, count) result entry.
    using wordFrequencyPair = std::pair<std::string, int>;
    /// Sequence of (word, count) entries as handed back by WordCounter::get().
    using wordFrequencyVector = std::vector<wordFrequencyPair>;

    /// Counts word frequencies in a text file through a chainable interface:
    /// configure the thresholds, call count(), optionally sort, then get().
    class WordCounter {
    private:
        std::string fileName;            ///< Path of the file read by count().
        wordFrequencyMap data;           ///< Word -> count table filled by count().
        wordFrequencyVector exportData;  ///< Entries of `data` that passed the count threshold.
        int ignoreLessThan = 0;          ///< Minimum count an entry needs to be exported.
        int minimumWordLength = 0;       ///< Minimum cleaned-word length to be counted.

    public:
        WordCounter();
        /// @param fileName path of the text file to analyse
        WordCounter(const std::string& fileName);
        WordCounter(const WordCounter& orig);
        virtual ~WordCounter();

        /// Reads the file and builds the frequency data; configure the
        /// thresholds before calling this. @return *this for chaining
        WordCounter& count();
        /// Sets the minimum count a word needs to appear in the result.
        /// @return *this for chaining
        WordCounter& ignoreCountsLessThan(int numOfWords);
        /// Orders the result by count, lowest first. @return *this for chaining
        WordCounter& sortAscending();
        /// Orders the result by count, highest first. @return *this for chaining
        WordCounter& sortDescending();
        /// Sets the minimum length a word needs to be counted.
        /// @return *this for chaining
        WordCounter& ignoreWordsShorterThanChars(int numOfChars);
        /// @return a copy of the current result entries
        const wordFrequencyVector get();
    };

}

namespace wordfrequency {

    /// Creates a counter without an input file.
    WordCounter::WordCounter() {
    }

    /// Copies the file name, the thresholds and any results already computed.
    WordCounter::WordCounter(const WordCounter& orig)
        : fileName(orig.fileName),
          data(orig.data),
          exportData(orig.exportData),
          ignoreLessThan(orig.ignoreLessThan),
          minimumWordLength(orig.minimumWordLength) {
    }

    WordCounter::~WordCounter() {
    }

    /// Creates a counter for the given file.
    /// @param fileName path of the text file that count() will read
    WordCounter::WordCounter(const std::string& fileName) {
        this->fileName.assign(fileName);
    }

    /// Reads the file word by word, strips every character outside the
    /// letter set, upper-cases the rest and counts the words that meet the
    /// minimum length. Words whose count reaches the configured threshold are
    /// copied into the exported result.
    ///
    /// A file that cannot be opened produces an empty result.
    /// @return *this for chaining
    WordCounter& WordCounter::count() {
        // Start from a clean slate so that calling count() again neither
        // double-counts the file nor appends duplicate result entries.
        this->data.clear();
        this->exportData.clear();

        // NOTE(review): the pattern lists umlauts inside a char-based regex;
        // on UTF-8 input these are matched byte-wise — confirm the encoding.
        const boost::regex nonLetters("[^a-zA-ZüäößÜÄÖ]");

        // A negative threshold means "no minimum"; converting it up front
        // avoids a signed/unsigned comparison against size().
        const std::size_t minLength = this->minimumWordLength > 0
            ? static_cast<std::size_t>(this->minimumWordLength)
            : 0;

        std::ifstream file(this->fileName);
        std::string word;
        while (file >> word) {
            std::string wordCleaned = boost::regex_replace(word, nonLetters, "");
            // Tokens made only of digits/punctuation clean down to nothing
            // and are not words.
            if (wordCleaned.empty() || wordCleaned.size() < minLength) {
                continue;
            }
            boost::to_upper(wordCleaned);
            this->data[wordCleaned]++;
        }

        for (const auto& entry : this->data) {
            if (entry.second >= this->ignoreLessThan) {
                this->exportData.emplace_back(entry.first, entry.second);
            }
        }

        return *this;
    }

    /// Sets the minimum count a word needs to be exported by count().
    /// @return *this for chaining
    WordCounter& WordCounter::ignoreCountsLessThan(int numOfWords) {
        this->ignoreLessThan = numOfWords;
        return *this;
    }

    /// Orders the exported entries by count, lowest first.
    /// @return *this for chaining
    WordCounter& WordCounter::sortAscending() {
        std::sort(this->exportData.begin(), this->exportData.end(),
                  [](const wordFrequencyPair& a, const wordFrequencyPair& b) {
            return a.second < b.second;
        });

        return *this;
    }

    /// Orders the exported entries by count, highest first.
    /// @return *this for chaining
    WordCounter& WordCounter::sortDescending() {
        std::sort(this->exportData.begin(), this->exportData.end(),
                  [](const wordFrequencyPair& a, const wordFrequencyPair& b) {
            return a.second > b.second;
        });

        return *this;
    }

    /// Sets the minimum cleaned-word length a word needs to be counted.
    /// @return *this for chaining
    WordCounter& WordCounter::ignoreWordsShorterThanChars(int numOfChars) {
        this->minimumWordLength = numOfChars;
        // The reference must be returned: callers chain further calls on it.
        return *this;
    }

    /// @return a copy of the exported (word, count) entries
    const wordFrequencyVector WordCounter::get() {
        return this->exportData;
    }

}

int main(int argc, char** argv) {
    WordCounter wordCounter("sample.txt");
    wordFrequencyVector wfm = wordCounter
            .ignoreCountsLessThan(100)
            .ignoreWordsShorterThanChars(6)
            .count()
            .sortDescending()
            .get();

    for (wordFrequencyVector::iterator it = wfm.begin(); it != wfm.end(); ++it) {
        std::cout << ' ' << it->first << " " << it->second << std::endl;
    }

    return 0;
}

Recent Posts

codefreeze 2024
Fuzz Testing in Golang
Hyères
Gran Canaria
Solingen
Norderney
Scotland
Træfik
Copenhagen | København
Tenerife
Etretat
Lanzarote