05 diciembre 2017

Testing Python vs C# - Read CSV file

Es inevitable que quiera hacer esto mientras aprendo un lenguaje.

Python C#

No he aplicado ninguna refactorización en Python, pero sí en C#, para intentar reducir el tiempo de ejecución.
Seguro que todavía hay margen de mejora en ambos lenguajes, pero ya había agotado el tiempo razonable que quería dedicar a refactorizar.

Si tenéis sugerencias para reducir el tiempo de ejecución, genial, así aprendemos todos.

En la red podéis encontrar comparaciones más profundas de ambos lenguajes. Aquí una de ellas:

http://onstartups.com/tabid/3339/bid/128/Python-vs-C-Business-and-Technology-Tradeoffs.aspx

Los resultados que obtuve son estos:

Read CSV file - 23865 rows, filter year 2015, sum values from one column

--- Excel
62683287355
62.683.287.355
62.683 million persons

--- Python
Rows after first filter: 274
Rows after second filter: 272

World population in 2015: 62683.287355 million persons
Read CSV file - 23865 rows, filter year 2015, sum values from one column
Execution time: 18.53711571297298ms

--- C#
Rows after first filter: 274
Rows after second filter: 272

World population in 2015: 62683,287355 million persons
Read CSV file - 23865 rows, filter year 2015, sum values from one column
Execution time: 24ms


PYTHON CODE

import pandas as pd
import timeit
import numbers

"""Read CSV file, filter year 2015, sum all results"""

def getDataFrameFromCsv(csvFileName):
    """Load a comma-separated file into a pandas DataFrame.

    csvFileName -- path of the CSV file, passed straight to pandas.read_csv.
    Returns the DataFrame that pandas.read_csv produces.
    """
    return pd.read_csv(csvFileName, sep=',')

def filterColumn(dataFrame, columnName, columnValue):
    """Keep only the rows whose value in columnName equals columnValue.

    Prints how many rows matched and returns the filtered DataFrame.
    """
    matchingRows = dataFrame.loc[dataFrame[columnName] == columnValue]
    rowCount = len(matchingRows)
    print(f"Rows after first filter: {rowCount!s}")
    return matchingRows

def filterIntsInColumn(dataFrame, columnName):
    """Keep only the rows whose value in columnName is greater than or equal to 0.

    Rows for which the comparison is false (negative values, and missing
    values, which never compare >= 0) are dropped. Prints how many rows
    remain and returns the filtered DataFrame.
    """
    validRows = dataFrame.loc[dataFrame[columnName] >= 0]
    rowCount = len(validRows)
    print(f"Rows after second filter: {rowCount!s}")
    return validRows
  
def sumColumn(dataFrame, columnName):
    """Return the sum of the values stored in columnName.

    Uses the vectorised Series.sum instead of the built-in sum, which walks
    the column one Python object at a time. Series.sum also skips missing
    values (NaN) by default, so a single gap no longer turns the whole total
    into NaN.
    """
    return dataFrame[columnName].sum()

def mainProgram():
    """Run the whole pipeline: load the CSV, keep the 2015 rows, drop rows
    without a non-negative population value, sum the population column and
    print the total expressed in millions.
    """
    targetYear = 2015
    sourceFile = 'world-population-1750-2015-and-un-projection-until-2100.csv'
    yearColumn = 'Year'
    populationColumn = 'Medium Projection (UN Population Division (2015 revision)) (people)'

    allRows = getDataFrameFromCsv(sourceFile)
    rowsForYear = filterColumn(allRows, yearColumn, targetYear)
    rowsWithPopulation = filterIntsInColumn(rowsForYear, populationColumn)
    totalPeople = sumColumn(rowsWithPopulation, populationColumn)

    # Scale the raw head count down to millions for display.
    peoplePerMillion = 10**6
    totalInMillions = totalPeople / peoplePerMillion
    print(f"\nWorld population in {targetYear!s}: {totalInMillions!s} million persons")

# Time exactly one run of mainProgram; timeit.timeit returns seconds.
executionTimeInSecs = timeit.timeit(mainProgram, number=1)
executionTimeInMs = executionTimeInSecs * 1000
print("Read CSV file - 23865 rows, filter year 2015, sum values from one column")
print(f"Execution time: {executionTimeInMs!s}ms")



C# CODE

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace SumCsvColumn
{
    /// <summary>
    /// Console benchmark: loads a CSV file, keeps the rows of one year, sums one
    /// numeric column and prints the total together with the elapsed time.
    /// </summary>
    /// <remarks>
    /// Rows are split on every comma, so quoted fields that contain commas are
    /// not supported.
    /// </remarks>
    class Program
    {
        static void Main(string[] args)
        {
            long elapsedMs = MeasureMainProgram();
            PrintResults(elapsedMs);
        }

        /// <summary>Runs <see cref="MainProgram"/> once and returns its duration.</summary>
        /// <returns>Elapsed wall-clock time in whole milliseconds.</returns>
        private static long MeasureMainProgram()
        {
            var watch = System.Diagnostics.Stopwatch.StartNew();

            MainProgram();

            watch.Stop();
            return watch.ElapsedMilliseconds;
        }

        /// <summary>Prints the benchmark description and timing, then waits for a line of input.</summary>
        /// <param name="elapsedMs">Duration to report, in milliseconds.</param>
        private static void PrintResults(long elapsedMs)
        {
            Console.WriteLine("Read CSV file - 23865 rows, filter year 2015, sum values from one column");
            Console.WriteLine($"Execution time: {elapsedMs}ms");
            Console.ReadLine();
        }

        /// <summary>
        /// Loads the CSV, filters it by year, sums the population column and
        /// prints the total expressed in millions.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The header row does not contain one of the required columns.
        /// </exception>
        private static void MainProgram()
        {
            const int year = 2015;
            const string yearColumnName = "Year";
            const string populationColumnName = "Medium Projection (UN Population Division (2015 revision)) (people)";
            const double millionUnits = 1000000d;

            string[] csvContent = LoadCsvFile();
            IEnumerable<string[]> rowsSplitted = SplitRowsIntoColumns(csvContent);

            // An empty file has no header row; fall back to an empty header so
            // the column lookup below reports the problem clearly.
            string[] columnNames = rowsSplitted.FirstOrDefault() ?? new string[0];
            int yearColumnIndex = FindColumnIndex(columnNames, yearColumnName);
            int populationColumnIndex = FindColumnIndex(columnNames, populationColumnName);

            rowsSplitted = FilterColumn(rowsSplitted, yearColumnIndex, year);
            long worldPopulation = SumColumn(rowsSplitted, populationColumnIndex);

            Console.WriteLine($"\nWorld population in {year}: {worldPopulation / millionUnits} million persons");
        }

        /// <summary>Reads the benchmark's CSV file from the working directory.</summary>
        /// <returns>One array element per line of the file.</returns>
        private static string[] LoadCsvFile()
        {
            string fileName = "world-population-1750-2015-and-un-projection-until-2100 - CS.csv";
            return GetArrayFromCsv(fileName);
        }

        /// <summary>Returns the position of a column in the header row.</summary>
        /// <param name="columnNames">Header cells of the CSV file.</param>
        /// <param name="columnName">Exact name of the column to locate.</param>
        /// <exception cref="InvalidDataException">The column is not in the header.</exception>
        private static int FindColumnIndex(string[] columnNames, string columnName)
        {
            int index = Array.IndexOf(columnNames, columnName);
            if (index < 0)
            {
                // Fail here with a clear message instead of letting -1 surface
                // later as an IndexOutOfRangeException while reading a row.
                throw new InvalidDataException("Column '" + columnName + "' was not found in the CSV header.");
            }

            return index;
        }

        /// <summary>Adds up the integer values found in one column.</summary>
        /// <param name="rows">Rows already split into cells.</param>
        /// <param name="columnIndex">Zero-based index of the column to sum.</param>
        /// <returns>The total; cells that are missing or not integers count as 0.</returns>
        private static long SumColumn(IEnumerable<string[]> rows, int columnIndex)
        {
            long sumResult = 0;

            foreach (var row in rows)
            {
                // Rows shorter than the header simply contribute nothing.
                if (columnIndex < row.Length)
                {
                    sumResult += GetNumberFromString(row[columnIndex]);
                }
            }

            return sumResult;
        }

        /// <summary>Parses an integer cell, returning 0 when the text is not a valid integer.</summary>
        private static long GetNumberFromString(string value)
        {
            // CSV content is machine data, so parse it independently of the
            // culture configured on the machine running the benchmark.
            return long.TryParse(
                value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out long longResult)
                ? longResult
                : 0;
        }

        /// <summary>Keeps the rows whose cell at <paramref name="columnIndex"/> equals the given number.</summary>
        /// <param name="rows">Rows already split into cells.</param>
        /// <param name="columnIndex">Zero-based index of the column to compare.</param>
        /// <param name="columnValue">Value to match, compared as text.</param>
        /// <returns>The matching rows; the count is also printed to the console.</returns>
        private static IEnumerable<string[]> FilterColumn(IEnumerable<string[]> rows, int columnIndex, int columnValue)
        {
            var columnValueToString = columnValue.ToString();

            // The length check skips blank or truncated rows instead of crashing on them.
            var filteredResult = rows
                .Where(row => columnIndex < row.Length && row[columnIndex] == columnValueToString)
                .ToList();

            Console.WriteLine("Rows after first filter: " + filteredResult.Count.ToString());
            return filteredResult;
        }

        /// <summary>Splits every line on commas.</summary>
        /// <param name="csvContent">Lines of the CSV file.</param>
        /// <returns>One cell array per input line, in the same order.</returns>
        private static IEnumerable<string[]> SplitRowsIntoColumns(string[] csvContent)
        {
            return csvContent.Select(row => row.Split(',')).ToList();
        }

        /// <summary>Reads a text file and returns its lines.</summary>
        /// <param name="fileName">Path of the file to read.</param>
        /// <returns>The lines of the file without their line terminators.</returns>
        private static String[] GetArrayFromCsv(string fileName)
        {
            // ReadAllLines accepts CR, LF and CRLF terminators. Splitting on
            // '\r' alone left a leading '\n' on every CRLF row and returned a
            // single row for LF-only files.
            return File.ReadAllLines(fileName);
        }
    }
}

2 comentarios:

Python dijo...

Thanks for this wonderful post.
Python Online Training

fcnatra dijo...

You are welcome