No he aplicado nada de refactorización en Python, pero sí en C# para intentar ahorrar tiempo.
Seguro que todavía habrá margen de mejora en ambos lenguajes, pero ya había agotado el tiempo razonable que quería dedicar a refactorizar.
Si tenéis sugerencias para reducir el tiempo de ejecución, genial, así aprendemos todos.
En la red podéis encontrar comparaciones más profundas de ambos lenguajes. Aquí dos de ellas:
http://onstartups.com/tabid/3339/bid/128/Python-vs-C-Business-and-Technology-Tradeoffs.aspx
Los resultados que obtuve son estos:
Read CSV file - 23865 rows, filter year 2015, sum values from one column
--- Excel
62683287355
62.683.287.355
62.683 million persons
--- Python
Rows after first filter: 274
Rows after second filter: 272
World population in 2015: 62683.287355 million persons
Read CSV file - 23865 rows, filter year 2015, sum values from one column
Execution time: 18.53711571297298ms
--- C#
Rows after first filter: 274
Rows after second filter: 272
World population in 2015: 62683,287355 million persons
Read CSV file - 23865 rows, filter year 2015, sum values from one column
Execution time: 24ms
PYTHON CODE
import pandas as pd
import timeit
import numbers
"""Read CSV file, filter year 2015, sum all results"""
def getDataFrameFromCsv(csvFileName):
    """Load a comma-separated file into a pandas DataFrame.

    Args:
        csvFileName: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` using ',' as the separator.
    """
    return pd.read_csv(csvFileName, sep=',')
def filterColumn(dataFrame, columnName, columnValue):
    """Keep only the rows whose ``columnName`` entry equals ``columnValue``.

    Prints the number of rows that survive the filter.

    Args:
        dataFrame: DataFrame to filter.
        columnName: Name of the column to compare.
        columnValue: Value a row must hold in that column to be kept.

    Returns:
        A DataFrame containing only the matching rows.
    """
    rowMatches = dataFrame[columnName] == columnValue
    matchingRows = dataFrame.loc[rowMatches]
    print("Rows after first filter: ", len(matchingRows), sep="")
    return matchingRows
def filterIntsInColumn(dataFrame, columnName):
    """Keep only the rows whose ``columnName`` entry is greater than or equal to zero.

    Missing values (NaN) fail the comparison and are therefore dropped as well.
    Prints the number of rows that survive the filter.

    Args:
        dataFrame: DataFrame to filter.
        columnName: Name of the numeric column to test.

    Returns:
        A DataFrame containing only the rows that passed the test.
    """
    isNonNegative = dataFrame[columnName].ge(0)
    keptRows = dataFrame.loc[isNonNegative]
    print("Rows after second filter: ", len(keptRows), sep="")
    return keptRows
def sumColumn(dataFrame, columnName):
    """Return the sum of the values in one column.

    Uses the vectorised ``Series.sum`` instead of the builtin ``sum``, which
    would iterate the column element by element in Python. With pandas'
    defaults, missing values (NaN) are skipped rather than turning the whole
    total into NaN, and an empty column sums to 0.

    Args:
        dataFrame: DataFrame holding the column.
        columnName: Name of the numeric column to add up.

    Returns:
        The column total as a numeric scalar.
    """
    return dataFrame[columnName].sum()
def mainProgram():
    """Run the whole pipeline: load the CSV, keep the 2015 rows, sum the population column.

    Prints the total expressed in millions of persons; the two filter helpers
    print their own row counts along the way.
    """
    fileName = 'world-population-1750-2015-and-un-projection-until-2100.csv'
    yearColumnName = 'Year'
    populationColumnName = 'Medium Projection (UN Population Division (2015 revision)) (people)'
    year = 2015
    millionUnits = 10**6

    rowsForYear = filterColumn(getDataFrameFromCsv(fileName), yearColumnName, year)
    rowsWithPopulation = filterIntsInColumn(rowsForYear, populationColumnName)
    worldPopulation = sumColumn(rowsWithPopulation, populationColumnName)

    populationInMillions = worldPopulation / millionUnits
    print("\nWorld population in " + str(year) + ": " + str(populationInMillions) + " million persons")
# Time a single run of the pipeline and report it in milliseconds.
executionTimeInSecs = timeit.Timer(mainProgram).timeit(number=1)
print("Read CSV file - 23865 rows, filter year 2015, sum values from one column")
print("Execution time: ", executionTimeInSecs * 1000, "ms", sep="")
C# CODE
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace SumCsvColumn
{
    /// <summary>
    /// Console benchmark: loads a CSV file, keeps the rows of one year, sums a numeric
    /// column and prints both the total and the elapsed time.
    /// </summary>
    class Program
    {
        private const string CsvFileName = "world-population-1750-2015-and-un-projection-until-2100 - CS.csv";
        private const double MillionUnits = 1000000;

        static void Main(string[] args)
        {
            long elapsedMs = MeasureMainProgram();
            PrintResults(elapsedMs);
        }

        /// <summary>Runs <see cref="MainProgram"/> once and measures it with a stopwatch.</summary>
        /// <returns>Elapsed wall-clock time in milliseconds.</returns>
        private static long MeasureMainProgram()
        {
            var watch = System.Diagnostics.Stopwatch.StartNew();
            MainProgram();
            watch.Stop();
            return watch.ElapsedMilliseconds;
        }

        /// <summary>Prints the benchmark description and timing, then waits for Enter.</summary>
        /// <param name="elapsedMs">Elapsed time in milliseconds to report.</param>
        private static void PrintResults(long elapsedMs)
        {
            Console.WriteLine("Read CSV file - 23865 rows, filter year 2015, sum values from one column");
            Console.WriteLine($"Execution time: {elapsedMs}ms");

            // Blocks until the user presses Enter so the console window stays open.
            Console.ReadLine();
        }

        /// <summary>
        /// Loads the CSV, locates the year and population columns from the header row,
        /// keeps the rows for 2015 and prints the summed population in millions.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The file has no header row, or one of the expected columns is missing.
        /// </exception>
        private static void MainProgram()
        {
            const int year = 2015;
            const string yearColumnName = "Year";
            const string populationColumnName = "Medium Projection (UN Population Division (2015 revision)) (people)";

            string[] csvContent = LoadCsvFile();
            IEnumerable<string[]> rowsSplitted = SplitRowsIntoColumns(csvContent);

            string[] columnNames = rowsSplitted.FirstOrDefault()
                ?? throw new InvalidOperationException("The CSV file is empty; no header row was found.");

            int yearColumnIndex = FindColumnIndex(columnNames, yearColumnName);
            int populationColumnIndex = FindColumnIndex(columnNames, populationColumnName);

            // The header row is not data, so it is excluded before filtering.
            IEnumerable<string[]> rowsForYear = FilterColumn(rowsSplitted.Skip(1), yearColumnIndex, year);
            long worldPopulation = SumColumn(rowsForYear, populationColumnIndex);

            Console.WriteLine($"\nWorld population in {year}: {worldPopulation / MillionUnits} million persons");
        }

        /// <summary>Returns the index of <paramref name="columnName"/> in the header row.</summary>
        /// <exception cref="InvalidOperationException">The column is not present in the header.</exception>
        private static int FindColumnIndex(string[] columnNames, string columnName)
        {
            int index = Array.IndexOf(columnNames, columnName);
            if (index < 0)
            {
                // Without this check the -1 would be used as an array index further down.
                throw new InvalidOperationException($"Column '{columnName}' was not found in the CSV header.");
            }

            return index;
        }

        /// <summary>Reads the benchmark CSV file and returns its lines.</summary>
        private static string[] LoadCsvFile()
        {
            return GetArrayFromCsv(CsvFileName);
        }

        /// <summary>
        /// Adds up the values of one column. Cells that are not valid integers, and rows too
        /// short to contain the column, contribute zero.
        /// </summary>
        /// <param name="rows">Rows already split into columns.</param>
        /// <param name="columnIndex">Zero-based index of the column to sum.</param>
        /// <returns>The column total.</returns>
        private static long SumColumn(IEnumerable<string[]> rows, int columnIndex)
        {
            long sumResult = 0;
            foreach (var row in rows)
            {
                if (row.Length > columnIndex)
                {
                    sumResult += GetNumberFromString(row[columnIndex]);
                }
            }

            return sumResult;
        }

        /// <summary>Parses an integer cell, returning 0 when the text is not a valid integer.</summary>
        private static long GetNumberFromString(string value)
        {
            // The CSV is machine-readable data, so parsing must not depend on the user's culture.
            return long.TryParse(
                value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out long longResult)
                ? longResult
                : 0;
        }

        /// <summary>
        /// Keeps the rows whose cell at <paramref name="columnIndex"/> equals
        /// <paramref name="columnValue"/> and prints how many rows were kept.
        /// Rows too short to contain the column are dropped.
        /// </summary>
        private static IEnumerable<string[]> FilterColumn(IEnumerable<string[]> rows, int columnIndex, int columnValue)
        {
            string expected = columnValue.ToString(System.Globalization.CultureInfo.InvariantCulture);
            List<string[]> filteredResult = rows
                .Where(row => row.Length > columnIndex && row[columnIndex] == expected)
                .ToList();

            Console.WriteLine($"Rows after first filter: {filteredResult.Count}");
            return filteredResult;
        }

        /// <summary>Splits every line on commas.</summary>
        /// <remarks>
        /// NOTE(review): this is a plain split and does not understand quoted fields that
        /// contain commas; confirm the input file has none, or use a real CSV parser.
        /// </remarks>
        private static IEnumerable<string[]> SplitRowsIntoColumns(string[] csvContent)
        {
            return csvContent.Select(row => row.Split(',')).ToList();
        }

        /// <summary>Reads the file and returns one array element per line.</summary>
        /// <param name="fileName">Path of the CSV file to read.</param>
        private static String[] GetArrayFromCsv(string fileName)
        {
            // ReadAllLines handles \r\n, \n and \r endings. Splitting the raw text on '\r'
            // left a leading '\n' on every row after the first and did not split
            // LF-only files at all.
            return File.ReadAllLines(fileName);
        }
    }
}
2 comentarios:
Thanks for this wonderful post.
Python Online Training
You are welcome
Publicar un comentario