Comparing 2 CSV files

First of all, I downloaded a CSV of countries from here: https://gist.githubusercontent.com/marijn/396531/raw/89280409c2bd0a514c4346375f91349fff723520/countries.csv

Then I turned it into a set:

def country_to_set(csv_):
    """Return the set of normalized country names found in a countries file.

    The file at ``csv_`` is read in full and split on commas. For each
    resulting piece only the text after the last ``|`` is kept, double
    quotes are removed, surrounding whitespace is stripped, and the result
    is lowercased.

    Args:
        csv_: Path of the countries file to read.

    Returns:
        A set of normalized name strings.
    """
    # Open the file the caller asked for rather than a hard-coded name.
    with open(csv_, 'r') as f:
        pieces = f.read().split(",")
    # NOTE: splitting on every comma also splits a quoted name that itself
    # contains a comma, so such a name ends up as several separate entries.
    names = (piece.split("|")[-1] for piece in pieces)
    return set(name.replace('"', '').strip().lower() for name in names)

Then I made a function that extracts the words from HTML using regular expressions:

def words_from_html(html):
    """Return the set of lowercased alphabetic words found in an HTML string.

    Tags (anything matching ``<...>``) are removed first, then the remaining
    text is split on runs of characters that are not ASCII letters.

    Args:
        html: The HTML markup as a string.

    Returns:
        A set of lowercase words; empty when the text contains no letters.
    """
    # Imported here so the function does not rely on a module-level import.
    import re

    # Remove all the HTML tags
    txt = re.sub(r'<[^>]+>', '', html)
    # Split words on every run of non-letters. A second '^' inside the
    # character class would be a literal caret, not a negation, so it is
    # deliberately left out.
    words = re.split(r'[^A-Za-z]+', txt)
    # Convert to lowercase, skipping the empty strings that re.split yields
    # when the text starts or ends with a non-letter.
    return set(word.lower() for word in words if word)

Then I defined a main function:

def main():
    """Print the country names that appear as words on a Wikipedia page.

    Downloads the "List of ongoing armed conflicts" article, reduces it to a
    set of lowercase words with ``words_from_html``, loads the country names
    from ``countries.csv`` with ``country_to_set``, and prints the
    intersection of the two sets.
    """
    # Python 2 only: urllib2 does not exist on Python 3.
    import contextlib
    import urllib2

    url = "http://en.wikipedia.org/wiki/List_of_ongoing_armed_conflicts"

    # closing() guarantees the HTTP response is closed even if read() fails;
    # urllib2 responses are not context managers themselves.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        html = response.read()

    wiki_word_set = words_from_html(html)

    country_set = country_to_set('countries.csv')

    print(country_set.intersection(wiki_word_set))

Then, when I did:

>>>main()
set(['pakistan', 'angola', 'uganda', 'myanmar', 'mexico', 'tunisia', 'lebanon', 'azerbaijan', 'djibouti', 'congo', 'mozambique', 'colombia', 'burundi', 'niger', 'occupied', 'turkey', 'afghanistan', 'qatar', 'bangladesh', 'ethiopia', 'sudan', 'france', 'egypt', 'somalia', 'peru', 'nigeria', 'cameroon', 'canada', 'israel', 'iran', 'algeria', 'india', 'china', 'armenia', 'thailand', 'iraq', 'ukraine', 'eritrea', 'oman', 'philippines', 'indonesia', 'chad', 'mali', 'yemen'])
/r/learnpython Thread Parent