mrjob: setup logging on EMR
Out of all the options, the only one that really works is writing to stderr, either directly (sys.stderr.write) or through a logger with a StreamHandler attached to stderr.
The logs can later be retrieved after the job is finished (successfully or with an error) from:
[s3_log_uri]/[jobflow-id]/task-attempts/[job-id]/[attempt-id]/stderr
Be sure to keep the logs in your runners.emr.cleanup
configuration.
Here is an example that gets logging on stderr (Python 3):
from mrjob.job import MRJob
from mrjob.job import MRStep
from mrjob.util import log_to_stream, log_to_null
import re
import sys
import logging

# Module-level logger; set_up_logging() below attaches a handler to the
# '__main__' logger, which is this logger when the file is run as a script.
log = logging.getLogger(__name__)

WORD_RE = re.compile(r'[\w]+')


class MostUsedWords(MRJob):
    """Two-step job that finds the most frequently used word in the input.

    Step 1 counts the occurrences of every lower-cased word; step 2 picks the
    (count, word) pair with the highest count.
    """

    @classmethod
    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        """Configure the 'mrjob' and '__main__' loggers.

        Args:
            quiet: if true, silence both loggers instead of attaching
                stream handlers.
            verbose: forwarded to ``log_to_stream`` as its ``debug`` flag.
            stream: forwarded to ``log_to_stream`` as the target stream.
        """
        if quiet:
            log_to_null(name='mrjob')
            log_to_null(name='__main__')
            return

        log_to_stream(name='mrjob', debug=verbose, stream=stream)
        log_to_stream(name='__main__', debug=verbose, stream=stream)

    def steps(self):
        """Return the two steps: word counting, then max selection."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_get_words,
                   reducer=self.reduce_get_words),
            MRStep(reducer=self.reducer_find_max),
        ]

    def mapper_get_words(self, _, line):
        """Yield ``(word, 1)`` for every lower-cased word found in *line*."""
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_get_words(self, word, counts):
        """Pre-aggregate the partial counts seen for *word*."""
        yield (word, sum(counts))

    def reduce_get_words(self, word, counts):
        """Log the partial counts for *word* and yield ``None, (total, word)``.

        Every result is emitted under the same ``None`` key so that the next
        step's reducer receives all (total, word) pairs together.
        """
        # Materialize once: `counts` may be a one-shot iterator, and building
        # the log message from it first would leave nothing for sum() to add.
        counts = list(counts)
        log.info("%s\t%s", word, counts)
        yield None, (sum(counts), word)

    def reducer_find_max(self, key, value):
        """Yield the largest (count, word) pair from *value*.

        *value* is an iterable of (count, word) pairs. The winning pair is
        yielded as-is, so its two elements serve as the output key and value.
        """
        yield max(value)


if __name__ == '__main__':
    MostUsedWords.run()