
mrjob: setup logging on EMR


Out of all the options, the only ones that really work are writing to stderr directly (sys.stderr.write) or using a logger with a StreamHandler pointing to stderr.
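
As a minimal sketch of both approaches (the logger name, format, and messages are only illustrative):

import sys
import logging

# Option 1: write to stderr directly from inside a step
sys.stderr.write("processing started\n")

# Option 2: a logger with an explicit StreamHandler on stderr
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
log.addHandler(handler)
log.setLevel(logging.INFO)
log.info("this ends up in the task attempt's stderr file")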

The logs can later be retrieved after the job is finished (successfully or with an error) from:

[s3_log_uri]/[jobflow-id]/task-attempts/[job-id]/[attempt-id]/stderr
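
If you want to fetch those files programmatically, a rough sketch with boto3 could look like the following (the bucket name and prefix are placeholders you'd substitute from your own s3_log_uri and job IDs, and note that EMR may store the files gzipped as stderr.gz):

import boto3

s3 = boto3.client('s3')
bucket = 'my-emr-logs'                           # placeholder: your log bucket
prefix = 'logs/j-XXXXXXXXXXXXX/task-attempts/'   # placeholder: jobflow-id path

# List every object under the task-attempts prefix and download the stderr files
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    for obj in page.get('Contents', []):
        key = obj['Key']
        if key.endswith('stderr') or key.endswith('stderr.gz'):
            s3.download_file(bucket, key, key.replace('/', '_'))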

Be sure your runners.emr.cleanup configuration keeps the logs around, i.e. does not remove them once the job finishes.
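
For example, in your mrjob.conf (cleanup: NONE is the most conservative setting, keeping everything after the run):

runners:
  emr:
    cleanup: NONE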


Here is an example (Python 3) that gets logging working via stderr:

from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.util import log_to_stream
import re
import logging

log = logging.getLogger(__name__)

WORD_RE = re.compile(r'[\w]+')


class MostUsedWords(MRJob):

    @classmethod
    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        # Route mrjob's and this module's log records to the given stream
        # (stderr by default), so they end up in the task attempt's stderr
        # file on EMR.
        log_to_stream(name='mrjob', debug=verbose, stream=stream)
        log_to_stream(name='__main__', debug=verbose, stream=stream)

    def steps(self):
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_get_words,
                   reducer=self.reducer_get_words),
            MRStep(reducer=self.reducer_find_max)
        ]

    def mapper_get_words(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_get_words(self, word, counts):
        yield (word, sum(counts))

    def reducer_get_words(self, word, counts):
        # Materialize the generator before logging: passing it straight to
        # list() would exhaust it and make sum(counts) return 0.
        counts = list(counts)
        log.info(word + "\t" + str(counts))
        yield None, (sum(counts), word)

    def reducer_find_max(self, key, values):
        # values is a stream of (count, word) pairs
        yield max(values)


if __name__ == '__main__':
    MostUsedWords.run()
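
Assuming the file is saved as most_used_words.py (the name is arbitrary) and the input path is a placeholder for your own bucket, the job can then be launched on EMR with mrjob's -r switch:

python most_used_words.py -r emr s3://your-bucket/input/ > output.txt

After the job finishes, the log.info output should show up under the stderr path described above.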