% Journal article on measuring OCR accuracy (character, word and significant-word
% level) for a mass newspaper digitization project.
% Cleanup notes:
%   - The former pages field held only the placeholder "N/A" and was dropped so
%     that styles do not print it verbatim.
%   - number: the exported value "78" is read here as the double issue 7/8
%     (month is jul) -- TODO confirm against the journal's issue listing.
%   - Braces in the title protect the acronym and the proper noun from
%     sentence-casing bibliography styles.
@article{be0dade7b21e481ead6346a5a26f8d56,
  author    = {Tanner, Simon and Munoz, Trevor and {Hemy Ros}, Pich},
  title     = {Measuring Mass Text Digitization Quality and Usefulness: Lessons Learned from Assessing the {OCR} Accuracy of the {British} {Library's} 19th Century Online Newspaper Archive},
  journal   = {D-Lib Magazine},
  volume    = {15},
  number    = {7/8},
  year      = {2009},
  month     = jul,
  issn      = {1082-9873},
  publisher = {Corporation for National Research Initiatives},
  language  = {English},
  abstract  = {This article will discuss how to measure the accuracy of Optical Character Recognition (OCR) output in a way that is relevant to the needs of the end users of digital resources. A case study measuring the OCR accuracy of the British Library's 19th Century Newspapers Database provides a clear example of the benefits to be gained from measuring not just character accuracy but also word and significant word accuracy. As OCR primarily facilitates searching, indexing and other means of structuring the user experience of online newspaper archives, measuring the word and significant word accuracy of the OCR output is very revealing of a resource's likely performance for these functions. Having such data is therefore extremely helpful for planning and quality assurance assessment. After briefly discussing the role of OCR in the text capture process and how OCR works, we give a detailed description of the methodology, statistical data gathering techniques and analysis used in this study. Our conclusions point the way forward with suggested actions to assist other mass digitization projects in applying these techniques.},
}