diff --git a/Images/oov.png b/Images/oov.png
new file mode 100644
index 0000000..00cb715
Binary files /dev/null and b/Images/oov.png differ
diff --git a/presentation.tex b/presentation.tex
index c614bcd..819d807 100644
--- a/presentation.tex
+++ b/presentation.tex
@@ -9,6 +9,7 @@
 \hypersetup{pdfpagemode=UseNone} % don't show bookmarks on initial view
 
 % font
+\usepackage{bm}
 \usepackage{fontspec}
 \setsansfont{TeX Gyre Heros}
 \setbeamerfont{note page}{family*=pplx,size=\footnotesize} % Palatino for notes
@@ -17,6 +18,11 @@
 % In Mac, unzip it, double-click the .otf files, and install using "FontBook"
 % http://www.gust.org.pl/projects/e-foundry/tex-gyre/heros/qhv2.004otf.zip
 
+% restore a standard LaTeX-like math font
+\usepackage{amsmath}
+\usepackage{unicode-math}
+\setmathfont{Latin Modern Math}
+
 %\newfontfamily\emojifont{Noto Emoji}
 \newcommand{\inlineemoji}[2][1.2em]{%
   \raisebox{-0.2em}{\includegraphics[height=#1]{#2}}%
@@ -39,7 +45,8 @@
 \definecolor{foreground}{RGB}{34,34,34}
 \definecolor{background}{RGB}{255,255,255}
 \definecolor{title}{RGB}{0,82,155}
-\definecolor{gray}{RGB}{110,110,110}
+%\definecolor{gray}{RGB}{110,110,110}
+\definecolor{gray}{RGB}{15,15,15}
 \definecolor{subtitle}{RGB}{0,121,107}
 \definecolor{hilight}{RGB}{0,121,107}
 \definecolor{vhilight}{RGB}{180,0,102}
@@ -58,6 +65,13 @@
 \setbeamerfont{itemize/enumerate subbody}{size=\footnotesize}
 \setbeamerfont{itemize/enumerate subitem}{size=\footnotesize}
 
+% configure itemize spacing
+
+% alas, it breaks itemize styling :(
+%\usepackage{enumitem}
+% second-level itemize (sub-itemize)
+%\setlist[itemize,2]{itemsep=0.7em}
+
 % page number
 \setbeamertemplate{footline}{%
   \raisebox{5pt}{\makebox[\paperwidth]{\hfill\makebox[20pt]{\color{gray}
@@ -71,15 +85,16 @@
 \newcommand{\ei}{\end{itemize}}
 \newcommand{\ig}{\includegraphics}
 \newcommand{\subt}[1]{{\footnotesize \color{subtitle} {#1}}}
+\newcommand{\subtnc}[1]{{\footnotesize #1}}
 
 % title info
 \title{AI-Enhanced High-Accuracy Robotics for Industrial Applications}
-\subtitle{}
+\subtitle{Application for PhD Position (m/f/d) in Industrial Robotics}
 \author{\href{https://abanbytes.eu}{David Madl}}
 %\institute{\href{https://www.biostat.wisc.edu}{Biostatistics \& Medical Informatics} \\[2pt] \href{http://www.wisc.edu}{University of Wisconsin{\textendash}Madison}}
 \date{%\href{http://kbroman.org}{\tt \scriptsize kbroman.org}
 %\\[-4pt]
-\href{https://github.com/kbroman}{\tt \scriptsize github.com/cidermole}
+\href{https://github.com/cidermole}{\tt \scriptsize github.com/cidermole}
 }
@@ -114,4 +129,107 @@
 \end{frame}
+
+\begin{frame}{MSc Thesis}
+\subt{Handling out-of-vocabulary words in a domain adaptation setting in SMT}
+
+%\vspace{12pt}
+
+\begin{itemize}
+  \item{Phrase-based Statistical Machine Translation\vspace{0.5em}}
+  \begin{itemize}
+    \addtolength{\itemsep}{0.7em}
+    \item{Warren Weaver (1947):\\[0.5em]
+      \textit{``This [article in Russian] is really written in English, but it has been coded in some strange symbols. I will now proceed to decode.''}}
+    \item{Bayes' theorem \& independence assumptions:\\[0.5em]
+      $P(\text{en}|\text{ru}) = \frac{P(\text{ru}|\text{en}) P(\text{en})}{P(\text{ru})}$\\[0.5em]
+      $P(\text{ru}|\text{en}) = \prod_{i=1}^{M} P(\text{phrase\_ru}_{i}|\text{en})$ \hspace{0.5em} translation model\\[0.5em]
+      $P(\text{en}) = \prod_{k=1}^{L} P(w_{k}|w_{k-n}\dots w_{k-1})$ \hspace{0.5em} language model\\[0.5em]
+      $P(\text{ru})$ \hspace{0.5em} dropped normalization factor
+    }
+  \end{itemize}
+\end{itemize}
+
+{\tiny see e.g.
+(Koehn et al.\ 2003, ``Statistical phrase-based translation'')}
+
+\note{
+  The rules for the translation model are more complex than shown here, because phrases can be split at different word boundaries.
+
+  The probabilities on the \textbf{right-hand side} are estimated from a training corpus.
+  The language model is an n-gram model, estimated as the transition probabilities of a \textbf{Markov chain} over words.
+  The translation model obtains its phrases from a preceding optimization step called \textbf{Word Alignment}.
+}
+
+\end{frame}
+
+
+\begin{frame}{MSc Thesis -- Domain Adaptation 1}
+\subtnc{Handling out-of-vocabulary words in a domain adaptation setting in SMT}
+
+\vspace{12pt}
+
+\bi
+  \item{{\color{hilight} test} and {\color{vhilight} train} datasets}\\[0.5em]
+  \item{{\color{hilight} medical} and {\color{vhilight} political} domains}\\[0.5em]
+  \item{{\color{hilight} 5\,M} and {\color{vhilight} 50\,M} word tokens}\\[0.5em]
+  \item{Domain adaptation:\\[0.5em]
+    ${\color{hilight} P(\text{en}|\text{ru})} = \frac{\color{vhilight} P(\text{ru}|\text{en}) P(\text{en})}{P(\text{ru})}$\\[0.5em]
+    ${\color{vhilight} P(\text{ru}|\text{en}) = \prod_{i=1}^{M} P(\text{phrase\_ru}_{i}|\text{en})}$ \hspace{0.5em} translation model\\[0.5em]
+    ${\color{vhilight} P(\text{en}) = \prod_{k=1}^{L} P(w_{k}|w_{k-n}\dots w_{k-1})}$ \hspace{0.5em} language model\\[0.5em]
+  }
+\ei
+
+\note{
+  In domain adaptation, there is a distributional mismatch between training and test data.
+  Simply appending target-domain text to a large training dataset is not optimal, because the statistics of the original domain dominate.
+}
+
+\end{frame}
+
+
+\begin{frame}{MSc Thesis -- Domain Adaptation 2}
+\subtnc{Handling out-of-vocabulary words in a domain adaptation setting in SMT}
+
+\vspace{12pt}
+
+\bi
+  \item{{\color{hilight} medical} and {\color{vhilight} political} domains}\\[0.5em]
+  \item{Mixture model:\\[0.5em]
+    $P(\text{ru}|\text{en}) = \alpha_1 {\color{hilight} P_1(\text{ru}|\text{en})} + \alpha_2 {\color{vhilight} P_2(\text{ru}|\text{en})}$\\[0.5em]
+    $P(\text{en}) = \alpha_1 {\color{hilight} P_1(\text{en})} + \alpha_2 {\color{vhilight} P_2(\text{en})}$\\[0.5em]
+  }
+  \item{Optimize quality measure:\\[0.5em]
+    $\operatorname*{arg\,max}_{\symbf{\alpha}} \text{BLEU}(\symbf{\alpha})$ \hspace{0.5em} subject to $\sum_i \alpha_i = 1$}
+\ei
+
+\note{
+  We can do better by estimating two models, one on each domain.
+  Then we optimize the mixture weights for the resulting translation quality.
+}
+
+\end{frame}
+
+
+\begin{frame}{MSc Thesis -- Word Alignment oracle}
+\subt{Handling out-of-vocabulary words in a domain adaptation setting in SMT}
+
+\begin{center}
+\ig[width=0.6\textwidth]{Images/oov.png}
+\end{center}
+
+{\tiny source: Figure 6.1.3b, MSc Thesis}
+
+\note{
+  My assigned thesis topic was to investigate words that could not be translated (out-of-vocabulary words).
+  The oracle experiments are the most insightful. This one shows, for different training set sizes:
+  \begin{itemize}
+    \item in green, the \textbf{theoretical limit} given the training data,
+    \item in red, the errors remaining if \textbf{Word Alignment} had full statistics,
+    \item in blue, the actual errors.
+  \end{itemize}
+}
+
+\end{frame}
+
+
 \end{document}
\ No newline at end of file
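
A note on the first added frame: the ``dropped normalization factor'' bullet leaves one step implicit, namely that $P(\text{ru})$ may be dropped because it is constant over all candidate translations during decoding. Below is a minimal standalone LaTeX sketch of that step (my own illustration, separate from the patch above; the hat notation for the chosen translation is not from the slides):

\documentclass{article}
\usepackage{amsmath}
\begin{document}
% Noisy-channel decoding: P(ru) does not depend on the candidate
% translation "en", so it is constant under the arg max and drops out.
\begin{align*}
  \widehat{\text{en}}
    &= \operatorname*{arg\,max}_{\text{en}} P(\text{en} \mid \text{ru})
     = \operatorname*{arg\,max}_{\text{en}}
       \frac{P(\text{ru} \mid \text{en})\, P(\text{en})}{P(\text{ru})} \\
    &= \operatorname*{arg\,max}_{\text{en}} P(\text{ru} \mid \text{en})\, P(\text{en})
\end{align*}
\end{document}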
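
Similarly, for the mixture-model frame: the slides state only $\sum_i \alpha_i = 1$; adding $\alpha_i \ge 0$ makes the weights a convex combination, which guarantees the interpolated model is again a probability distribution. A small standalone sketch with made-up numbers (illustrative only, not taken from the thesis):

\documentclass{article}
\usepackage{amsmath}
\begin{document}
% Linear interpolation of two domain-specific models. The convex
% weights guarantee the mixture is again a probability distribution.
\[
  P(\text{ru} \mid \text{en})
    = \alpha_1 P_1(\text{ru} \mid \text{en})
    + \alpha_2 P_2(\text{ru} \mid \text{en}),
  \qquad \alpha_1 + \alpha_2 = 1, \; \alpha_i \ge 0
\]
% Made-up example: alpha_1 = 0.7, P_1 = 0.10, P_2 = 0.02 gives
% 0.7 * 0.10 + 0.3 * 0.02 = 0.076.
\end{document}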