リアルタイムに目の位置推定

2020/06/02

「AI CORE XスターターキットとOpenVINO™ですぐに始めるディープラーニング推論」シリーズの第８回目の記事です。

このシリーズは、「ディープラーニングとは何か」から始まり、「各種ツールの使い方」「プログラミング基礎」「プログラミング応用・実践」までをステップバイステップでじっくり学び、自分で理解してオリジナルのAIアプリケーションが作れるようになることを目指しています。

第８回目はカメラ画像を入力としてリアルタイムに目の位置をディープラーニングで推定します。

リアルタイム顔検出

動画のようなイメージで、リアルタイムに顔検出を行います。

前回の顔検出の入力は静止画でしたが、これをカメラ映像に変更します。USBカメラを接続して、以下のコードをコピペして実行してみて下さい。

#==================================================
# Preparation (準備)
#==================================================
# import
import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IEPlugin

# Select the target device
plugin = IEPlugin(device='MYRIAD')

# Load the model (done once; every frame reuses the same loaded network)
net  = IENetwork(model='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.xml', weights='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.bin')
exec_net = plugin.load(network=net)

# Get the keys of the input/output data
input_blob = next(iter(net.inputs))
out_blob = next(iter(net.outputs))

# Camera setup (カメラ準備)
cap = cv2.VideoCapture(0)

#==================================================
# Main loop (メインループ)
#==================================================
try:
    while True:
        # Exit on any key press (キー押下で終了)
        key = cv2.waitKey(1)
        if key != -1:
            break

        # Read a camera frame (カメラ画像読み込み)
        ret, frame = cap.read()

        # cap.read() reports failure through ret; without this check a failed
        # grab would hand an invalid frame to cv2.resize and crash
        if not ret:
            print('Failed to read a frame from the camera')
            break

        # Convert to the input data format
        img = cv2.resize(frame, (300, 300)) # change Height and Width
        img = img.transpose((2, 0, 1))      # HWC > CHW
        img = np.expand_dims(img, axis=0)   # CHW > BCHW

        # Run inference
        out = exec_net.infer(inputs={input_blob: img})

        # Keep only the output we need
        out = out[out_blob]

        # Drop the unnecessary dimensions
        out = np.squeeze(out)

        # Process every detected face region one by one
        for detection in out:
            # Confidence value
            confidence = float(detection[2])

            # Convert the bounding box coordinates to the scale of the camera frame
            xmin = int(detection[3] * frame.shape[1])
            ymin = int(detection[4] * frame.shape[0])
            xmax = int(detection[5] * frame.shape[1])
            ymax = int(detection[6] * frame.shape[0])

            # Draw the bounding box only when the confidence is above 0.5
            if confidence > 0.5:
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(89, 199, 243), thickness=3)

        # Show the frame
        cv2.imshow('frame', frame)
finally:
    #==================================================
    # Cleanup (終了処理)
    #==================================================
    # Runs even if the loop raised, so the camera and window are always released
    cap.release()
    cv2.destroyAllWindows()

どうでしょうか？カメラを皆さんの顔やウェブ上にある適当な顔画像に向けたりすると、しっかりと顔検出されているかと思います。複数の顔も検出されます。

それではコードの説明です。
全体が分かりやすいように、コメント文を使って「準備」「メインループ」「終了処理」に分けました。前回からの変化点は下記の通りです。

準備にて「カメラ準備」追加
メインループ部は全体をwhile True:ブロック化
メインループの最初に「キー押下で終了」と「カメラ画像読み込み」を追加
終了処理にてcap.releaseを追加

追加項目は全てOpenCV基礎で学んだ内容です。
入力画像はカメラから常に異なる画像が入ってくるのでその都度処理が必要ですが、モデル読み込みは常に同じモデルを使うため、最初の１回だけで良いというのがポイントです。
プログラムを実行すると分かりますが、実は処理時間が長かったのはモデルの読み込みの部分です。推論自体は非常に速く処理されていることが実感できたかと思います。

顔検出後にランドマーク回帰

今度は、動画のようにリアルタイムにカメラ映像入力からランドマーク回帰まで一気に行いたいと思います。

#==================================================
# Preparation (準備)
#==================================================
# import
import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IEPlugin

# Select the target device
plugin = IEPlugin(device='MYRIAD')

# Load the model and get the input/output data keys (face detection)
net_face  = IENetwork(model='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.xml', weights='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.bin')
exec_net_face  = plugin.load(network=net_face)
input_blob_face = next(iter(net_face.inputs))
out_blob_face  = next(iter(net_face.outputs))

# Load the model and get the input/output data keys (landmarks)
net_landmarks = IENetwork(model='intel/landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.xml', weights='intel/landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.bin')
exec_net_landmarks = plugin.load(network=net_landmarks)
input_blob_landmarks = next(iter(net_landmarks.inputs))
out_blob_landmarks = next(iter(net_landmarks.outputs))

# Camera setup (カメラ準備)
cap = cv2.VideoCapture(0)

#==================================================
# Main loop (メインループ)
#==================================================
try:
    while True:
        # Exit on any key press
        key = cv2.waitKey(1)
        if key != -1:
            break

        # Read a camera frame
        ret, frame = cap.read()

        # cap.read() reports failure through ret; without this check a failed
        # grab would hand an invalid frame to cv2.resize and crash
        if not ret:
            print('Failed to read a frame from the camera')
            break

        # Convert to the input data format
        img = cv2.resize(frame, (300, 300)) # change Height and Width
        img = img.transpose((2, 0, 1))      # HWC > CHW
        img = np.expand_dims(img, axis=0)   # CHW > BCHW

        # Run face detection, keep only the needed output and drop the
        # unnecessary dimensions. A dedicated name is used so the landmark
        # output below does not rebind the array being iterated.
        detections = exec_net_face.infer(inputs={input_blob_face: img})
        detections = np.squeeze(detections[out_blob_face])

        # Process every detected face region one by one
        for detection in detections:
            # Confidence value
            confidence = float(detection[2])

            # Convert the bounding box coordinates to the scale of the camera frame
            xmin = int(detection[3] * frame.shape[1])
            ymin = int(detection[4] * frame.shape[0])
            xmax = int(detection[5] * frame.shape[1])
            ymax = int(detection[6] * frame.shape[0])

            # Landmark inference and bounding box only when confidence is above 0.5
            if confidence > 0.5:
                # Clamp the face box to the camera frame. A negative min would be
                # interpreted by the slice as an index from the end of the axis.
                if xmin < 0:
                    xmin = 0
                if ymin < 0:
                    ymin = 0
                if xmax > frame.shape[1]:
                    xmax = frame.shape[1]
                if ymax > frame.shape[0]:
                    ymax = frame.shape[0]

                # Skip degenerate boxes: the crop would be empty (or, for a
                # negative max, taken from the wrong side) and cv2.resize would fail
                if xmax <= xmin or ymax <= ymin:
                    continue

                #--------------------------------------------------
                #  Deep learning landmarks estimation
                #--------------------------------------------------
                # Crop the face region only (before drawing the box, so the
                # box lines never end up inside the crop)
                img_face = frame[ ymin:ymax, xmin:xmax ]

                # Convert to the input data format
                img = cv2.resize(img_face, (48, 48)) # change Height and Width
                img = img.transpose((2, 0, 1))       # HWC > CHW
                img = np.expand_dims(img, axis=0)    # CHW > BCHW

                # Run inference
                out = exec_net_landmarks.infer(inputs={input_blob_landmarks: img})

                # Keep only the output we need
                out = out[out_blob_landmarks]

                # Drop the unnecessary dimensions
                out = np.squeeze(out)

                # Draw a circle at each landmark: values come as (x, y) pairs
                # scaled by the face crop size, then shifted by the crop offset
                for i in range(0, 10, 2):
                    x = int(out[i] * img_face.shape[1]) + xmin
                    y = int(out[i+1] * img_face.shape[0]) + ymin
                    cv2.circle(frame, (x, y), 10, (89, 199, 243), thickness=-1)

                # Draw the bounding box
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(89, 199, 243), thickness=3)

        # Show the frame
        cv2.imshow('frame', frame)
finally:
    #==================================================
    # Cleanup (終了処理)
    #==================================================
    # Runs even if the loop raised, so the camera and window are always released
    cap.release()
    cv2.destroyAllWindows()

コードの解説です。
今回は１つのコードの中にモデルが２種類あるため、変数名に下記文字列を追加して顔検出とランドマーク回帰でそれぞれ分けています。Inference Engineに関わる箇所に適用しています。

_face
_landmarks

カメラ映像から得られた画像frameに対し顔検出を行い、スライスを使って顔のみの画像img_faceを作っています。さらにimg_faceに対してランドマーク回帰を行っているという流れです。
なお、顔領域を切り出した後にバウンディングボックス描画を行います。順番が逆になると切り出す顔の中にバウンディングボックスの線が混じってしまうためです。
また、スライスを使って顔を切り出す際に、カメラ画像の範囲外に座標があるとエラーになってしまうため、事前にカメラ範囲内になるように補正しています。

各ランドマーク位置に円を表示する際に注意する点があります。
正規化された座標から元の座標のスケールに戻す際は、画像全体のframe.shapeではなく、顔部分のみのimg_face.shapeを使うことです。また、全体画像の座標系に位置を合わせるため、最後にそれぞれxmin、yminを加えていることもポイントです。

# Draw a circle at each detected landmark position.
# out holds the landmarks as consecutive (x, y) pairs, so step through it by 2.
for i in range(0, 10, 2):
    # Scale by the face crop size (not the full frame), then add the crop
    # offset (xmin, ymin) to get coordinates in the full frame
    x = int(out[i] * img_face.shape[1]) + xmin
    y = int(out[i+1] * img_face.shape[0]) + ymin
    # thickness=-1 draws a filled circle
    cv2.circle(frame, (x, y), 10, (89, 199, 243), thickness=-1)

簡易サングラス描画

最後に少しオマケですが、リアルタイムに得られる目の座標情報を元にして簡易的なサングラスを描画してみたいと思います。イメージは動画の通りです。

#==================================================
# Preparation (準備)
#==================================================
# import
import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IEPlugin

# Select the target device
plugin = IEPlugin(device='MYRIAD')

# Load the model and get the input/output data keys (face detection)
net_face  = IENetwork(model='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.xml', weights='intel/face-detection-retail-0005/FP16/face-detection-retail-0005.bin')
exec_net_face  = plugin.load(network=net_face)
input_blob_face = next(iter(net_face.inputs))
out_blob_face  = next(iter(net_face.outputs))

# Load the model and get the input/output data keys (landmarks)
net_landmarks = IENetwork(model='intel/landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.xml', weights='intel/landmarks-regression-retail-0009/FP16/landmarks-regression-retail-0009.bin')
exec_net_landmarks = plugin.load(network=net_landmarks)
input_blob_landmarks = next(iter(net_landmarks.inputs))
out_blob_landmarks = next(iter(net_landmarks.outputs))

# Camera setup (カメラ準備)
cap = cv2.VideoCapture(0)

#==================================================
# Main loop (メインループ)
#==================================================
try:
    while True:
        # Exit on any key press
        key = cv2.waitKey(1)
        if key != -1:
            break

        # Read a camera frame
        ret, frame = cap.read()

        # cap.read() reports failure through ret; without this check a failed
        # grab would hand an invalid frame to cv2.resize and crash
        if not ret:
            print('Failed to read a frame from the camera')
            break

        # Convert to the input data format
        img = cv2.resize(frame, (300, 300)) # change Height and Width
        img = img.transpose((2, 0, 1))      # HWC > CHW
        img = np.expand_dims(img, axis=0)   # CHW > BCHW

        # Run face detection, keep only the needed output and drop the
        # unnecessary dimensions. A dedicated name is used so the landmark
        # output below does not rebind the array being iterated.
        detections = exec_net_face.infer(inputs={input_blob_face: img})
        detections = np.squeeze(detections[out_blob_face])

        # Process every detected face region one by one
        for detection in detections:
            # Confidence value
            confidence = float(detection[2])

            # Convert the bounding box coordinates to the scale of the camera frame
            xmin = int(detection[3] * frame.shape[1])
            ymin = int(detection[4] * frame.shape[0])
            xmax = int(detection[5] * frame.shape[1])
            ymax = int(detection[6] * frame.shape[0])

            # Landmark inference and drawing only when confidence is above 0.5
            if confidence > 0.5:
                # Clamp the face box to the camera frame. A negative min would be
                # interpreted by the slice as an index from the end of the axis.
                if xmin < 0:
                    xmin = 0
                if ymin < 0:
                    ymin = 0
                if xmax > frame.shape[1]:
                    xmax = frame.shape[1]
                if ymax > frame.shape[0]:
                    ymax = frame.shape[0]

                # Skip degenerate boxes: the crop would be empty (or, for a
                # negative max, taken from the wrong side) and cv2.resize would fail
                if xmax <= xmin or ymax <= ymin:
                    continue

                #--------------------------------------------------
                #  Deep learning landmarks estimation
                #--------------------------------------------------
                # Crop the face region only
                img_face = frame[ ymin:ymax, xmin:xmax ]

                # Convert to the input data format
                img = cv2.resize(img_face, (48, 48)) # change Height and Width
                img = img.transpose((2, 0, 1))       # HWC > CHW
                img = np.expand_dims(img, axis=0)    # CHW > BCHW

                # Run inference
                out = exec_net_landmarks.infer(inputs={input_blob_landmarks: img})

                # Keep only the output we need
                out = out[out_blob_landmarks]

                # Drop the unnecessary dimensions
                out = np.squeeze(out)

                # Scale the eye coordinates by the face crop size, then add the
                # crop offset to get positions in the full frame
                eye_left_x = int(out[0] * img_face.shape[1]) + xmin
                eye_left_y = int(out[1] * img_face.shape[0]) + ymin
                eye_right_x = int(out[2] * img_face.shape[1]) + xmin
                eye_right_y = int(out[3] * img_face.shape[0]) + ymin

                # Draw at the eye positions: filled black circles whose radius is
                # 1/6 of the face box width, joined by a line for the bridge
                r = int((xmax - xmin) / 6)
                cv2.circle(frame, (eye_left_x, eye_left_y), r, (0, 0, 0), thickness=-1)
                cv2.circle(frame, (eye_right_x, eye_right_y), r, (0, 0, 0), thickness=-1)
                cv2.line(frame, (eye_left_x, eye_left_y), (eye_right_x, eye_right_y), (0, 0, 0), thickness=3)

                # Bounding box drawing (intentionally disabled in this example)
                #cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(89, 199, 243), thickness=3)

        # Show the frame
        cv2.imshow('frame', frame)
finally:
    #==================================================
    # Cleanup (終了処理)
    #==================================================
    # Runs even if the loop raised, so the camera and window are always released
    cap.release()
    cv2.destroyAllWindows()

コードの変化点は以下の部分です
今回は、ランドマーク５個のうちの両目の位置座標だけを活用しています

# Scale the eye coordinates by the face crop size, then add the crop offset
# (xmin, ymin) to get positions in the full frame
eye_left_x = int(out[0] * img_face.shape[1]) + xmin
eye_left_y = int(out[1] * img_face.shape[0]) + ymin
eye_right_x = int(out[2] * img_face.shape[1]) + xmin
eye_right_y = int(out[3] * img_face.shape[0]) + ymin
 
# Draw at the eye positions: filled black circles (thickness=-1) whose radius
# is 1/6 of the face box width, joined by a line of fixed thickness 3
r = int((xmax - xmin) / 6)
cv2.circle(frame, (eye_left_x, eye_left_y), r, (0, 0, 0), thickness=-1)
cv2.circle(frame, (eye_right_x, eye_right_y), r, (0, 0, 0), thickness=-1)
cv2.line(frame, (eye_left_x, eye_left_y), (eye_right_x, eye_right_y), (0, 0, 0), thickness=3)
 
# Bounding box drawing (intentionally disabled in this example)
#cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(89, 199, 243), thickness=3)

内容は次の通りです

それぞれの目の座標位置に半径rの黒い円を描画
rは一定値ではなく、顔の大きさの1/6になるように設定
メガネのフレームをイメージして、目と目の間に線分を描画※
バウンディングボックスは非表示

※メガネのフレーム（ブリッジ部）の太さは簡易的に3で一定とした

課題

図形だと少し違和感があるので、実際には画像を使うべきです。
今回は図形の円を使用したので、角度に関しては考慮する必要がありませんでした。一方、画像を用いる場合は、角度や大きさだけでなく、どの位置に表示させるのかもしっかり決める必要がありそうです。

実際の課題解決は10回目の「バーチャル試着アプリ」で取り上げます。

余力があれば、次回に進む前にメガネ画像や帽子画像表示にチャレンジしてみるのも良いかと思います。画像はこちらのページなどから入手可能です。
次回は、バーチャル試着アプリ時に必要となるオリジナルツールの作成を行います。

以上、「リアルタイムに目の位置推定」でした。

リアルタイムに目の位置推定

目次

リアルタイム顔検出

顔検出後にランドマーク回帰

簡易サングラス描画

課題