SizheAn / PanoHead

Code Repository for CVPR 2023 Paper "PanoHead: Geometry-Aware 3D Full-Head Synthesis in 360°"


How to process K-Hairstyle images?

c-he opened this issue · comments


Dear authors,

Thanks for sharing such great work on 3D avatar synthesis! Recently I have been trying to train PanoHead on other datasets, and I'm wondering how you processed the back-head images sampled from the K-Hairstyle dataset. For example, you mention using WHENet for head pose estimation, but it only produces the 3D rotation of the head, so how do you obtain the 3D translation and convert it to EG3D's pose format? Besides, how do you crop and align these images, given that they have no facial landmarks? I would be super grateful if you could share some scripts for processing these back-head images. Thanks!


I will try to clean up the scripts and share with you soon!


Huge thanks for your help!

Can you try the following code? This file should be put under the WHENet root dir. Make sure you install WHENet properly first.

import numpy as np
import cv2
import os
import argparse
import pickle
import json

from tqdm import tqdm
from math import cos, sin
from whenet import WHENet
from yolo_v3.yolo_postprocess import YOLO
from PIL import Image

'''
Estimate head poses for K-Hairstyle back-head images with YOLO + WHENet, and
export EG3D-style camera labels (dataset.json) plus crop quads (quads.pkl).
'''

def angle2matrix(angles):
    ''' Get a rotation matrix from three rotation angles (degrees), right-handed.
    Args:
        angles: [3,]. [yaw, pitch, roll] in degrees.
            yaw: positive for looking left.
            pitch: positive for looking up.
            roll: positive for tilting the head right.
    Returns:
        R: [3, 3]. rotation matrix.
    '''
    # map [yaw, pitch, roll] to rotations about x (pitch), y (-yaw), z (roll)
    x, y, z = np.deg2rad(angles[1]), -np.deg2rad(angles[0]), np.deg2rad(angles[2])
    # x
    Rx=np.array([[1,      0,       0],
                 [0, cos(x),  -sin(x)],
                 [0, sin(x),   cos(x)]])
    # y
    Ry=np.array([[ cos(y), 0, sin(y)],
                 [      0, 1,      0],
                 [-sin(y), 0, cos(y)]])
    # z
    Rz=np.array([[cos(z), -sin(z), 0],
                 [sin(z),  cos(z), 0],
                 [     0,       0, 1]])
    
    R=Rz.dot(Ry.dot(Rx))
    return R.astype(np.float32)


def eg3dcamparams(R_in):
    '''
    Input a flattened 4x4 world2cam rotation matrix, output a 25-dim label (16-dim cam2world extrinsics + 9-dim intrinsics)
    '''
    camera_dist = 2.7
    intrinsics = np.array([[4.2647, 0, 0.5], [0, 4.2647, 0.5], [0, 0, 1]])
    # assume inputs are rotation matrices for world2cam projection
    R = np.array(R_in).astype(np.float32).reshape(4,4)
    # add camera translation
    t = np.eye(4, dtype=np.float32)
    t[2, 3] = - camera_dist

    # convert to OpenCV camera
    convert = np.array([
        [1, 0, 0, 0],
        [0, -1, 0, 0],
        [0, 0, -1, 0],
        [0, 0, 0, 1],
    ]).astype(np.float32)

    # world2cam -> cam2world
    P = convert @ t @ R
    cam2world = np.linalg.inv(P)

    # add intrinsics
    label_new = np.concatenate([cam2world.reshape(16), intrinsics.reshape(9)], -1)
    return label_new




def process_detection(model, img, bbox, hori_label, args):

    y_min, x_min, y_max, x_max = bbox
    # enlarge the bbox to include more background margin
    y_min = max(0, y_min - abs(y_min - y_max) / 10)
    y_max = min(img.shape[0], y_max + abs(y_min - y_max) / 10)
    x_min = max(0, x_min - abs(x_min - x_max) / 5)
    x_max = min(img.shape[1], x_max + abs(x_min - x_max) / 5)
    x_max = min(x_max, img.shape[1])

    x_diff = x_max - x_min
    y_diff = y_max - y_min

    edge = (x_diff+y_diff)/2 * 0.85
    x_c, y_c = x_min + x_diff/2, y_min + y_diff/2
    x_min, x_max, y_min, y_max = max(0, x_c - edge/2), x_c + edge/2, max(0, y_c - edge/2), y_c + edge/2

    quad = np.stack([[x_min,y_min],[x_min,y_max],[x_max,y_max],[x_max,y_min]]).astype(np.float32)


    img_rgb = img[int(y_min):int(y_max), int(x_min):int(x_max)]
    img_rgb = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2RGB)
    img_rgb = cv2.resize(img_rgb, (224, 224))

    img_rgb = np.expand_dims(img_rgb, axis=0)



    cv2.rectangle(img, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (0,0,0), 2)
    yaw, pitch, roll = model.get_angle(img_rgb)
    yaw, pitch, roll = np.squeeze([yaw, pitch, roll])

    # convert the K-Hairstyle horizontal label from [0, 360) to (-180, 180]
    if hori_label > 180 and hori_label <= 360:
        hori_label = hori_label - 360

    # use the K-Hairstyle horizontal annotation as the yaw (WHENet's yaw estimate is
    # discarded) and flip the sign of WHENet's pitch to match angle2matrix's convention
    yaw = hori_label
    pitch = -pitch
    
    # add small uniform random noise to the yaw (skipped for near-frontal views)
    if abs(yaw) > 10:
        yaw += np.random.uniform(-6,6)
        yaw = float(np.clip(yaw, -180, 180))

    t3d = np.array([0, 0, 0])  # no translation is estimated; eg3dcamparams adds the fixed camera distance

    R = angle2matrix(np.squeeze([yaw, pitch, roll]))
    P = np.concatenate([R,t3d[:,None]],1)
    P = np.concatenate([P, np.array([[0,0,0,1.]])],0)

    return P, quad



def main(args):
    in_dir = 'datasets/khair_uniquesub'
    image_dir = in_dir + '/image'
    whenet = WHENet(snapshot=args.snapshot)
    yolo = YOLO(**vars(args))

    
    annotations = pickle.load(open(os.path.join(in_dir, 'khair_annot_unique.pkl'), 'rb'))

    results_meta = {}
    results_quad = {}
    # get a path list of all images
    path_list = os.listdir(image_dir)

    for idx, image_name in enumerate(tqdm(path_list)):
        # if idx == 2:
        #     break
        image_path = os.path.join(image_dir, image_name)
        img = cv2.imread(image_path)
        if img is None:
            continue
        img_pil = Image.fromarray(img)
        bboxes, scores, classes = yolo.detect(img_pil)
        hori_label = int(annotations[annotations['path'] == image_name]['horizontal'].values)
        # skip images with no detection or multiple detections (bad detections / unrelated images)
        if len(bboxes) > 1 or len(bboxes) == 0:
            continue

        # estimate the head pose and crop quad for the single detected head
        for bbox in bboxes:
            P, quad = process_detection(whenet, img, bbox, hori_label, args)

        results_meta[image_path] = eg3dcamparams(P.flatten())
        results_quad[image_path] = quad
    

    # Save meta data
    results_new = []
    for img, P  in results_meta.items():
        img = os.path.basename(img)
        res = [format(r, '.6f') for r in P]
        results_new.append((img,res))
    with open(os.path.join(in_dir,'dataset.json'), 'w') as outfile:
        json.dump({"labels": results_new}, outfile, indent="\t")
    
    # Save quads
    print("results:", len(results_quad))
    with open(os.path.join(in_dir,'quads.pkl'), 'wb') as f:
        pickle.dump(results_quad, f)



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='whenet demo with yolo')
    parser.add_argument('--snapshot', type=str, default='WHENet.h5', help='whenet snapshot path')
    parser.add_argument('--display', type=str, default='full', help='display all euler angle (simple, full)')
    parser.add_argument('--score', type=float, default=0.9, help='yolo confidence score threshold')
    parser.add_argument('--iou', type=float, default=0.3, help='yolo iou threshold')
    parser.add_argument('--gpu', type=str, default='0,1', help='gpu')
    parser.add_argument('--output', type=str, default='test.avi', help='output video name')
    args = parser.parse_args()
    main(args)

We first save the khair annotation to a .pkl file (not included in this script; it's easy pre-processing from Excel to pkl). Then we take its horizontal annotation as the yaw and use WHENet to estimate pitch and roll. The output dataset.json will be used for the final camera pose labels. We also get quads.pkl, which is used to crop the images again so that their scale is consistent with the frontal images. For that re-cropping we need the following crop_final function:

def crop_final(
    img,
    size=512,
    quad=None,
    top_expand=0.1,
    left_expand=0.05,
    bottom_expand=0.0,
    right_expand=0.05,
    blur_kernel=None,
    borderMode=cv2.BORDER_REFLECT,
    upsample=2,
    min_size=256,
):
    orig_size = min(np.linalg.norm(quad[1] - quad[0]), np.linalg.norm(quad[2] - quad[1]))
    if min_size is not None and orig_size < min_size:
        return None

    crop_w = int(size * (1 + left_expand + right_expand))
    crop_h = int(size * (1 + top_expand + bottom_expand))
    crop_size = (crop_w, crop_h)

    top = int(size * top_expand)
    left = int(size * left_expand)
    size -= 1
    bound = np.array([[left, top], [left, top + size], [left + size, top + size], [left + size, top]],
                     dtype=np.float32)

    mat = cv2.getAffineTransform(quad[:3], bound[:3])
    if upsample is None or upsample == 1:
        crop_img = cv2.warpAffine(np.array(img), mat, crop_size, flags=cv2.INTER_LANCZOS4, borderMode=borderMode)
    else:
        assert isinstance(upsample, int)
        crop_size_large = (crop_w * upsample, crop_h * upsample)
        crop_img = cv2.warpAffine(np.array(img), upsample * mat, crop_size_large, flags=cv2.INTER_LANCZOS4, borderMode=borderMode)
        crop_img = cv2.resize(crop_img, crop_size, interpolation=cv2.INTER_AREA)

    empty = np.ones_like(img) * 255
    crop_mask = cv2.warpAffine(empty, mat, crop_size)

    if True:
        mask_kernel = int(size * 0.02) * 2 + 1
        blur_kernel = int(size * 0.03) * 2 + 1 if blur_kernel is None else blur_kernel
        downsample_size = (crop_w // 8, crop_h // 8)

        if crop_mask.mean() < 255:
            blur_mask = cv2.blur(crop_mask.astype(np.float32).mean(2), (mask_kernel, mask_kernel)) / 255.0
            blur_mask = blur_mask[..., np.newaxis]
            blurred_img = cv2.blur(crop_img, (blur_kernel, blur_kernel), 0)
            crop_img = crop_img * blur_mask + blurred_img * (1 - blur_mask)
            crop_img = crop_img.astype(np.uint8)

    return crop_img
We use this function together with quads.pkl to re-crop the khair images for training.
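A minimal sketch of how that re-cropping step might be driven, assuming quads.pkl was produced by the script above, crop_final is in scope, and the sketch is run from the same directory so the relative paths resolve (the output directory name is just an assumption):

import os
import pickle

import cv2
import numpy as np

in_dir = 'datasets/khair_uniquesub'
out_dir = os.path.join(in_dir, 'image_cropped')  # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)

# quads.pkl maps each original image path to its 4x2 crop quad
with open(os.path.join(in_dir, 'quads.pkl'), 'rb') as f:
    quads = pickle.load(f)

for image_path, quad in quads.items():
    img = cv2.imread(image_path)
    if img is None:
        continue
    crop = crop_final(img, size=512, quad=np.array(quad, dtype=np.float32))
    if crop is None:  # crop_final returns None when the quad is smaller than min_size
        continue
    cv2.imwrite(os.path.join(out_dir, os.path.basename(image_path)), crop)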

I'm sorry this is really a huge mess. Since I am no longer with the company, this is the best script I could find. Let me know if you have any issues and I will try my best to answer them. And I will be super grateful if you find that it works and are able to make an easier version for others.
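In case it helps, the annotation .pkl loaded by the script above (khair_annot_unique.pkl) only needs to behave like a pandas DataFrame with at least a 'path' column (image filename) and a 'horizontal' column (the K-Hairstyle horizontal angle in degrees), since the script indexes it as annotations[annotations['path'] == image_name]['horizontal']. A minimal sketch, assuming the labels have already been exported to a spreadsheet (the file name and source column names below are hypothetical):

import pickle

import pandas as pd

# Hypothetical spreadsheet with one row per image; adapt the column names to
# however the K-Hairstyle labels were exported (Excel, CSV, or parsed JSON).
df = pd.read_excel('khair_labels.xlsx')
df = df.rename(columns={'image_name': 'path', 'hor_angle': 'horizontal'})
df = df[['path', 'horizontal']].drop_duplicates('path')

with open('datasets/khair_uniquesub/khair_annot_unique.pkl', 'wb') as f:
    pickle.dump(df, f)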


Dear authors,

Thanks for providing such a great script! I'm able to run it and get the cropped images and camera parameters. However, since my other data are processed with EG3D's pipeline, I'm wondering whether the cropped back-head images are consistent with the ones processed by EG3D.

(attached example images: 00000 and CP032677-016)

For example, looking at the two images I attached, I personally feel the head in these back-head images looks larger than in the FFHQ images. Please correct me if I'm wrong. If they are inconsistent, could you show me how to align and crop them consistently with EG3D?

Besides, I manually set all the expand parameters in crop_final() to 0, as otherwise the output size will be larger than 512x512. Will this adjustment cause any problems?

Last but not least, I would like to double-check whether the cropping operation causes any misalignment with the camera parameters, since they are estimated from the uncropped images.

Glad to see you got it running without issues.

  1. Yeah, we process the FFHQ dataset using https://github.com/SizheAn/PanoHead/blob/17ad915941c7e2703d5aa3eb5ff12eac47c90e53/3DDFA_V2_cropping/recrop_images.py instead of the original EG3D pipeline; that's why it looks a bit different.
  2. crop_final()'s expand parameters should stay the way they are, as those are the best values I found to make the frontal and back images consistent. The current parameters make the photo size 563x563, if I recall correctly. That's fine: we will use https://github.com/SizheAn/PanoHead/blob/17ad915941c7e2703d5aa3eb5ff12eac47c90e53/dataset_tool_seg.py to zip them, and there we can specify 512x512 as the size.
  3. In terms of extrinsics, the rotation shouldn't change, but the translation will likely change after cropping. However, we have the camera pose self-adaptation module in PanoHead, so the model should be able to learn the translation offset and correct it.
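For reference, the 25-dim labels written to dataset.json follow the format produced by eg3dcamparams above (16-dim cam2world extrinsics + 9-dim normalized intrinsics), so you can unpack them to inspect the rotation and translation separately; a quick sketch:

import json
import os

import numpy as np

in_dir = 'datasets/khair_uniquesub'
with open(os.path.join(in_dir, 'dataset.json')) as f:
    labels = json.load(f)['labels']  # list of (image_name, 25-dim label) pairs

name, label = labels[0]
label = np.array(label, dtype=np.float32)
cam2world = label[:16].reshape(4, 4)   # extrinsics (camera-to-world)
intrinsics = label[16:].reshape(3, 3)  # normalized intrinsics

R = cam2world[:3, :3]  # rotation: from the horizontal label / WHENet, unaffected by cropping
t = cam2world[:3, 3]   # camera position: cropping effectively offsets this; the pose self-adaptation module compensates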

I see, thanks for your quick reply! If I want to align these K-Hairstyle images with the scale produced by EG3D, do you think it's doable by manually tuning these expand parameters in crop_final()? I personally don't want to reprocess those FFHQ images as it's pretty slow on my side :(


I think it is doable, though it might be really hard to find suitable parameters. FFHQ processing roughly centers the image on the 'nose', whereas our processing centers the image on the 'head centroid'. It's easy to change on the FFHQ side, since it has clear facial landmarks, but if you want to change the khair side, I don't think there is a one-size-fits-all set of expand parameters that can do it (how would you find the imaginary semantic 'nose' in back-of-head images?). Currently we can do this for back images because we assume YOLO's bounding box finds the head centroid, and then we make some minor adjustments in the process_detection function in the script I uploaded.

But these are just my guesses. Probably you can find better solutions :) Good luck and let me know!
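Concretely, the 'head centroid' crop mentioned above corresponds to the box handling at the top of process_detection in the script earlier in this thread; here is the same logic pulled out as a small standalone helper for clarity (the function name is made up):

import numpy as np

def head_centroid_quad(bbox, img_h, img_w):
    '''Turn a YOLO head box (y_min, x_min, y_max, x_max) into a square crop quad
    centered on the padded box center, which we treat as the head centroid.'''
    y_min, x_min, y_max, x_max = bbox
    # pad the box: roughly 10% of its height on top/bottom, 20% of its width on left/right
    y_min = max(0, y_min - abs(y_min - y_max) / 10)
    y_max = min(img_h, y_max + abs(y_min - y_max) / 10)
    x_min = max(0, x_min - abs(x_min - x_max) / 5)
    x_max = min(img_w, x_max + abs(x_min - x_max) / 5)
    # square crop whose edge is 85% of the mean padded side, centered on the padded box
    x_diff, y_diff = x_max - x_min, y_max - y_min
    edge = (x_diff + y_diff) / 2 * 0.85
    x_c, y_c = x_min + x_diff / 2, y_min + y_diff / 2
    x_min, x_max = max(0, x_c - edge / 2), x_c + edge / 2
    y_min, y_max = max(0, y_c - edge / 2), y_c + edge / 2
    return np.stack([[x_min, y_min], [x_min, y_max],
                     [x_max, y_max], [x_max, y_min]]).astype(np.float32)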

We first save the khair annotation to a .pkl file (not included in this script, easy pre-processing from Excel to pkl).

I want to ask where the Excel file is. I can only find the JSON file for labels.