1
0
Fork 0

Compare commits

...

276 Commits

Author SHA1 Message Date
Hendrik Leppkes dbdb2a3274
LAV: remove experimental flag from the vvc decoder 2024-03-24 10:29:58 +01:00
Hendrik Leppkes f32d8ef90f
matroskadec_haali: migrate to new side data functions 2024-03-24 10:29:58 +01:00
Hendrik Leppkes b76243085f
Restore channel count/layout update flags
Partial revert of 65ddc74988
2024-03-24 10:29:58 +01:00
Hendrik Leppkes fe4b16c6f7
avcodec/av1: don't indicate success without returning a frame
Potentially fixes an assert in generic code down the line.
2024-03-24 10:29:58 +01:00
Hendrik Leppkes 0f0e68dc33
matroskadec_haali: parse track dispositions 2024-03-24 10:29:58 +01:00
Hendrik Leppkes 77b61d2ed9
mov: parse track-based udta name tags 2024-03-24 10:29:58 +01:00
Hendrik Leppkes 8eac2d5d93
dca: detect dts:x extensions in dts hd-hra tracks 2024-03-24 10:29:58 +01:00
Hendrik Leppkes 2e9688b4ff
HACK avformat: restore old framerate logic
The new logic produces inconsistent results for field-based codecs, the
old logic has been proven to work for years, so just stick to it.
2024-03-24 10:29:57 +01:00
Hendrik Leppkes 08e2f07124
matroskadec_haali: fix stereo3d metadata parsing 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 26858fd0d5
swresample: account for clip protection when determining the mixing matrix maximum 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 3f4f63d29f
avformat/ftp: send CLNT before enabling UTF8
This fixes UTF-8 filenames on some peculiar FTP servers.
2024-03-24 10:29:57 +01:00
Hendrik Leppkes 53dbf9935f
MatroskaParser: check more allocations 2024-03-24 10:29:57 +01:00
Hendrik Leppkes b1391a04cc
matroskadec_haali: fix stream indexing in mkv_read_seek 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 2045f4ada4
matroskadec_haali: update for API changes 2024-03-24 10:29:57 +01:00
Hendrik Leppkes d6977905ee
matroskadec_haali: remove av1 extradata offset 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 503d77f1c7
matroskadec_haali: parse BlockAdditionalMappings 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 6419dc9fab
matroskadec_haali: fix use of AVIOContext 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 475c2c3e30
HACK avformat: add LAV accessors to some internal data 2024-03-24 10:29:57 +01:00
Hendrik Leppkes de1b375fa6
spdifenc: mark as NOFILE 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 44913ef34f
matroskadec_haali: update for ff changes 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 4c6c2f8726
matroskadec_haali: use avpriv_update_cur_dts 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 201b389fe6
matroskadec_haali: Support WebVTT parsing 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 66756af9f0
MatroskaParser: parse block additional data
This is limited to frames without lacing, and only one block additional
per block
2024-03-24 10:29:57 +01:00
Hendrik Leppkes 9ab2603dde
matroskadec_haali: Support Projection parsing 2024-03-24 10:29:57 +01:00
Hendrik Leppkes 4a0d8267df
MatroskaParser: improve duration retrieval with excessive subtitle cues at the end of the file 2024-03-24 10:29:56 +01:00
Hendrik Leppkes ad84edf4bc
hls: fix seeking to timestamps earlier than the playlist start
Instead of failing, seek to the start of the playlist.
2024-03-24 10:29:56 +01:00
Hendrik Leppkes 3c798bdfd4
hevcdec: scale HDRPlus window coordinates 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 4562c4e9b1
HACK: avcodec/utils: don't error out on missing channel info, the decoders are not fixed for this yet 2024-03-24 10:29:56 +01:00
Hendrik Leppkes b6fd8e399b
MatroskaParser: more gracefully fail on files with more than 64 tracks
Instead of playback failing, excess tracks are skipped
2024-03-24 10:29:56 +01:00
Hendrik Leppkes 746a3c89ad
dxva2/av1: add compat header 2024-03-24 10:29:56 +01:00
Hendrik Leppkes db71045cc5
Update matroskadec_haali for API changes 2024-03-24 10:29:56 +01:00
Hendrik Leppkes b9ad463648
h264dec: update x264_build in the user-facing context 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 4bbe67c894
matroskadec_haali: fix seeking in video files with only audio cues 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 3545294e80
isom: no extradata in a mp4 descriptor is not an error 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 61d6b18c30
LAV: Add .clang-format file to disable formatting 2024-03-24 10:29:56 +01:00
Hendrik Leppkes ddb1366654
matroskadec_haali: work-around broken files with audio tracks without keyframe flags 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 245e560669
matroska: add new standard font mimetypes 2024-03-24 10:29:56 +01:00
Hendrik Leppkes 869b532875
MatroskaParser: cleaner way of ensuring all data blocks are freed at all times 2024-03-24 10:29:56 +01:00
Hendrik Leppkes b177e70bea
matroskadec_haali: empty queues on close
The queues can in some situations still contain data that's not properly
freed elsewhere, so clear them on close to ensure no memory blocks are
being leaked.
2024-03-24 10:29:56 +01:00
Hendrik Leppkes 7ef23aff41
matroskadec_haali: Free the data block in case of read errors
Read errors can typically happen at EOF, or when stopping playback
prematurely, making this case more common than one might expect.
2024-03-24 10:29:56 +01:00
Hendrik Leppkes 3a8f0334ce
utvideodec: disable cached bitstream reader to fix decoding of packed streams
The cached reader is broken when using get_bits_le, disable it until
fixed
2024-03-24 10:29:55 +01:00
Hendrik Leppkes f8f0bef96d
Revert "avcodec/vp9: Check in decode_tiles() if there is data remaining"
This reverts commit 78862488f8.

This change can disrupt the VP9 decoder's ability to recover from badly
cut streams or other cases of missing references when using frame
threading.
2024-03-24 10:29:55 +01:00
Hendrik Leppkes 0a0c80dcc5
matroskadec_haali: fix end_time adjustment on fluid ordered chapter transitions
The end_time should not get messed with if we're doing a fluid
(non-seek) transition, as this can commonly include subtitles that go
beyond the chapter boundary
2024-03-24 10:29:55 +01:00
Hendrik Leppkes 2a8fedcc13
matroskadec_haali: fix av1 extradata 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 936bb15ef4
hwaccel: allow ad-hoc setup mode for d3d11 (hack) 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 977b555a73
matroskadec_haali: FF_INPUT_BUFFER_PADDING_SIZE -> AV_INPUT_BUFFER_PADDING_SIZE 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 4f239e3665
hevc: don't pass dummy references to the hwaccel
Instead let the hwaccel deal with generating missing references. This
improves error resilience and works around a few bugs.
2024-03-24 10:29:55 +01:00
Hendrik Leppkes cd07cb6289
dxva2: support using D3D11 hwaccel with the old API 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 5c9b2b7692
Add config.out to gitignore 2024-03-24 10:29:55 +01:00
Hendrik Leppkes afb0d52014
avcodec/extract_extradata_bsf: support extracting extradata from h264 mvc 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 984165c7d2
flvdec: assume sizes to be broken if metadata header parsing failed 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 47ecb05468
matroskadec_haali: export container level content light level data 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 29aa1cc5ba
avcodec/extract_extradata_bsf: keep SEI messages in extradata 2024-03-24 10:29:55 +01:00
Hendrik Leppkes e21e89e107
aacdec: track pce directly instead of overriding chan_config
Modifying chan_config results in LATM decoding re-initializing every frame.
2024-03-24 10:29:55 +01:00
Hendrik Leppkes 4f2e5c3ecb
MatroskaParser: reformat seeking function a bit for better readability 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 5747913ccb
matroskadec_haali: Export colorspace info in the mastering data struct 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 9839520a6d
matroskadec_haali: don't create stereo3d side data for 2d tracks 2024-03-24 10:29:55 +01:00
Hendrik Leppkes 1da0d139d5
matroskadec_haali: parse color and mastering info 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 38777dd600
matroskadec_haali: fix a few variable name collisions 2024-03-24 10:29:54 +01:00
Hendrik Leppkes c7d9d71dc7
matroskadec_haali: fix crash with tags on disabled streams 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 04fb92041c
matroskadec_haali: migrate to codecpar 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 061b216237
s302m: passthrough non-pcm by default and export non-pcm mode as codec tag 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 85b9bdf565
matroskadec_haali: basic tag parsing support (global and track) 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 986f7ce15b
matroskadec_haali: update QuickTime codec handling 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 4c4ea2c907
matroska: change values of the stereo mode tags to be more consistent 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 308e904d30
hevc: disable intrapred intrinsics on msvc
Those two don't like each other very much.
2024-03-24 10:29:54 +01:00
Hendrik Leppkes 85ea63f105
matroskadec_haali: export stereo_mode metadata 2024-03-24 10:29:54 +01:00
Hendrik Leppkes ab7ec464a5
h264_parser: add support for parsing h264 mvc NALUs 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 07c731dfe1
avcodec: add h264_mvc codec id and profiles 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 0be8f348fe
avformat/httpauth: change order of qop parameters
This fixes a bug when interacting with some (broken) HTTP servers which
expect the last entry to be a quoted string.

Also remove the quoting of the qop parameter in the process, it violates
the specification.
2024-03-24 10:29:54 +01:00
Hendrik Leppkes 97dc135a31
Add compat header for DXVA2 VP8/9 2024-03-24 10:29:54 +01:00
Hendrik Leppkes 38e9c16e9c
matroskadec_haali: enable full parsing for MP3
This fixes decoding of split mp3 frames in mkv, and matches the behavior
of the ffmpeg matroska demuxer.
2024-03-24 10:29:54 +01:00
Hendrik Leppkes b1e369e4ab
asfdec: don't subtract the pre-roll from index entries
Empiric analysis of a variety of samples has concluded that the index
entries are written without the preroll.
2024-03-24 10:29:53 +01:00
Hendrik Leppkes 5b650f2db3
rtpdec_asf: support the asf_o demuxer 2024-03-24 10:29:53 +01:00
Hendrik Leppkes 0df26f1d56
mpeg12dec: don't assert on unknown chroma format
The chroma format can be still unset in postinit when a badly cut stream
starts with a slice instead of a sequence header. This is a common
occurrence when feeding avcodec from a Live TV stream.
2024-03-24 10:29:53 +01:00
Hendrik Leppkes 5c54c741c7
matroskadec_haali: use 64-bit duration field instead of convergence duration 2024-03-24 10:29:53 +01:00
Hendrik Leppkes 6a60a42b05
matroskadec_haali: fix compile warnings 2024-03-24 10:29:53 +01:00
Hendrik Leppkes 6bd25b0f33
mpegts: do not use teletext streams as wrap reference
teletext streams are often badly interleaved and/or have broken
timestamps, using them as wrap reference can result in seeking breaking
and wrap detection causing rather wrong timestamps.
2024-03-24 10:29:53 +01:00
Hendrik Leppkes ccf7fddaf1
matroskadec_haali: support multiple CueTrackPositions in one CuePoint 2024-03-24 10:29:53 +01:00
Hendrik Leppkes 4d12d20d45
matroskadec_haali: search more aggressively for a duration of incomplete files 2024-03-24 10:29:53 +01:00
Hendrik Leppkes 1a5d39444f
matroskadec_haali: skip subtitle cues when seeking a video stream
The subtitle Cues are useless when seeking a video stream, so simply ignore them.
2024-03-24 10:29:53 +01:00
onomatopellan b4772db714
MatroskaParser: show pre-existing subtitles when seeking
Based on a patch by John Peebles <johnpeeb@gmail.com>

Signed-off-by: Hendrik Leppkes <h.leppkes@gmail.com>
2024-03-24 10:29:53 +01:00
Hendrik Leppkes 3a77255b72
HACK flic: Support seeking to pts = 0 for looped playback 2024-03-24 10:29:53 +01:00
Hendrik Leppkes fc528ef7d3
dxva2_hevc: add compat header for mingw 2024-03-24 10:29:53 +01:00
James Almer 10eb64774f
x86/hevc: correctly mark intrapred functions as SSE4
No SSE4.2 intrinsics are being used
2024-03-24 10:29:53 +01:00
Hendrik Leppkes d788839dba
hevc: don't use deprecated YUVJ pixel formats 2024-03-24 10:29:53 +01:00
Hendrik Leppkes c9056e3d9a
mov: fix isom.h compat with c++ 2024-03-24 10:29:53 +01:00
Hendrik Leppkes c449e239bf
mpeg12dec: properly handle a codec_id change 2024-03-24 10:29:52 +01:00
Hendrik Leppkes da062205ce
Revert "avcodec/mpeg12dec: do not trust AVCodecContext input dimensions"
This reverts commit 77f1199e8f.

This change had no positive effect on LAV, and only caused troubles with DXVA2 decoding.
2024-03-24 10:29:52 +01:00
Hendrik Leppkes 81d297fcf7
matroskadec_haali: fix timeline with negative timestamps at the beginning of a segment
This can happen if a segment starts with a recovery point but not an IDR.
2024-03-24 10:29:52 +01:00
Hendrik Leppkes e1da7568ed
matroskadec_haali: fix ass style on segmented mkvs 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 8625df5f4c
matroskadec_haali: omit block additions instead of emitting them as a packet 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 2b70ddf527
x86/hevc: use DECLARE_ALIGNED for on-stack tmp arrays to avoid crashes 2024-03-24 10:29:52 +01:00
Hendrik Leppkes a962a35b9e
h264_parser: force grabbing a new timestamp until a frame start was found 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 2083312c86
hevc: port intra pred SIMD from OpenHEVC 2024-03-24 10:29:52 +01:00
Hendrik Leppkes d0a01f17e9
hevc: port intrinsic SSE2 IDCT from OpenHEVC 2024-03-24 10:29:52 +01:00
Hendrik Leppkes b0aa39db76
matroskadec_haali: print MatroskaParser error messages 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 81b18172da
matroskadec_haali: relax cluster and frame size checks
Otherwise files with huge frames (uncompressed/ProRes/...) can't play properly at 4K
2024-03-24 10:29:52 +01:00
Hendrik Leppkes 7d7a221ee5
matroskadec_haali: parse bit depth from BITMAPINFOHEADER 2024-03-24 10:29:52 +01:00
Hendrik Leppkes eb154ce630
matroskadec_haali: ensure chapter end time is valid 2024-03-24 10:29:52 +01:00
Hendrik Leppkes d14a1a2485
HACK: udp: allow circular receive buffer with w32threads 2024-03-24 10:29:52 +01:00
Carl Eugen Hoyos 47a231b1cc
riff: add ProRes FourCCs 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 3ffdbea264
matroskadec_haali: support V_PRORES 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 342a342560
matroskadec_haali: support demuxing QDM2/SVQ3 as written by mkvtoolnix 2024-03-24 10:29:52 +01:00
Hendrik Leppkes 2a5913447c
avformat: try harder to find a duration of the media
This helps finding a duration in files with a lot of garbage data at the
end, increasing the lookup area from 4MB to 32MB.

There should be no speed loss on files which previously worked fine, since
it'll stop looking once it found a duration.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 31e8a26d6b
HACK mov: don't set the DEFAULT disposition because of wrong semantics
The meaning of the DEFAULT flag and movs enabled flag don't match and
cause playback issues.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 1fa7be70bc
matroskadec_haali: export mkv codec id as metadata 2024-03-24 10:29:51 +01:00
Hendrik Leppkes 3a03f34d63
HACK Revert "h264: free the tables and uninitialize the context on flush"
This reverts commit 9eda9d3322.
This commit causes the flush inside the DXVA2 init to crash, so we cannot do this here.

The fix isn't required for our use-case anyway, since LAV always does a hard-flush when seeking H.264.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes ff7350ac22
h264: don't use deprecated YUVJ pixel formats
A change in pixel format cause a full decoder re-init, which can lead to dropped frames.
Instead, rely on the color_range, as it's supposed to be handled anyway.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 0326fea70b
matroskadec_haali: don't allocate CodecPrivate on the stack
The stack size is out of the control of our DLL, and usually defaults to only 1MB.
Instead, allocate it from the heap in a global buffer.

Fixes stack overflow on samples with huge CodecPrivate (ASS files with > 1MB CodecPrivate)
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 786bde00a6
matroskadec_haali: support all specified StereoMode values 2024-03-24 10:29:51 +01:00
Hendrik Leppkes ce3e4f613c
vc1: wait for bitstream recovery before starting decoding 2024-03-24 10:29:51 +01:00
Hendrik Leppkes c7bb5370cf
matroskadec_haali: implement support for DiscardPadding 2024-03-24 10:29:51 +01:00
Hendrik Leppkes 695bbf33f3
matroskadec_haali: fix invalid buffer free when playing wavpack or dvb subtitles 2024-03-24 10:29:51 +01:00
Hendrik Leppkes 0a2a75e085
matroskadec_haali: parse CodecDelay and SeekPreRoll 2024-03-24 10:29:51 +01:00
Hendrik Leppkes fcf1105b40
h264: increase MAX_SLICES to 256
This was recommended by Jason on IRC a while ago.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 17fb405223
HACK: hevc: ignore invalid extradata 2024-03-24 10:29:51 +01:00
Hendrik Leppkes 7da908791b
riff: add HEVC tags 2024-03-24 10:29:51 +01:00
Hendrik Leppkes 68b07f53ca
matroskadec_haali: support HEVC demuxing 2024-03-24 10:29:51 +01:00
Hendrik Leppkes dcf15661f2
Revert "avutil: Cast AV_NOPTS_VALUE to int64_t explicitly"
This reverts commit cb3591e697.

This change is pointless and fixes building for MSVC.
2024-03-24 10:29:51 +01:00
Hendrik Leppkes 2813208f6d
matroskadec_haali: Improve TTA duration calculation 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 5a4a424c66
vorbiscomment: allow using vorbiscomment parser without input context 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 5030c62ce3
matroskadec_haali: fix dvb streams without the full dvb start code 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 67bad9bbc0
matroskadec_haali: export full wavpack blocks 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 288b816111
avformat/file: use sequential file access 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 301d733685
HACK: avcodec: export progressive_sequence information 2024-03-24 10:29:50 +01:00
Javier Cabezas a2afa9cb61
Fraps: output repeat frames instead of discarding them
Signed-off-by: Javier Cabezas <jcabgz@gmail.com>
Signed-off-by: Hendrik Leppkes <h.leppkes@gmail.com>
2024-03-24 10:29:50 +01:00
Hendrik Leppkes 436d8cdbd8
riff: don't try to parse too small WAVEFORMAT headers 2024-03-24 10:29:50 +01:00
Hendrik Leppkes f801f9f8c2
h264: set h->sps if none is set yet 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 0a520ddae8
swscale: make sws_getCachedContext properly handle pre-processed formats 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 6830266861
matroskadec_haali: optimize fuzzy CueAware seeking
This allows finding Cues that would match the same frame as the requested timecode,
and allow for more accurate seeking on ordered chapter boundaries.
2024-03-24 10:29:50 +01:00
Hendrik Leppkes b9e8e96f4e
flac: export vorbiscomment metadata in extradata
It can contain special metadata that informs us about the desired channel
layout for non-standard layouts, which means it's crucial for correct
playback.
2024-03-24 10:29:50 +01:00
Hendrik Leppkes deb07a272a
matroskadec_haali: defer parsing of the full MKV header until required
This allows quickly scanning the file header for the segment UID instead
of parsing the full headers for this information.
2024-03-24 10:29:50 +01:00
Hendrik Leppkes a73ab4b2e4
matroskadec_haali: optimize opening of external segments 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 8f98b3586e
HACK avio: add a flag to skip the fstat on file opening for performance reasons 2024-03-24 10:29:50 +01:00
Hendrik Leppkes 7c674b2a59
matroskadec_haali: protect against unknown tracks causing a crash 2024-03-24 10:29:50 +01:00
Hendrik Leppkes e39c5d5c9e
matroskadec_haali: added an option to suppress opening external segments 2024-03-24 10:29:49 +01:00
Hendrik Leppkes a33ac8b9ea
HACK: matroskadec_haali: extract the SSA extradata from the main segment
This works around a short-coming in dynamic format switching of the common subtitle renderers.
2024-03-24 10:29:49 +01:00
Hendrik Leppkes 2f14b484b2
matroskadec_haali: process attachments of all segments 2024-03-24 10:29:49 +01:00
Hendrik Leppkes 76d6fc987e
matroskadec_haali: send new extradata and stream params on segment changes 2024-03-24 10:29:49 +01:00
Hendrik Leppkes a59883db70
matroskadec_haali: support changing compression settings in different segments 2024-03-24 10:29:49 +01:00
Hendrik Leppkes ed35fe13f1
matroskadec_haali: support Next/Prev file linking 2024-03-24 10:29:49 +01:00
Hendrik Leppkes 84a8cdf791
matroskadec_haali: make cue-aware seeking more lenient 2024-03-24 10:29:49 +01:00
Hendrik Leppkes 981d492032
matroskadec_haali: add public api for interacting with editions 2024-03-24 10:29:49 +01:00
Hendrik Leppkes 4560e6416f
matroskadec_haali: support for linked segments in ordered chapters 2024-03-24 10:29:49 +01:00
Hendrik Leppkes a381e5a421
matroskadec_haali: support ordered chapters/virtual timelines 2024-03-24 10:29:49 +01:00
Hendrik Leppkes a500e80a4a
matroskadec_haali: add logging of the edition/chapter structure 2024-03-24 10:29:49 +01:00
Hendrik Leppkes 4f2ec9b99d
matroskadec_haali: obey the Enabled and Hidden flags for tracks and chapters 2024-03-24 10:29:49 +01:00
Hendrik Leppkes b46ea5319c
matroskadec_haali: refactor/simplify for edition support
- creating IOstreams
- Cue-aware seeking
- Cleanup
2024-03-24 10:29:49 +01:00
Hendrik Leppkes f2347d9ad8
matroskadec_haali: avoid memcpys when the packet data can be reused as-is. 2024-03-24 10:29:49 +01:00
Hendrik Leppkes f95b40648f
matroskadec_haali: only use default values for DisplayWidth/Height if appropriate.
The default values are only valid when the DisplayUnit is 0 (pixels), and all other
values require explicit coding of the DisplayWidth/Height values.
2024-03-24 10:29:49 +01:00
Hendrik Leppkes 2d4e0910d2
HACK: store dvd palette info for user-app use 2024-03-24 10:29:49 +01:00
Hendrik Leppkes adaf1d0d69
aac: 7.1 should have side channels, and not 2 pairs of front channels. 2024-03-24 10:29:48 +01:00
Hendrik Leppkes f05ebe7750
swresample: add a "clipping protection" mode when mixing
This mode normalizes audio down to stay below 1.0 after mixing, so no clipping occurs when converting to integer formats.
2024-03-24 10:29:48 +01:00
Hendrik Leppkes 77b07a84c7
matroskadec_haali: parse new CueDuration and CueRelativePosition elements
Parse the new elements into the Cue structure, but they remain unused in the Matroska Parser itself.
2024-03-24 10:29:48 +01:00
Hendrik Leppkes 16ff5d966f
HACK: mov: when seeking secondary streams, don't insist on key-frames. 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 6133830d4a
HACK: preserve mpeg2 END_OF_SEQ code to the start of the next frame to cause a implicit decoder flush 2024-03-24 10:29:48 +01:00
Hendrik Leppkes aa29716779
HACK: make the image2pipe demuxer work more like we want it to. 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 3d94bd5a9f
HACK: dvdsubdec: try to improve default dvd subtitle color guessing 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 94b22f32b4
avidec: force probing of subtitle streams 2024-03-24 10:29:48 +01:00
Hendrik Leppkes f02b354572
h264: flag interlaced-coded frames as such when sei_pic_struct indicates otherwise
According to the H.264 standard, Annex D, Table D-1, field_pic_flag shall be 0 when sei_pic_struct signals progressive. In the case when this restriction is not fulfilled, it's safer to assume the field is actually interlaced.
Not doing so can cause a renderer to assume the frame is progressive and not deinterlace it, even if required.
2024-03-24 10:29:48 +01:00
Hendrik Leppkes 2332e9c1d5
configure: don't use pkgconfig for libopus or libspeex (HACK) 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 5310a77ee4
mpegvideo: null out current_picture_ptr when allocation failed
This avoids using an uninitialized picture in further decode calls.
2024-03-24 10:29:48 +01:00
Hendrik Leppkes d54744910b
matroskadec_haali: Only accept a "sane" DefaultDuration for FPS.
A lot of files have their DefaultDuration set to a calculated
value which is not directly tied to the FPS, and because Matroska
actually doesn't specify the DefaultDuration to be tied to the FPS,
this is perfectly legal.

So, before assuming the DefaultDuration is the FPS, check if the value
actually makes sense. The most basic check here is that the FPS is
below 125fps, because any more really doesn't make sense.

If the FPS from the header is not set, avformat will probe the FPS
based on the file's timestamps, which will give a much better result.
2024-03-24 10:29:48 +01:00
Hendrik Leppkes 694af43a77
file: increase io buffer for better performance on high latency resources 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 106568ff4a
Add Matroska demuxer based on Haalis MatroskaParser 2024-03-24 10:29:48 +01:00
Hendrik Leppkes 0a43301e95
build: remove slashes from the last lines of x86 build files
This unbreaks compilation in mingw with CRLF line endings
2024-03-24 10:29:48 +01:00
Hendrik Leppkes f536070d4d
mpegts: switch back to mpegts_get_pcr 2024-03-24 10:29:48 +01:00
Hendrik Leppkes a94b6aa6cf
HACK: Don't return -1 in avformat_find_stream_info when codec details are still unknown. 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 98247d2d82
HACK: Add a function to manually add streams to the MPEG-TS demuxer
This is used by the BDMV demuxer when some streams are not detected by auto-probing
2024-03-24 10:29:47 +01:00
Hendrik Leppkes 7e07a38fe8
HACK: disable mpegts crc checks for pat/pmt/... 2024-03-24 10:29:47 +01:00
Hendrik Leppkes d9955ce11b
Silently error out when pos_limit exceeds pos_max instead of aborting the process. 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 34a33b5144
Export avcodec functions used in LAV 2024-03-24 10:29:47 +01:00
Hendrik Leppkes f36c43440e
avformat: add a new flag to let the user indicate the file is on a slow network device 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 931da054bb
Export avformat functions used in LAV 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 6bbf5c05b1
dxva2_mpeg2: prevent calling end_frame without start_frame 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 539b17963b
prores: set avtx->pix_fmt based on the fourcc code during init 2024-03-24 10:29:47 +01:00
Hendrik Leppkes e40b1a9214
HACK: Disable automatic multi-threading.
Using multi-threading for stream probing is a bad idea.
2024-03-24 10:29:47 +01:00
Hendrik Leppkes 007f706d5d
Increase the maximum probe size to 4MB.
This allows detection of some files with corruption within the first few MB and should not have a speed impact on previous functional files.
2024-03-24 10:29:47 +01:00
Cory Fields c214cdffcb
mpegts: do not reparse PMT unless its version has changed 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 9c2573d91b
Add "safe" full parsing mode.
In this mode, the parser does completely process the frame, but timestamps
are not mangled in compute_pkt_fields.
2024-03-24 10:29:47 +01:00
Hendrik Leppkes c1d6075033
h264: populate AVCodecContext with SPS values during codec init
This allows user code to properly detect HW decoding compat before decoding actually starts.
2024-03-24 10:29:47 +01:00
Hendrik Leppkes 17ee3fe6a7
Don't discard timestamps that are considered "invalid", still better than none at all. 2024-03-24 10:29:47 +01:00
Hendrik Leppkes 4fa8a07c5c
HACK: Disable tb_unreliable, preserve original fps information. 2024-03-24 10:29:47 +01:00
Cory Fields 10a8af3417
get accurate estimate from the PTSes 2024-03-24 10:29:46 +01:00
Cory Fields 9d7097b137
if av_read_packet returns AVERROR_IO, we are done. ffmpeg's codecs might or might not handle returning any completed demuxed packets correctly 2024-03-24 10:29:40 +01:00
Niklas Haas 5d7f234e7e avcodec/hevcdec: apply AOM film grain synthesis
Following the usual logic for H.274 film grain.
2024-03-23 18:55:21 +01:00
Niklas Haas 2e24c12aa1 avcodec/h2645_sei: decode AFGS1 T.35 SEI
I restricted this SEI to HEVC for now, until I see a H.264 sample.
2024-03-23 18:55:21 +01:00
Niklas Haas f50382cba6 avcodec/aom_film_grain: implement AFGS1 parsing
Based on the AOMedia Film Grain Synthesis 1 (AFGS1) spec:
  https://aomediacodec.github.io/afgs1-spec/

The parsing has been changed substantially relative to the AV1 film
grain OBU. In particular:

1. There is the possibility of maintaining multiple independent film
   grain parameter sets, and decoders/players are recommended to pick
   the one most appropriate for the intended display resolution. This
   could also be used to e.g. switch between different grain profiles
   without having to re-signal the appropriate coefficients.

2. Supporting this, it's possible to *predict* the grain coefficients
   from previously signalled parameter sets, transmitting only the
   residual.

3. When not predicting, the parameter sets are now stored as a series of
   increments, rather than being directly transmitted.

4. There are several new AFGS1-exclusive fields.

I placed this parser in its own file, rather than h2645_sei.c, since
nothing in the generic AFGS1 film grain payload is specific to T.35, and
to compartmentalize the code base.
2024-03-23 18:55:21 +01:00
Niklas Haas 1535d33818 avcodec/aom_film_grain: add AOM film grain synthesis
Implementation copied wholesale from dav1d, sans SIMD, under permissive
license. This implementation was extensively verified to be bit-exact,
so it serves as a much better starting point than trying to re-engineer
this from scratch for no reason. (I also authored the original
implementation in dav1d, so any "clean room" implementation would end up
looking much the same, anyway)

The notable changes I had to make while adapting this from the dav1d
code-base to the FFmpeg codebase include:

- reordering variable declarations to avoid triggering warnings
- replacing several inline helpers by avutil equivalents
- changing code that accesses frame metadata
- replacing raw plane copying logic by av_image_copy_plane

Apart from this, the implementation is basically unmodified.
2024-03-23 18:55:21 +01:00
Niklas Haas a9023377b2 avutil/film_grain_params: add av_film_grain_params_select()
Common utility function that can be used by all codecs to select the
right (any valid) film grain parameter set. In particular, this is
useful for AFGS1, which has support for multiple parameters.

However, it also performs parameter validation for H274.
2024-03-23 18:55:15 +01:00
Niklas Haas ea147f3b50 avutil/frame: clarify AV_FRAME_DATA_FILM_GRAIN_PARAMS usage
To allow for AFGS1 usage, which can expose multiple parameter sets for
a single frame.
2024-03-23 18:54:36 +01:00
Niklas Haas 1539efaacb avcodec/libdavv1d: signal new AVFilmGrainParams members
Not directly signalled by AV1, but we should still set this accordingly
so that users will know what the original intended video characteristics
and chroma resolution were.
2024-03-23 18:54:36 +01:00
Niklas Haas 511f297680 avcodec/av1dec: signal new AVFilmGrainParams members
Not directly signalled by AV1, but we should still set this accordingly
so that users will know what the original intended video characteristics
and chroma resolution were.
2024-03-23 18:54:36 +01:00
Niklas Haas ad7f059180 avcodec/h2645_sei: signal new AVFilmGrainParams members
H.274 specifies that film grain parameters are signalled as intended for
4:4:4 frames, so we always signal this, regardless of the frame's actual
subsampling.
2024-03-23 18:54:36 +01:00
Niklas Haas 6963033590 ffprobe: adapt to new AVFilmGrainParams
Follow the establish convention of printing the bit depth metadata
per-component.
2024-03-23 18:54:36 +01:00
Niklas Haas 25cd0e0913 avfilter/vf_showinfo: adapt to new AVFilmGrainParams 2024-03-23 18:54:36 +01:00
Niklas Haas a08f358769 avutil/film_grain_params: initialize VCS to UNSPECIFIED 2024-03-23 18:54:36 +01:00
Niklas Haas 35d2960dcd avutil/film_grain_params: add metadata to common struct
This is needed for AV1 film grain as well, when using AFGS1 streams.
Also add extra width/height and subsampling information, which AFGS1
cares about, as part of the same API bump. (And in principle, H274
should also expose this information, since it is needed downstream to
correctly adjust the chroma grain frequency to the subsampling ratio)

Deprecate the equivalent H274-exclusive fields. To avoid breaking ABI,
add the new fields after the union; but with enough of a paper trail to
hopefully re-order them on the next bump.
2024-03-23 18:54:29 +01:00
Jun Zhao bfbf0f4e82 lavc/vvc_parser: small cleanup for style
small cleanup for style, redundant semicolons, goto labels,
in FFmpeg, we put goto labels at brace level.

Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
2024-03-23 22:49:29 +08:00
Michael Niedermayer 57f252b2d1 avcodec/cbs_h266_syntax_template: Check tile_y
Fixes: out of array access
Fixes: 67021/clusterfuzz-testcase-minimized-ffmpeg_DEMUXER_fuzzer-4883576579489792

Found-by: continuous fuzzing process https://github.com/google/oss-fuzz/tree/master/projects/ffmpeg
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2024-03-23 22:33:21 +08:00
Anton Khirnov e99594812c tests/fate/ffmpeg: evaluate thread count in fate-run.sh rather than make
Fixes fate-ffmpeg-loopback-decoding with THREADS=random*
2024-03-23 14:07:04 +01:00
Leo Izen 83ed18a3ca
avformat/jpegxl_anim_dec: set pos for generic index
avpkt->pos needs to be set for generic indexing or features such as the
stream_loop option will not work.

Co-authored-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Signed-off-by: Leo Izen <leo.izen@gmail.com>
2024-03-23 07:29:18 -04:00
Wenbin Chen f34000541a Changelog: add dnn libtorch backend entry
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
2024-03-23 11:42:13 +01:00
Stefano Sabatini 7852bf02b0 doc/muxers: add hds 2024-03-23 11:42:13 +01:00
Stefano Sabatini 25248c9d75 doc/muxers: add gxf 2024-03-23 11:42:13 +01:00
Stefano Sabatini 5c60be3ab6 lavf/gxfenc: return proper error codes in case of failure 2024-03-23 11:42:13 +01:00
Stefano Sabatini 3733aa7b17 lavf/gxfenc: consistently use snake_case in function names 2024-03-23 11:42:13 +01:00
Matthieu Bouron ad227a41d4 avcodec/mediacodec_wrapper: remove unnecessary NULL checks before calling Delete{Global,Local}Ref()
Delete{Global,Local}Ref already handle NULL.
2024-03-23 11:37:44 +01:00
Matthieu Bouron b1a683a2fd avcodec/mediacodec_wrapper: use an OFFSET() macro where relevant
Reduces a bit the horizontal spacing.
2024-03-23 11:37:44 +01:00
Matthieu Bouron dab4124350 avcodec/jni: remove unnecessary NULL checks before calling DeleteLocalRef()
Delete{Global,Local}Ref() already handle NULL.
2024-03-23 11:37:44 +01:00
Matthieu Bouron 70ba15d2cf avcodec/jni: use size_t to store structure offsets 2024-03-23 11:37:44 +01:00
Matthieu Bouron 6567516a5e avformat: add Android content resolver protocol support
Handles Android content URIs starting with content://.
2024-03-23 11:37:29 +01:00
Matthieu Bouron f17e18d292 avcodec: add av_jni_{get,set}_android_app_ctx() helpers
This will allow users to pass the Android ApplicationContext which is mandatory
to retrieve the ContentResolver responsible to resolve/open Android content URIS.
2024-03-23 11:34:34 +01:00
Andreas Rheinhardt 073251316e avformat: Make init function out of write_header functions if possible
Also mark them as av_cold while just at it.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:20 +01:00
Andreas Rheinhardt 37f0dbbc39 avformat: Enforce codec_id where appropriate
E.g. chromaprint expects to be fed 16bit signed PCM
in native endianness, yet there was no check for this.
Similarly for other muxers. Use the new
FF_OFMT_FLAG_ONLY_DEFAULT_CODECS to enforce this where
appropriate, e.g. for pcm/raw muxers.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:20 +01:00
Andreas Rheinhardt 2ccb45511f avformat/ttmlenc: Avoid unnecessary block
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:20 +01:00
Andreas Rheinhardt a24bccc238 avformat/mux: Add flag for "only default codecs allowed"
AVOutputFormat has default codecs for audio, video and subtitle
and often these are the only codecs of this type allowed.
So add a flag to AVOutputFormat so that this can be checked generically.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt 03b04eef72 avformat: Enforce one-stream limit where appropriate
Several muxers (e.g. pcm muxers) did not check the number
of streams even though the individual streams were not
recoverable from the muxed files. This commit changes
this by using the FF_OFMT_MAX_ONE_OF_EACH flag
where appropriate.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt f4167842c1 avformat/mux: Add flag for "not more than one stream of each type"
More exactly: Not more than one stream of each type for which
a default codec (i.e. AVOutputFormat.(audio|video|subtitle)_codec)
is set; for those types for which no such codec is set (or for
which no designated default codec in AVOutputFormat exists at all)
no streams are permitted.

Given that with this flag set the default codecs become more important,
they are now set explicitly to AV_CODEC_ID_NONE for "unset";
the earlier code relied on AV_CODEC_ID_NONE being equal to zero,
so that default static initialization set it accordingly;
but this is not how one is supposed to use an enum.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt c6bc2d4fea fate/filter-audio: Don't use pcm output for channelsplit test
This test muxes two streams into a single pcm file, although
the two streams are of course not recoverable from the output
(unless one has extra information). So use the streamhash muxer
instead (which also provides coverage for it; it was surprisingly
unused in FATE so far). This is in preparation for actually
enforcing a limit of one stream for the PCM muxers.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt a48e839a22 avformat/mux_utils: Don't report that AV_CODEC_ID_NONE can be muxed
If AVOutputFormat.video_codec, audio_codec or subtitle_codec
is AV_CODEC_ID_NONE, it means that there is no default codec
for this format and not that it is supported to mux AV_CODEC_ID_NONE.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt 789c5b03db avformat/amr: Move write_header closer to muxer definition
Avoids one #if.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt 233e13f285 avformat/mux: Rename FF_FMT_ALLOW_FLUSH->FF_OFMT_FLAG_ALLOW_FLUSH
It better reflects that this is a muxer-only flag.
Also document the flag.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt b8124fe35e libavformat/westwood_audenc: Use proper logcontext
(AVStream did not have an AVClass when this muxer was added.)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt eb3ee7f141 avformat/mp3enc: Improve query_codec
Signal that anything except MP3 and the ID3V2 attached pic types
are forbidden.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:57:19 +01:00
Andreas Rheinhardt d11b5e6096 avutil/frame: Use av_realloc_array(), improve overflow check
Also use sizeof of the proper type, namely sizeof(**sd)
and not sizeof(*sd).

Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:38:36 +01:00
Andreas Rheinhardt b7bec5d3c9 avutil/frame: Rename av_frame_side_data_get and add wrapper for it
av_frame_side_data_get() has a const AVFrameSideData * const *sd
parameter; so calling it with an AVFramesSideData **sd like
AVCodecContext.decoded_side_data (or with a AVFramesSideData * const
*sd) is safe, but the conversion is not performed automatically
in C. All users of this function therefore resort to a cast.

This commit changes this: av_frame_side_data_get() is renamed
to av_frame_side_data_get_c(); furthermore, a static inline
wrapper for it named av_frame_side_data_get() is added
that accepts an AVFramesSideData * const * and converts this
to const AVFramesSideData * const * in a Wcast-qual safe way.

This also allows to remove the casts from the current users.

Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:38:16 +01:00
Andreas Rheinhardt 26398da8f3 avutil/frame: Constify av_frame_side_data_get()
Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:36:07 +01:00
Andreas Rheinhardt b9fcc135c5 avcodec/libx265: Pass logctx as void*, not AVClass**
The latter need not be safe, because av_log() expects
to get a pointer to an AVClass-enabled structure
and not only a fake object. If this function were actually
be called in the following way:

const AVClass *avcl = avctx->av_class;
handle_mdcv(&avcl, );

the AVClass's item_name would expect to point to an actual
AVCodecContext, potentially leading to a segfault.

Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:32:15 +01:00
Andreas Rheinhardt 244db71037 avcodec/libx265: Don't use AVBPrint unnecessarily
This code uses the AVBPrint API for exactly one av_bprintf()
in a scenario in which a good upper bound for the needed
size of the buffer is available (with said upper bound being
much smaller than sizeof(AVBPrint)). So one can simply use
snprintf() instead. This also avoids the (always-false due to
the current size of the internal AVBPrint buffer) check for
whether the AVBPrint is complete.

Furthermore, the old code used AV_BPRINT_SIZE_AUTOMATIC
which implies that the AVBPrint buffer will never be
(re)allocated and yet it used av_bprint_finalize().
This has of course also been removed.

Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 23:31:58 +01:00
Andreas Rheinhardt c77164390b fftools/ffmpeg_enc: Don't call frame_data twice
Reviewed-by: Jan Ekström <jeebjp@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 22:11:04 +01:00
Andreas Rheinhardt 6ecc2f0f6f avcodec/libx264: Remove unused variable
Reviewed-by: Zhao Zhili <quinkblack@foxmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 22:10:26 +01:00
Andreas Rheinhardt 3fd047ee30 avcodec/librav1e: Don't unnecessarily create new references
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 17:04:05 +01:00
Andreas Rheinhardt c89f6ae689 avcodec/libdav1d: Stop mangling AVPacket.opaque
Unnecessary since 67e7f0b05e
as there are no longer two opaque fields.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 16:33:53 +01:00
Niklas Haas f04a2ba302 avcodec/dovi_rpu: fix off-by-one in loop
Otherwise the last VDR would never get copied.
2024-03-22 14:05:30 +01:00
Niklas Haas d5648a806f avcodec/dovi_rpu: use OR instead of addition 2024-03-22 14:05:22 +01:00
Zhao Zhili 4869171aa9 Changelog: mention ffplay with hwaccel decoding support
Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
2024-03-22 20:26:53 +08:00
Zhao Zhili 5229778440 avcodec/libx264: fix extradata when config annexb=0
AVCodecContext extradata should be an AVCDecoderConfigurationRecord
when bitstream format is avcc. Simply concatenating the NALUs output
by x264_encoder_headers does not form a standard
AVCDecoderConfigurationRecord. The following cmd generates broken
file before the patch:

ffmpeg -i foo.mp4 -c:v libx264 -x264-params annexb=0 bar.mp4

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
2024-03-22 20:26:53 +08:00
Zhao Zhili c775163a8c avcodec/decode: log hwaccel name
Many users mistakenly think that hwaccel is an instance of a decoder,
and cannot find the corresponding decoder name in the logs. Log
hwaccel name so users know hwaccel has taken effect.

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
2024-03-22 20:26:53 +08:00
Andreas Rheinhardt ee736ff80e avformat/flvenc: Avoid avio_write(pb, "", 0)
When the compiler chooses to inline put_amf_string(pb, ""),
the avio_write(pb, "", 0) can be avoided. Happens with
Clang-17 with -O1 and higher and GCC 13 with -O2 and higher
here.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-22 12:59:50 +01:00
James Almer 535b1a93f5 avcodec/hevc_ps: fix setting HEVCHdrParams fields
These were defined in a way compatible with the Vulkan HEVC acceleration, which
expects bitmasks, yet the fields were being overwritten on each loop with the
latest read value.

Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-21 11:31:32 -03:00
James Almer 456c8ebe7c avcodec/hevc_ps: allocate only the required HEVCHdrParams within a VPS
Fixes: timeout
Fixes: 64033/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_HEVC_fuzzer-5332101272305664

Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-21 09:59:20 -03:00
James Almer 97d2990ea6 avformat/iamf_reader: propagate avio_skip() error values
Fixes: null pointer dereference
Fixes: 67007/clusterfuzz-testcase-minimized-ffmpeg_dem_IAMF_fuzzer-6522819204677632

Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-21 09:08:22 -03:00
James Almer e04c638f5f avformat/movenc: only compile avif_write_trailer() when the avif muxer is enabled
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-20 23:50:55 -03:00
James Almer 5ff0eb34d2 configure: check for C17 by default
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-20 17:11:18 -03:00
James Almer 6c2ff982dc configure: make the C and C++ standard settable
While ensuring it's at least C11, the minimum supported version.
Also, enforce C11 on the host compiler, same as we already do for C11 on the
target compiler.

Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-20 17:10:55 -03:00
Jan Ekström d7d2213a6b avcodec/libx265: add support for writing out CLL and MDCV
The newer of these two are the separate integers for content light
level, introduced in 3952bf3e98c76c31594529a3fe34e056d3e3e2ea ,
with X265_BUILD 75. As we already require X265_BUILD of at least
89, no further conditions are required.
2024-03-20 19:15:05 +02:00
Jan Ekström 471c0a34c1 avcodec/libx264: add support for writing out CLL and MDCV
Both of these two structures were first available with X264_BUILD
163, so make relevant functionality conditional on the version
being at least such.

Keep handle_side_data available in all cases as this way X264_init
does not require additional version based conditions within it.

Finally, add a FATE test which verifies that pass-through of the
MDCV/CLL side data is working during encoding.
2024-03-20 19:15:05 +02:00
Jan Ekström f4b89b6e54 avcodec/libsvtav1: add support for writing out CLL and MDCV
These two were added in 28e23d7f348c78d49a726c7469f9d4e38edec341
and 3558c1f2e97455e0b89edef31b9a72ab7fa30550 for version 0.9.0 of
SVT-AV1, which is also our minimum requirement right now.

In other words, no additional version limiting conditions seem
to be required.

Additionally, add a FATE test which verifies that pass-through of
the MDCV/CLL side data is working during encoding.
2024-03-20 19:15:05 +02:00
Jan Ekström 8f4b173029 ffmpeg: pass first video AVFrame's side data to encoder
This enables further configuration of output based on the results
of input decoding and filtering in a similar manner as the color
information.
2024-03-20 19:15:05 +02:00
Jan Ekström 0d36844ddf avcodec: add frame side data array to AVCodecContext
This allows configuring an encoder by using AVFrameSideData.
2024-03-20 19:15:05 +02:00
Jan Ekström d9ade14c5c {avutil/version,APIchanges}: bump, document new AVFrameSideData functions 2024-03-20 19:15:05 +02:00
Jan Ekström f287a285d9 avutil/frame: add helper for getting side data from array 2024-03-20 19:15:05 +02:00
Jan Ekström 3c52f73e25 avutil/frame: add helper for adding existing side data to array 2024-03-20 19:14:02 +02:00
Jan Ekström 53335f6cf4 avutil/frame: add helper for adding side data to array
Additionally, add an API test to check that the no-duplicates
addition works after duplicates have been inserted.
2024-03-20 19:14:02 +02:00
Jan Ekström d2bb22f6d5 avutil/frame: split side data removal out to non-AVFrame function
This will make it possible to reuse logic in further commits.
2024-03-20 19:14:02 +02:00
Jan Ekström 28783896dc avutil/frame: split side_data_from_buf to base and AVFrame func 2024-03-20 19:14:02 +02:00
Jan Ekström 919c9cdbe6 avutil/frame: add helper for freeing arrays of side data 2024-03-20 19:14:02 +02:00
Jan Ekström d5104b3401 avutil/frame: split side data list wiping out to non-AVFrame function
This will make it possible to reuse logic in further commits.
2024-03-20 19:14:02 +02:00
Frank Plowman dfcf5f828d lavc/vvc: Fix check whether QG is in first tile col
The second part of this condition is intended to check whether the
current quantisation group is in the first CTU column of the current
tile.  The issue is that ctb_to_col_bd gives the x-ordinate of the first
column of the current tile *in CTUs*, while xQg gives the x-ordinate of
the quantisation group *in samples*.  Rectify this by shifting xQg by
ctb_log2_size to get xQg in CTUs before comparing.

Fixes FFVVC issues #201 and #203.
2024-03-20 22:27:19 +08:00
Andreas Rheinhardt 0b7d4fccce avformat/codec2: Don't allocate Codec2Context for muxer
Only the demuxers use it.

Reviewed-by: Tomas Härdin <git@haerdin.se>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-20 12:46:23 +01:00
Andreas Rheinhardt cd8cc3d1b3 avformat/iamfenc: Remove unused headers
Forgotten in c95c8a0158.

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-20 10:17:59 +01:00
Andreas Rheinhardt 6a9ddfcd96 avformat/iamfenc: Align check and error message
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-20 10:17:52 +01:00
Andreas Rheinhardt a7ad5d4d10 avformat/iamfenc: Remove always-false check
This muxer does not have the AVFMT_NOSTREAMS flag; therefore
it is checked generically that there is at least a stream.

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2024-03-20 10:17:37 +01:00
Mark Thompson 7f4b8d2f5e ffmpeg: set extra_hw_frames to account for frames held in queues
Since e0da916b8f the ffmpeg utility has
held multiple frames output by the decoder in internal queues without
telling the decoder that it is going to do so.  When the decoder has a
fixed-size pool of frames (common in some hardware APIs where the output
frames must be stored as an array texture) this could lead to the pool
being exhausted and the decoder getting stuck.  Fix this by telling the
decoder to allocate additional frames according to the queue size.
2024-03-19 22:56:56 +00:00
Marton Balint 7251f90972 fftools/ffplay: use correct buffersink channel layout parameters
Regression since 0995e1f1b3.

Signed-off-by: Marton Balint <cus@passwd.hu>
2024-03-19 20:48:22 +01:00
Stefano Sabatini 0cd13ad674 doc/muxers/gif: apply consistency fixes 2024-03-19 17:23:20 +01:00
Stefano Sabatini f7d560e919 doc/muxers/flv: apply misc consistency fixes 2024-03-19 17:23:20 +01:00
Stefano Sabatini 9afd9bb5c5 doc/muxers: add flac 2024-03-19 17:23:05 +01:00
Marth64 0b342a2f15 avcodec/mpeg12dec: extract only one type of CC substream
In MPEG-2 user data, there can be different types of Closed Captions
formats embedded (A53, SCTE-20, or DVD). The current behavior of the
CC extraction code in the MPEG-2 decoder is to not be aware of
multiple formats if multiple exist, therefore allowing one format
to overwrite the other during the extraction process since the CC
extraction shares one output buffer for the normalized bytes.

This causes sources that have two CC formats to produce flawed output.
There exist real-world samples which contain both A53 and SCTE-20 captions
in the same MPEG-2 stream, and that manifest this problem. Example of symptom:
THANK YOU (expected) --> THTHANANK K YOYOUU (actual)

The solution is to pick only the first CC substream observed with valid bytes,
and ignore the other types. Additionally, provide an option for users
to manually "force" a type in the event that this matters for a particular
source.

Signed-off-by: Marth64 <marth64@proxyid.net>
2024-03-19 15:52:05 +01:00
James Almer 53dd31497b avformat/matroska: use named constants for ITU-T T.35 metadata
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-19 10:49:20 -03:00
James Almer 61519cc654 avcodec/libdav1d: use named constants for ITU-T T.35 metadata
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-19 09:44:59 -03:00
James Almer a1f714d197 avcodec/h2645_sei: use named constants for ITU-T T.35 metadata
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-19 09:44:59 -03:00
James Almer 4ca5d45193 avcodec/av1dec: use named constants for ITU-T T.35 metadata
Signed-off-by: James Almer <jamrial@gmail.com>
2024-03-19 09:44:59 -03:00
Wenbin Chen f4e0664fd1 libavfi/dnn: add LibTorch as one of DNN backend
PyTorch is an open source machine learning framework that accelerates
the path from research prototyping to production deployment. Official
website: https://pytorch.org/. We call the C++ library of PyTorch as
LibTorch, the same below.

To build FFmpeg with LibTorch, please take following steps as
reference:
1. download LibTorch C++ library in
 https://pytorch.org/get-started/locally/,
please select C++/Java for language, and other options as your need.
Please download cxx11 ABI version:
 (libtorch-cxx11-abi-shared-with-deps-*.zip).
2. unzip the file to your own dir, with command
unzip libtorch-shared-with-deps-latest.zip -d your_dir
3. export libtorch_root/libtorch/include and
libtorch_root/libtorch/include/torch/csrc/api/include to $PATH
export libtorch_root/libtorch/lib/ to $LD_LIBRARY_PATH
4. config FFmpeg with ../configure --enable-libtorch \
 --extra-cflag=-I/libtorch_root/libtorch/include \
 --extra-cflag=-I/libtorch_root/libtorch/include/torch/csrc/api/include \
 --extra-ldflags=-L/libtorch_root/libtorch/lib/
5. make

To run FFmpeg DNN inference with LibTorch backend:
./ffmpeg -i input.jpg -vf \
dnn_processing=dnn_backend=torch:model=LibTorch_model.pt -y output.jpg

The LibTorch_model.pt can be generated by Python with torch.jit.script()
api. https://pytorch.org/tutorials/advanced/cpp_export.html. This is
pytorch official guide about how to convert and load torchscript model.
Please note, torch.jit.trace() is not recommended, since it does
not support ambiguous input size.

Signed-off-by: Ting Fu <ting.fu@intel.com>
Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
Reviewed-by: Guo Yejun <yejun.guo@intel.com>
2024-03-19 14:48:58 +08:00
237 changed files with 14205 additions and 1183 deletions

3
.clang-format Normal file
View File

@ -0,0 +1,3 @@
IndentWidth: 4
UseTab: Never
DisableFormat: true

View File

@ -35,6 +35,8 @@ version <next>:
- AEA muxer
- ffmpeg CLI loopback decoders
- Support PacketTypeMetadata of PacketType in enhanced flv format
- ffplay with hwaccel decoding support (depends on vulkan renderer via libplacebo)
- dnn filter libtorch backend
version 6.1:

289
compat/windows/dxva_av1.h Normal file
View File

@ -0,0 +1,289 @@
//------------------------------------------------------------------------------
// File: DXVA.h
//
// Desc: DirectX Video Acceleration header file.
//
// Copyright (c) 1999 - 2002, Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
#ifndef _DIRECTX_AV1_VA_
#define _DIRECTX_AV1_VA_
#pragma pack(push, 1)
/* AV1 picture entry data structure */
typedef struct _DXVA_PicEntry_AV1 {
UINT width;
UINT height;
// Global motion parameters
INT wmmat[6];
union {
struct {
UCHAR wminvalid : 1;
UCHAR wmtype : 2;
UCHAR Reserved : 5;
};
UCHAR GlobalMotionFlags;
} DUMMYUNIONNAME;
UCHAR Index;
UINT16 Reserved16Bits;
} DXVA_PicEntry_AV1, *LPDXVA_PicEntry_AV1;
/* AV1 picture parameters structure */
/*
 * Per-frame decode parameters handed to the DXVA accelerator for one AV1
 * frame.  The field names and nesting mirror the corresponding AV1 frame
 * header syntax elements (see the DXVA AV1 specification for exact
 * semantics).  Byte-packed via #pragma pack(push, 1); layout is ABI --
 * do not reorder, resize or re-type any field.
 */
typedef struct _DXVA_PicParams_AV1 {
    UINT width;                /* frame width after superres scaling, presumably -- TODO confirm */
    UINT height;
    UINT max_width;            /* maximum frame dimensions from the sequence header */
    UINT max_height;
    UCHAR CurrPicTextureIndex; /* surface index of the frame being decoded */
    UCHAR superres_denom;      /* superres scale denominator */
    UCHAR bitdepth;            /* sample bit depth (e.g. 8/10/12) */
    UCHAR seq_profile;         /* AV1 sequence profile */
    // Tiles:
    struct {
        UCHAR cols;               /* number of tile columns */
        UCHAR rows;               /* number of tile rows */
        USHORT context_update_id; /* tile whose CDF context updates the frame context */
        USHORT widths[64];        /* per-column tile widths */
        USHORT heights[64];       /* per-row tile heights */
    } tiles;
    // Coding Tools
    /* One bit per enabled sequence/frame coding tool, named after the
     * corresponding AV1 syntax element. */
    union {
        struct {
            UINT use_128x128_superblock : 1;
            UINT intra_edge_filter : 1;
            UINT interintra_compound : 1;
            UINT masked_compound : 1;
            UINT warped_motion : 1;
            UINT dual_filter : 1;
            UINT jnt_comp : 1;
            UINT screen_content_tools : 1;
            UINT integer_mv : 1;
            UINT cdef : 1;
            UINT restoration : 1;
            UINT film_grain : 1;
            UINT intrabc : 1;
            UINT high_precision_mv : 1;
            UINT switchable_motion_mode : 1;
            UINT filter_intra : 1;
            UINT disable_frame_end_update_cdf : 1;
            UINT disable_cdf_update : 1;
            UINT reference_mode : 1;
            UINT skip_mode : 1;
            UINT reduced_tx_set : 1;
            UINT superres : 1;
            UINT tx_mode : 2;
            UINT use_ref_frame_mvs : 1;
            UINT enable_ref_frame_mvs : 1;
            UINT reference_frame_update : 1;
            UINT Reserved : 5;
        };
        UINT32 CodingParamToolFlags; /* aggregate 32-bit view of the flags above */
    } coding;
    // Format & Picture Info flags
    union {
        struct {
            UCHAR frame_type : 2;     /* AV1 frame_type (key/inter/intra-only/switch) */
            UCHAR show_frame : 1;
            UCHAR showable_frame : 1;
            UCHAR subsampling_x : 1;  /* chroma subsampling in x */
            UCHAR subsampling_y : 1;  /* chroma subsampling in y */
            UCHAR mono_chrome : 1;
            UCHAR Reserved : 1;
        };
        UCHAR FormatAndPictureInfoFlags; /* aggregate byte view of the flags above */
    } format;
    // References
    UCHAR primary_ref_frame;              /* index of the primary reference frame */
    UCHAR order_hint;
    UCHAR order_hint_bits;
    DXVA_PicEntry_AV1 frame_refs[7];      /* the 7 active reference frames (LAST..ALTREF) */
    UCHAR RefFrameMapTextureIndex[8];     /* surface indices of the 8-slot reference map */
    // Loop filter parameters
    struct {
        UCHAR filter_level[2]; /* luma filter levels (vertical/horizontal edges) */
        UCHAR filter_level_u;
        UCHAR filter_level_v;
        UCHAR sharpness_level;
        union {
            struct {
                UCHAR mode_ref_delta_enabled : 1;
                UCHAR mode_ref_delta_update : 1;
                UCHAR delta_lf_multi : 1;
                UCHAR delta_lf_present : 1;
                UCHAR Reserved : 4;
            };
            UCHAR ControlFlags; /* aggregate byte view of the flags above */
        } DUMMYUNIONNAME;
        CHAR ref_deltas[8];                   /* signed per-reference filter deltas */
        CHAR mode_deltas[2];                  /* signed per-mode filter deltas */
        UCHAR delta_lf_res;
        UCHAR frame_restoration_type[3];      /* per-plane loop-restoration type */
        USHORT log2_restoration_unit_size[3]; /* per-plane restoration unit size */
        UINT16 Reserved16Bits;
    } loop_filter;
    // Quantization
    struct {
        union {
            struct {
                UCHAR delta_q_present : 1;
                UCHAR delta_q_res : 2;
                UCHAR Reserved : 5;
            };
            UCHAR ControlFlags; /* aggregate byte view of the flags above */
        } DUMMYUNIONNAME;
        UCHAR base_qindex;   /* base quantizer index */
        CHAR y_dc_delta_q;   /* signed per-plane DC/AC quantizer deltas */
        CHAR u_dc_delta_q;
        CHAR v_dc_delta_q;
        CHAR u_ac_delta_q;
        CHAR v_ac_delta_q;
        // using_qmatrix:
        UCHAR qm_y;          /* per-plane quantizer-matrix levels */
        UCHAR qm_u;
        UCHAR qm_v;
        UINT16 Reserved16Bits;
    } quantization;
    // Cdef parameters
    struct {
        union {
            struct {
                UCHAR damping : 2;
                UCHAR bits : 2; /* number of CDEF strength entries is 1 << bits -- TODO confirm */
                UCHAR Reserved : 4;
            };
            UCHAR ControlFlags; /* aggregate byte view of the flags above */
        } DUMMYUNIONNAME;
        /* Luma CDEF strengths: primary/secondary packed into one byte each. */
        union {
            struct {
                UCHAR primary : 6;
                UCHAR secondary : 2;
            };
            UCHAR combined;
        } y_strengths[8];
        /* Chroma CDEF strengths, same packing as y_strengths. */
        union {
            struct {
                UCHAR primary : 6;
                UCHAR secondary : 2;
            };
            UCHAR combined;
        } uv_strengths[8];
    } cdef;
    UCHAR interp_filter; /* frame-level interpolation filter selection */
    // Segmentation
    struct {
        union {
            struct {
                UCHAR enabled : 1;
                UCHAR update_map : 1;
                UCHAR update_data : 1;
                UCHAR temporal_update : 1;
                UCHAR Reserved : 4;
            };
            UCHAR ControlFlags; /* aggregate byte view of the flags above */
        } DUMMYUNIONNAME;
        UCHAR Reserved24Bits[3];
        /* Per-segment feature enable bits; one entry per segment (8 max). */
        union {
            struct {
                UCHAR alt_q : 1;
                UCHAR alt_lf_y_v : 1;
                UCHAR alt_lf_y_h : 1;
                UCHAR alt_lf_u : 1;
                UCHAR alt_lf_v : 1;
                UCHAR ref_frame : 1;
                UCHAR skip : 1;
                UCHAR globalmv : 1;
            };
            UCHAR mask;
        } feature_mask[8];
        SHORT feature_data[8][8]; /* signed feature values: [segment][feature] */
    } segmentation;
    /* Film grain synthesis parameters (valid when coding.film_grain is set). */
    struct {
        union {
            struct {
                USHORT apply_grain : 1;
                USHORT scaling_shift_minus8 : 2;
                USHORT chroma_scaling_from_luma : 1;
                USHORT ar_coeff_lag : 2;
                USHORT ar_coeff_shift_minus6 : 2;
                USHORT grain_scale_shift : 2;
                USHORT overlap_flag : 1;
                USHORT clip_to_restricted_range : 1;
                USHORT matrix_coeff_is_identity : 1;
                USHORT Reserved : 3;
            };
            USHORT ControlFlags; /* aggregate 16-bit view of the flags above */
        } DUMMYUNIONNAME;
        USHORT grain_seed;               /* PRNG seed for grain synthesis */
        UCHAR scaling_points_y[14][2];   /* luma scaling curve: (value, scaling) points */
        UCHAR num_y_points;
        UCHAR scaling_points_cb[10][2];  /* Cb scaling curve points */
        UCHAR num_cb_points;
        UCHAR scaling_points_cr[10][2];  /* Cr scaling curve points */
        UCHAR num_cr_points;
        UCHAR ar_coeffs_y[24];           /* auto-regression coefficients per plane */
        UCHAR ar_coeffs_cb[25];
        UCHAR ar_coeffs_cr[25];
        UCHAR cb_mult;
        UCHAR cb_luma_mult;
        UCHAR cr_mult;
        UCHAR cr_luma_mult;
        UCHAR Reserved8Bits;
        SHORT cb_offset;
        SHORT cr_offset;
    } film_grain;
    UINT Reserved32Bits;
    UINT StatusReportFeedbackNumber; /* echoed back in DXVA_Status_AV1 for status reports */
} DXVA_PicParams_AV1, *LPDXVA_PicParams_AV1;
/* AV1 tile structure */
/*
 * Describes one tile's payload within the submitted bitstream buffer.
 * Byte-packed (#pragma pack(push, 1)); layout is ABI.
 */
typedef struct _DXVA_Tile_AV1 {
    UINT DataOffset;       /* byte offset of the tile data in the bitstream buffer */
    UINT DataSize;         /* size in bytes of the tile data */
    USHORT row;            /* tile row index */
    USHORT column;         /* tile column index */
    UINT16 Reserved16Bits;
    UCHAR anchor_frame;    /* anchor-frame index (large-scale-tile use, presumably) -- TODO confirm */
    UCHAR Reserved8Bits;
} DXVA_Tile_AV1, *LPDXVA_Tile_AV1;
/* AV1 status reporting data structure */
/*
 * Decode-status feedback returned by the accelerator.  Byte-packed
 * (#pragma pack(push, 1)); layout is ABI.
 */
typedef struct _DXVA_Status_AV1 {
    UINT StatusReportFeedbackNumber; /* matches DXVA_PicParams_AV1.StatusReportFeedbackNumber of the reported frame */
    DXVA_PicEntry_AV1 CurrPic;       /* picture this status entry refers to */
    UCHAR BufType;                   /* type of buffer the status applies to -- per DXVA spec */
    UCHAR Status;                    /* decode result code reported by the accelerator */
    UCHAR Reserved8Bits;
    USHORT NumMbsAffected;           /* extent of any reported corruption -- TODO confirm units */
} DXVA_Status_AV1, *LPDXVA_Status_AV1;
#pragma pack(pop)
#endif // _DIRECTX_AV1_VA_

150
compat/windows/dxva_hevc.h Normal file
View File

@ -0,0 +1,150 @@
//------------------------------------------------------------------------------
// Copyright (c) 1999 - 2002, Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
#ifndef __DIRECTX_VA_HEVC__
#define __DIRECTX_VA_HEVC__
#pragma pack(push, 1)
/* HEVC Picture Entry structure */
/*
 * One reference-picture entry: a 7-bit surface index and a 1-bit
 * associated flag overlaid on a single byte.  Byte-packed by the
 * surrounding #pragma pack(push, 1); layout is ABI.
 */
typedef struct _DXVA_PicEntry_HEVC
{
    union
    {
        struct
        {
            UCHAR Index7Bits : 7;     /* accelerator surface index */
            UCHAR AssociatedFlag : 1; /* meaning depends on context -- see DXVA HEVC spec */
        };
        UCHAR bPicEntry; /* aggregate byte view of the two bitfields */
    };
} DXVA_PicEntry_HEVC, *LPDXVA_PicEntry_HEVC;
/* HEVC Picture Parameter structure */
/*
 * Per-picture decode parameters handed to the DXVA accelerator for one
 * HEVC picture.  Field names mirror the corresponding H.265 SPS/PPS/slice
 * syntax elements (see the DXVA HEVC specification for exact semantics).
 * Byte-packed via #pragma pack(push, 1); layout is ABI -- do not reorder,
 * resize or re-type any field.
 */
typedef struct _DXVA_PicParams_HEVC {
    USHORT PicWidthInMinCbsY;  /* picture width in minimum luma coding blocks */
    USHORT PicHeightInMinCbsY; /* picture height in minimum luma coding blocks */
    /* Format and sequence information, mirroring SPS fields. */
    union {
        struct {
            USHORT chroma_format_idc : 2;
            USHORT separate_colour_plane_flag : 1;
            USHORT bit_depth_luma_minus8 : 3;
            USHORT bit_depth_chroma_minus8 : 3;
            USHORT log2_max_pic_order_cnt_lsb_minus4 : 4;
            USHORT NoPicReorderingFlag : 1;
            USHORT NoBiPredFlag : 1;
            USHORT ReservedBits1 : 1;
        };
        USHORT wFormatAndSequenceInfoFlags; /* aggregate 16-bit view of the above */
    };
    DXVA_PicEntry_HEVC CurrPic; /* entry for the picture being decoded */
    /* SPS-derived coding-block and reference-count parameters. */
    UCHAR sps_max_dec_pic_buffering_minus1;
    UCHAR log2_min_luma_coding_block_size_minus3;
    UCHAR log2_diff_max_min_luma_coding_block_size;
    UCHAR log2_min_transform_block_size_minus2;
    UCHAR log2_diff_max_min_transform_block_size;
    UCHAR max_transform_hierarchy_depth_inter;
    UCHAR max_transform_hierarchy_depth_intra;
    UCHAR num_short_term_ref_pic_sets;
    UCHAR num_long_term_ref_pics_sps;
    UCHAR num_ref_idx_l0_default_active_minus1;
    UCHAR num_ref_idx_l1_default_active_minus1;
    CHAR init_qp_minus26;
    UCHAR ucNumDeltaPocsOfRefRpsIdx;
    USHORT wNumBitsForShortTermRPSInSlice;
    USHORT ReservedBits2;
    /* SPS coding-tool flags. */
    union {
        struct {
            UINT32 scaling_list_enabled_flag : 1;
            UINT32 amp_enabled_flag : 1;
            UINT32 sample_adaptive_offset_enabled_flag : 1;
            UINT32 pcm_enabled_flag : 1;
            UINT32 pcm_sample_bit_depth_luma_minus1 : 4;
            UINT32 pcm_sample_bit_depth_chroma_minus1 : 4;
            UINT32 log2_min_pcm_luma_coding_block_size_minus3 : 2;
            UINT32 log2_diff_max_min_pcm_luma_coding_block_size : 2;
            UINT32 pcm_loop_filter_disabled_flag : 1;
            UINT32 long_term_ref_pics_present_flag : 1;
            UINT32 sps_temporal_mvp_enabled_flag : 1;
            UINT32 strong_intra_smoothing_enabled_flag : 1;
            UINT32 dependent_slice_segments_enabled_flag : 1;
            UINT32 output_flag_present_flag : 1;
            UINT32 num_extra_slice_header_bits : 3;
            UINT32 sign_data_hiding_enabled_flag : 1;
            UINT32 cabac_init_present_flag : 1;
            UINT32 ReservedBits3 : 5;
        };
        UINT32 dwCodingParamToolFlags; /* aggregate 32-bit view of the above */
    };
    /* PPS coding-setting and picture-property flags. */
    union {
        struct {
            UINT32 constrained_intra_pred_flag : 1;
            UINT32 transform_skip_enabled_flag : 1;
            UINT32 cu_qp_delta_enabled_flag : 1;
            UINT32 pps_slice_chroma_qp_offsets_present_flag : 1;
            UINT32 weighted_pred_flag : 1;
            UINT32 weighted_bipred_flag : 1;
            UINT32 transquant_bypass_enabled_flag : 1;
            UINT32 tiles_enabled_flag : 1;
            UINT32 entropy_coding_sync_enabled_flag : 1;
            UINT32 uniform_spacing_flag : 1;
            UINT32 loop_filter_across_tiles_enabled_flag : 1;
            UINT32 pps_loop_filter_across_slices_enabled_flag : 1;
            UINT32 deblocking_filter_override_enabled_flag : 1;
            UINT32 pps_deblocking_filter_disabled_flag : 1;
            UINT32 lists_modification_present_flag : 1;
            UINT32 slice_segment_header_extension_present_flag : 1;
            UINT32 IrapPicFlag : 1;
            UINT32 IdrPicFlag : 1;
            UINT32 IntraPicFlag : 1;
            UINT32 ReservedBits4 : 13;
        };
        UINT32 dwCodingSettingPicturePropertyFlags; /* aggregate 32-bit view of the above */
    };
    /* PPS quantization-offset and tile-layout parameters. */
    CHAR pps_cb_qp_offset;
    CHAR pps_cr_qp_offset;
    UCHAR num_tile_columns_minus1;
    UCHAR num_tile_rows_minus1;
    USHORT column_width_minus1[19]; /* explicit tile column widths (non-uniform spacing) */
    USHORT row_height_minus1[21];   /* explicit tile row heights (non-uniform spacing) */
    UCHAR diff_cu_qp_delta_depth;
    CHAR pps_beta_offset_div2;
    CHAR pps_tc_offset_div2;
    UCHAR log2_parallel_merge_level_minus2;
    /* Reference picture information for the current picture. */
    INT CurrPicOrderCntVal;             /* POC of the current picture */
    DXVA_PicEntry_HEVC RefPicList[15];  /* reference picture entries */
    UCHAR ReservedBits5;
    INT PicOrderCntValList[15];         /* POC of each entry in RefPicList */
    UCHAR RefPicSetStCurrBefore[8];     /* indices into RefPicList for the three RPS subsets */
    UCHAR RefPicSetStCurrAfter[8];
    UCHAR RefPicSetLtCurr[8];
    USHORT ReservedBits6;
    USHORT ReservedBits7;
    UINT StatusReportFeedbackNumber; /* identifier echoed back in status reports */
} DXVA_PicParams_HEVC, *LPDXVA_PicParams_HEVC;
/* HEVC Quantization Matrix structure */
/* Scaling lists for the four HEVC matrix sizes — 4x4 (sizeID 0, 6 lists
 * of 16), 8x8/16x16/32x32 (64 entries each) — plus the separate DC
 * coefficients for the 16x16 and 32x32 sizes.  NOTE(review): entry
 * ordering is presumably the up-right diagonal scan required by DXVA;
 * confirm against the DXVA HEVC specification. */
typedef struct _DXVA_Qmatrix_HEVC
{
UCHAR ucScalingLists0[6][16];
UCHAR ucScalingLists1[6][64];
UCHAR ucScalingLists2[6][64];
UCHAR ucScalingLists3[2][64];
UCHAR ucScalingListDCCoefSizeID2[6];
UCHAR ucScalingListDCCoefSizeID3[2];
} DXVA_Qmatrix_HEVC, *LPDXVA_Qmatrix_HEVC;
/* HEVC Slice Control Structure */
/* Short-format slice control entry: locates one slice NAL unit inside
 * the bitstream data buffer (offset + byte count).  NOTE(review):
 * wBadSliceChopping is presumably nonzero when slice data is split
 * across buffers, per the usual DXVA convention — confirm in spec. */
typedef struct _DXVA_Slice_HEVC_Short
{
UINT BSNALunitDataLocation;
UINT SliceBytesInBuffer;
USHORT wBadSliceChopping;
} DXVA_Slice_HEVC_Short, *LPDXVA_Slice_HEVC_Short;
#pragma pack(pop)
#endif

185
compat/windows/dxva_vpx.h Normal file
View File

@ -0,0 +1,185 @@
//------------------------------------------------------------------------------
// Copyright (c) 1999 - 2002, Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
#ifndef __DIRECTX_VA_VPX__
#define __DIRECTX_VA_VPX__
#pragma pack(push, 1)
/* VPx picture entry data structure */
/* One decode surface reference, in the common DXVA PicEntry layout:
 * a 7-bit surface index plus one flag bit, with bPicEntry overlaying
 * both for whole-byte access.  NOTE(review): 0xFF conventionally marks
 * an unused/invalid entry in DXVA — confirm against the VPx spec. */
typedef struct _DXVA_PicEntry_VPx {
union {
struct {
UCHAR Index7Bits : 7;
UCHAR AssociatedFlag : 1;
};
UCHAR bPicEntry;
};
} DXVA_PicEntry_VPx, *LPDXVA_PicEntry_VPx;
/* VP9 segmentation structure */
/* Copy of the VP9 uncompressed-header segmentation parameters:
 * enable/update flags (packed into one byte), segment tree and
 * prediction probabilities, and per-segment feature data and mask
 * (8 segments x 4 features). */
typedef struct _segmentation_VP9 {
union {
struct {
UCHAR enabled : 1;
UCHAR update_map : 1;
UCHAR temporal_update : 1;
UCHAR abs_delta : 1;
UCHAR ReservedSegmentFlags4Bits : 4;
};
UCHAR wSegmentInfoFlags;
};
UCHAR tree_probs[7];
UCHAR pred_probs[3];
SHORT feature_data[8][4];
UCHAR feature_mask[8];
} DXVA_segmentation_VP9;
/* VP9 picture parameters structure */
/* Mirrors DXVA_PicParams_VP9: essentially the VP9 uncompressed frame
 * header plus DXVA surface bookkeeping.  The layout is driver ABI
 * (1-byte packed in this header) — do not reorder or resize members. */
typedef struct _DXVA_PicParams_VP9 {
/* Destination surface for the current frame. */
DXVA_PicEntry_VPx CurrPic;
UCHAR profile;
/* Bit-packed frame header flags. */
union {
struct {
USHORT frame_type : 1;
USHORT show_frame : 1;
USHORT error_resilient_mode : 1;
USHORT subsampling_x : 1;
USHORT subsampling_y : 1;
USHORT extra_plane : 1;
USHORT refresh_frame_context : 1;
USHORT frame_parallel_decoding_mode : 1;
USHORT intra_only : 1;
USHORT frame_context_idx : 2;
USHORT reset_frame_context : 2;
USHORT allow_high_precision_mv : 1;
USHORT ReservedFormatInfo2Bits : 2;
};
USHORT wFormatAndPictureInfoFlags;
};
UINT width;
UINT height;
UCHAR BitDepthMinus8Luma;
UCHAR BitDepthMinus8Chroma;
UCHAR interp_filter;
UCHAR Reserved8Bits;
/* All eight reference slots with their coded sizes, plus the three
 * references (last/golden/altref order per VP9) used by this frame. */
DXVA_PicEntry_VPx ref_frame_map[8];
UINT ref_frame_coded_width[8];
UINT ref_frame_coded_height[8];
DXVA_PicEntry_VPx frame_refs[3];
CHAR ref_frame_sign_bias[4];
/* Loop filter parameters and per-reference/mode deltas. */
CHAR filter_level;
CHAR sharpness_level;
union {
struct {
UCHAR mode_ref_delta_enabled : 1;
UCHAR mode_ref_delta_update : 1;
UCHAR use_prev_in_find_mv_refs : 1;
UCHAR ReservedControlInfo5Bits : 5;
};
UCHAR wControlInfoFlags;
};
CHAR ref_deltas[4];
CHAR mode_deltas[2];
/* Quantizer base index and per-plane deltas. */
SHORT base_qindex;
CHAR y_dc_delta_q;
CHAR uv_dc_delta_q;
CHAR uv_ac_delta_q;
DXVA_segmentation_VP9 stVP9Segments;
/* Tiling and header partition sizes. */
UCHAR log2_tile_cols;
UCHAR log2_tile_rows;
USHORT uncompressed_header_size_byte_aligned;
USHORT first_partition_size;
USHORT Reserved16Bits;
UINT Reserved32Bits;
/* Arbitrary host-chosen tag echoed back in decode status reports. */
UINT StatusReportFeedbackNumber;
} DXVA_PicParams_VP9, *LPDXVA_PicParams_VP9;
/* VP8 segmentation structure */
/* Copy of the VP8 frame-header segmentation parameters: enable/update
 * flags (packed into one byte), per-segment feature data (quantizer and
 * loop filter rows x 4 segments), and segment tree probabilities. */
typedef struct _segmentation_VP8 {
union {
struct {
UCHAR segmentation_enabled : 1;
UCHAR update_mb_segmentation_map : 1;
UCHAR update_mb_segmentation_data : 1;
UCHAR mb_segement_abs_delta : 1;
UCHAR ReservedSegmentFlags4Bits : 4;
};
UCHAR wSegmentFlags;
};
CHAR segment_feature_data[2][4];
UCHAR mb_segment_tree_probs[3];
} DXVA_segmentation_VP8;
/* VP8 picture parameters structure */
/* Mirrors DXVA_PicParams_VP8: the VP8 frame header (frame tag, loop
 * filter, quantizer, entropy coder state) plus DXVA surface
 * bookkeeping.  Layout is driver ABI (1-byte packed in this header) —
 * do not reorder or resize members. */
typedef struct _DXVA_PicParams_VP8 {
UINT first_part_size;
UINT width;
UINT height;
/* Destination surface for the current frame. */
DXVA_PicEntry_VPx CurrPic;
/* Bit-packed fields from the VP8 frame tag. */
union {
struct {
UCHAR frame_type : 1;
UCHAR version : 3;
UCHAR show_frame : 1;
UCHAR clamp_type : 1;
UCHAR ReservedFrameTag3Bits : 2;
};
UCHAR wFrameTagFlags;
};
DXVA_segmentation_VP8 stVP8Segments;
/* Loop filter parameters and per-reference/mode filter deltas. */
UCHAR filter_type;
UCHAR filter_level;
UCHAR sharpness_level;
UCHAR mode_ref_lf_delta_enabled;
UCHAR mode_ref_lf_delta_update;
CHAR ref_lf_deltas[4];
CHAR mode_lf_deltas[4];
UCHAR log2_nbr_of_dct_partitions;
/* Quantizer index and per-plane deltas. */
UCHAR base_qindex;
CHAR y1dc_delta_q;
CHAR y2dc_delta_q;
CHAR y2ac_delta_q;
CHAR uvdc_delta_q;
CHAR uvac_delta_q;
/* The three reference frame surfaces and their sign biases. */
DXVA_PicEntry_VPx alt_fb_idx;
DXVA_PicEntry_VPx gld_fb_idx;
DXVA_PicEntry_VPx lst_fb_idx;
UCHAR ref_frame_sign_bias_golden;
UCHAR ref_frame_sign_bias_altref;
/* Entropy coder state: coefficient, mode and motion-vector
 * probabilities as carried in the VP8 header. */
UCHAR refresh_entropy_probs;
UCHAR vp8_coef_update_probs[4][8][3][11];
UCHAR mb_no_coeff_skip;
UCHAR prob_skip_false;
UCHAR prob_intra;
UCHAR prob_last;
UCHAR prob_golden;
UCHAR intra_16x16_prob[4];
UCHAR intra_chroma_prob[3];
UCHAR vp8_mv_update_probs[2][19];
USHORT ReservedBits1;
USHORT ReservedBits2;
USHORT ReservedBits3;
/* Arbitrary host-chosen tag echoed back in decode status reports. */
UINT StatusReportFeedbackNumber;
} DXVA_PicParams_VP8, *LPDXVA_PicParams_VP8;
/* VPx slice control data structure - short form */
/* Locates one compressed frame's data inside the bitstream buffer
 * (offset + byte count).  NOTE(review): wBadSliceChopping presumably
 * flags data split across buffers, per DXVA convention — confirm. */
typedef struct _DXVA_Slice_VPx_Short {
UINT BSNALunitDataLocation;
UINT SliceBytesInBuffer;
USHORT wBadSliceChopping;
} DXVA_Slice_VPx_Short, *LPDXVA_Slice_VPx_Short;
/* VPx status reporting data structure */
/* Filled in by the accelerator when the host requests decode status;
 * StatusReportFeedbackNumber echoes the tag from the picture
 * parameters so reports can be matched to frames.  NOTE(review):
 * bStatus is presumably 0 on successful decode, per the usual DXVA
 * status-report convention — confirm against the spec. */
typedef struct _DXVA_Status_VPx {
UINT StatusReportFeedbackNumber;
DXVA_PicEntry_VPx CurrPic;
UCHAR bBufType;
UCHAR bStatus;
UCHAR bReserved8Bits;
USHORT wNumMbsAffected;
} DXVA_Status_VPx, *LPDXVA_Status_VPx;
#pragma pack(pop)
#endif

115
configure vendored
View File

@ -281,6 +281,7 @@ External library support:
--enable-libtheora enable Theora encoding via libtheora [no]
--enable-libtls enable LibreSSL (via libtls), needed for https support
if openssl, gnutls or mbedtls is not used [no]
--enable-libtorch enable Torch as one DNN backend [no]
--enable-libtwolame enable MP2 encoding via libtwolame [no]
--enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
--enable-libv4l2 enable libv4l2/v4l-utils [no]
@ -386,7 +387,9 @@ Toolchain options:
--windres=WINDRES use windows resource compiler WINDRES [$windres_default]
--x86asmexe=EXE use nasm-compatible assembler EXE [$x86asmexe_default]
--cc=CC use C compiler CC [$cc_default]
--stdc=STDC use C standard STDC [$stdc_default]
--cxx=CXX use C compiler CXX [$cxx_default]
--stdcxx=STDCXX use C standard STDCXX [$stdcxx_default]
--objcc=OCC use ObjC compiler OCC [$cc_default]
--dep-cc=DEPCC use dependency generator DEPCC [$cc_default]
--nvcc=NVCC use Nvidia CUDA compiler NVCC or clang [$nvcc_default]
@ -1453,6 +1456,33 @@ test_cflags_cc(){
EOF
}
check_cflags_cc(){
# Compile-check C flags $1 (against header $2 / preprocessor condition $3
# via test_cflags_cc) and append them to CFLAGS only on success.
log check_cflags_cc "$@"
flags=$1
test_cflags_cc "$@" && add_cflags $flags
}
test_cxxflags_cc(){
# Try compiling a C++ snippet with flags $1 (run through $cflags_filter)
# that includes header $2 and fails the preprocessor check unless
# condition $3 holds.  Succeeds only if both flags and condition pass.
log test_cxxflags_cc "$@"
flags=$1
header=$2
condition=$3
shift 3
set -- $($cflags_filter "$flags")
test_cxx "$@" <<EOF
#include <$header>
#if !($condition)
#error "unsatisfied condition: $condition"
#endif
EOF
}
check_cxxflags_cc(){
# Compile-check C++ flags $1 (against header $2 / condition $3 via
# test_cxxflags_cc) and append them to CXXFLAGS only on success.
log check_cxxflags_cc "$@"
flags=$1
test_cxxflags_cc "$@" && add_cxxflags $flags
}
check_lib(){
log check_lib "$@"
name="$1"
@ -1694,6 +1724,27 @@ int x;
EOF
}
test_host_cflags_cc(){
# Same as test_cflags_cc but for the host compiler: compile a snippet
# with flags $1 (run through $host_cflags_filter) that includes header
# $2 and errors out unless preprocessor condition $3 holds.
log test_host_cflags_cc "$@"
flags=$1
header=$2
condition=$3
shift 3
set -- $($host_cflags_filter "$flags")
test_host_cc "$@" <<EOF
#include <$header>
#if !($condition)
#error "unsatisfied condition: $condition"
#endif
EOF
}
check_host_cflags_cc(){
# Compile-check host-compiler flags $1 (against header $2 / condition $3
# via test_host_cflags_cc) and append them to host CFLAGS on success.
log check_host_cflags_cc "$@"
flags=$1
test_host_cflags_cc "$@" && add_host_cflags $flags
}
test_host_cpp_condition(){
log test_host_cpp_condition "$@"
header=$1
@ -1905,6 +1956,7 @@ EXTERNAL_LIBRARY_LIST="
libtensorflow
libtesseract
libtheora
libtorch
libtwolame
libuavs3d
libv4l2
@ -2403,6 +2455,9 @@ TOOLCHAIN_FEATURES="
TYPES_LIST="
DPI_AWARENESS_CONTEXT
IDXGIOutput5
DXVA_PicParams_AV1
DXVA_PicParams_HEVC
DXVA_PicParams_VP9
kCMVideoCodecType_HEVC
kCMVideoCodecType_HEVCWithAlpha
kCMVideoCodecType_VP9
@ -2531,6 +2586,7 @@ CONFIG_EXTRA="
jpegtables
lgplv3
libx262
libx264_hdr10
llauddsp
llviddsp
llvidencdsp
@ -2650,6 +2706,8 @@ CMDLINE_SET="
random_seed
ranlib
samples
stdc
stdcxx
strip
sws_max_filter_size
sysinclude
@ -2785,7 +2843,7 @@ cbs_vp9_select="cbs"
deflate_wrapper_deps="zlib"
dirac_parse_select="golomb"
dovi_rpu_select="golomb"
dnn_suggest="libtensorflow libopenvino"
dnn_suggest="libtensorflow libopenvino libtorch"
dnn_deps="avformat swscale"
error_resilience_select="me_cmp"
evcparse_select="golomb"
@ -3079,13 +3137,13 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
vulkan_deps="threads"
vulkan_deps_any="libdl LoadLibrary"
av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
av1_d3d11va_hwaccel_deps="d3d11va"
av1_d3d11va_hwaccel_select="av1_decoder"
av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
av1_d3d11va2_hwaccel_deps="d3d11va"
av1_d3d11va2_hwaccel_select="av1_decoder"
av1_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_AV1"
av1_d3d12va_hwaccel_deps="d3d12va"
av1_d3d12va_hwaccel_select="av1_decoder"
av1_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_AV1"
av1_dxva2_hwaccel_deps="dxva2"
av1_dxva2_hwaccel_select="av1_decoder"
av1_nvdec_hwaccel_deps="nvdec CUVIDAV1PICPARAMS"
av1_nvdec_hwaccel_select="av1_decoder"
@ -3117,13 +3175,13 @@ h264_videotoolbox_hwaccel_deps="videotoolbox"
h264_videotoolbox_hwaccel_select="h264_decoder"
h264_vulkan_hwaccel_deps="vulkan"
h264_vulkan_hwaccel_select="h264_decoder"
hevc_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_HEVC"
hevc_d3d11va_hwaccel_deps="d3d11va"
hevc_d3d11va_hwaccel_select="hevc_decoder"
hevc_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_HEVC"
hevc_d3d11va2_hwaccel_deps="d3d11va"
hevc_d3d11va2_hwaccel_select="hevc_decoder"
hevc_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_HEVC"
hevc_d3d12va_hwaccel_deps="d3d12va"
hevc_d3d12va_hwaccel_select="hevc_decoder"
hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
hevc_dxva2_hwaccel_deps="dxva2"
hevc_dxva2_hwaccel_select="hevc_decoder"
hevc_nvdec_hwaccel_deps="nvdec"
hevc_nvdec_hwaccel_select="hevc_decoder"
@ -3189,13 +3247,13 @@ vp8_nvdec_hwaccel_deps="nvdec"
vp8_nvdec_hwaccel_select="vp8_decoder"
vp8_vaapi_hwaccel_deps="vaapi"
vp8_vaapi_hwaccel_select="vp8_decoder"
vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
vp9_d3d11va_hwaccel_deps="d3d11va"
vp9_d3d11va_hwaccel_select="vp9_decoder"
vp9_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
vp9_d3d11va2_hwaccel_deps="d3d11va"
vp9_d3d11va2_hwaccel_select="vp9_decoder"
vp9_d3d12va_hwaccel_deps="d3d12va DXVA_PicParams_VP9"
vp9_d3d12va_hwaccel_deps="d3d12va"
vp9_d3d12va_hwaccel_select="vp9_decoder"
vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9"
vp9_dxva2_hwaccel_deps="dxva2"
vp9_dxva2_hwaccel_select="vp9_decoder"
vp9_nvdec_hwaccel_deps="nvdec"
vp9_nvdec_hwaccel_select="vp9_decoder"
@ -3484,7 +3542,7 @@ libwebp_encoder_deps="libwebp"
libwebp_anim_encoder_deps="libwebp"
libx262_encoder_deps="libx262"
libx264_encoder_deps="libx264"
libx264_encoder_select="atsc_a53"
libx264_encoder_select="atsc_a53 golomb"
libx264rgb_encoder_deps="libx264"
libx264rgb_encoder_select="libx264_encoder"
libx265_encoder_deps="libx265"
@ -3656,6 +3714,8 @@ xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
xv_outdev_deps="xlib_xv xlib_x11 xlib_xext"
# protocols
android_content_protocol_deps="jni"
android_content_protocol_select="file_protocol"
async_protocol_deps="threads"
bluray_protocol_deps="libbluray"
ffrtmpcrypt_protocol_conflict="librtmp_protocol"
@ -3978,6 +4038,8 @@ mandir_default='${prefix}/share/man'
# toolchain
ar_default="ar"
cc_default="gcc"
stdc_default="c17"
stdcxx_default="c++11"
cxx_default="g++"
host_cc_default="gcc"
doxygen_default="doxygen"
@ -4585,7 +4647,7 @@ if enabled cuda_nvcc; then
fi
set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
target_exec x86asmexe metalcc metallib
target_exec x86asmexe metalcc metallib stdc stdcxx
enabled cross_compile || host_cc_default=$cc
set_default host_cc
@ -4755,7 +4817,7 @@ icl_flags(){
# Despite what Intel's documentation says -Wall, which is supported
# on Windows, does enable remarks so disable them here.
-Wall) echo $flag -Qdiag-disable:remark ;;
-std=c11) echo -Qstd=c11 ;;
-std=$stdc) echo -Qstd=$stdc ;;
-flto*) echo -ipo ;;
esac
done
@ -4803,7 +4865,7 @@ suncc_flags(){
athlon*) echo -xarch=pentium_proa ;;
esac
;;
-std=c11) echo -xc11 ;;
-std=$stdc) echo -x$stdc ;;
-fomit-frame-pointer) echo -xregs=frameptr ;;
-fPIC) echo -KPIC -xcode=pic32 ;;
-W*,*) echo $flag ;;
@ -4892,8 +4954,8 @@ probe_cc(){
_type=suncc
_ident=$($_cc -V 2>&1 | head -n1 | cut -d' ' -f 2-)
_DEPCMD='$(DEP$(1)) $(DEP$(1)FLAGS) $($(1)DEP_FLAGS) $< | sed -e "1s,^.*: ,$@: ," -e "\$$!s,\$$, \\\," -e "1!s,^.*: , ," > $(@:.o=.d)'
_DEPFLAGS='-xM1 -xc11'
_ldflags='-std=c11'
_DEPFLAGS='-xM1 -x$stdc'
_ldflags='-std=$stdc'
_cflags_speed='-O5'
_cflags_size='-O5 -xspace'
_flags_filter=suncc_flags
@ -5524,18 +5586,21 @@ fi
add_cppflags -D_ISOC11_SOURCE
add_cxxflags -D__STDC_CONSTANT_MACROS
check_cxxflags -std=c++11 || check_cxxflags -std=c++0x
check_cxxflags_cc -std=$stdcxx ctype.h "__cplusplus >= 201103L" ||
{ check_cxxflags -std=c++11 && stdcxx="c++11" || { check_cxxflags -std=c++0x && stdcxx="c++0x"; }; }
# some compilers silently accept -std=c11, so we also need to check that the
# version macro is defined properly
test_cflags_cc -std=c11 ctype.h "__STDC_VERSION__ >= 201112L" &&
add_cflags -std=c11 || die "Compiler lacks C11 support"
check_cflags_cc -std=$stdc ctype.h "__STDC_VERSION__ >= 201112L" ||
{ check_cflags_cc -std=c11 ctype.h "__STDC_VERSION__ >= 201112L" && stdc="c11" || die "Compiler lacks C11 support"; }
check_cppflags -D_FILE_OFFSET_BITS=64
check_cppflags -D_LARGEFILE_SOURCE
add_host_cppflags -D_ISOC11_SOURCE
check_host_cflags -std=c11
check_host_cflags_cc -std=$stdc ctype.h "__STDC_VERSION__ >= 201112L" ||
check_host_cflags_cc -std=c11 ctype.h "__STDC_VERSION__ >= 201112L" || die "Host compiler lacks C11 support"
check_host_cflags -Wall
check_host_cflags $host_cflags_speed
@ -6877,13 +6942,14 @@ enabled libsmbclient && { check_pkg_config libsmbclient smbclient libsmbcli
enabled libsnappy && require libsnappy snappy-c.h snappy_compress -lsnappy -lstdc++
enabled libsoxr && require libsoxr soxr.h soxr_create -lsoxr
enabled libssh && require_pkg_config libssh "libssh >= 0.6.0" libssh/sftp.h sftp_init
enabled libspeex && require_pkg_config libspeex speex speex/speex.h speex_decoder_init
enabled libspeex && require libspeex speex/speex.h speex_decoder_init -lspeex
enabled libsrt && require_pkg_config libsrt "srt >= 1.3.0" srt/srt.h srt_socket
enabled libsvtav1 && require_pkg_config libsvtav1 "SvtAv1Enc >= 0.9.0" EbSvtAv1Enc.h svt_av1_enc_init_handle
enabled libtensorflow && require libtensorflow tensorflow/c/c_api.h TF_Version -ltensorflow
enabled libtesseract && require_pkg_config libtesseract tesseract tesseract/capi.h TessBaseAPICreate
enabled libtheora && require libtheora theora/theoraenc.h th_info_init -ltheoraenc -ltheoradec -logg
enabled libtls && require_pkg_config libtls libtls tls.h tls_configure
enabled libtorch && check_cxxflags -std=c++17 && require_cpp libtorch torch/torch.h "torch::Tensor" -ltorch -lc10 -ltorch_cpu -lstdc++ -lpthread
enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame &&
{ check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
@ -6925,6 +6991,7 @@ enabled libx264 && require_pkg_config libx264 x264 "stdint.h x264.h" x
require_cpp_condition libx264 x264.h "X264_BUILD >= 122" && {
[ "$toolchain" != "msvc" ] ||
require_cpp_condition libx264 x264.h "X264_BUILD >= 158"; } &&
check_cpp_condition libx264_hdr10 x264.h "X264_BUILD >= 163" &&
check_cpp_condition libx262 x264.h "X264_MPEG2"
enabled libx265 && require_pkg_config libx265 x265 x265.h x265_api_get &&
require_cpp_condition libx265 x265.h "X265_BUILD >= 89"

View File

@ -2,6 +2,32 @@ The last version increases of all libraries were on 2024-03-07
API changes, most recent first:
2024-03-xx - xxxxxxxxxx - lavu 59.6.100 - film_grain_params.h
Add av_film_grain_params_select().
2024-03-xx - xxxxxxxxxx - lavu 59.5.100 - film_grain_params.h
Add AVFilmGrainParams.color_range, color_primaries, color_trc, color_space,
width, height, subsampling_x, subsampling_y, bit_depth_luma and
bit_depth_chroma. Deprecate the corresponding fields from
AVFilmGrainH274Params.
2024-03-xx - xxxxxxxxxx - lavc 61.3.100 - jni.h
Add av_jni_set_android_app_ctx() and av_jni_get_android_app_ctx().
2024-03-22 - xxxxxxxxxx - lavu 59.4.100 - frame.h
Constified the first-level pointee of av_frame_side_data_get()
and renamed it to av_frame_side_data_get_c(). From now on,
av_frame_side_data_get() is a wrapper around av_frame_side_data_get_c()
that accepts AVFrameSideData * const *sd.
2024-03-xx - xxxxxxxxxx - lavc 61.2.100 - avcodec.h
Add AVCodecContext.[nb_]decoded_side_data.
2024-03-xx - xxxxxxxxxx - lavu 59.3.100 - frame.h
Add av_frame_side_data_free(), av_frame_side_data_new(),
av_frame_side_data_clone(), av_frame_side_data_get() as well
as AV_FRAME_SIDE_DATA_FLAG_UNIQUE.
2024-03-xx - xxxxxxxxxx - lavu 59.2.100 - channel_layout.h
Add AV_CHANNEL_LAYOUT_RETYPE_FLAG_CANONICAL.

View File

@ -1576,19 +1576,35 @@ This image format is used to store astronomical data.
For more information regarding the format, visit
@url{https://fits.gsfc.nasa.gov}.
@section flv
@section flac
Raw FLAC audio muxer.
This muxer accepts exactly one FLAC audio stream. Additionally, it is possible to add
images with disposition @samp{attached_pic}.
@subsection Options
@table @option
@item write_header @var{bool}
write the file header if set to @code{true}, default is @code{true}
@end table
@subsection Example
Use @command{ffmpeg} to store the audio stream from an input file,
together with several pictures used with @samp{attached_pic}
disposition:
@example
ffmpeg -i INPUT -i pic1.png -i pic2.jpg -map 0:a -map 1 -map 2 -disposition:v attached_pic OUTPUT
@end example
@section flv
Adobe Flash Video Format muxer.
This muxer accepts the following options:
@subsection Options
@table @option
@item flvflags @var{flags}
Possible values:
@table @samp
@item aac_seq_header_detect
Place AAC sequence header based on audio stream data.
@ -1729,24 +1745,26 @@ See also the @ref{framehash} and @ref{md5} muxers.
@anchor{gif}
@section gif
Animated GIF muxer.
It accepts the following options:
Note that the GIF format has a very large time base: the delay between two frames can
therefore not be smaller than one centi second.
@subsection Options
@table @option
@item loop
@item loop @var{bool}
Set the number of times to loop the output. Use @code{-1} for no loop, @code{0}
for looping indefinitely (default).
@item final_delay
@item final_delay @var{delay}
Force the delay (expressed in centiseconds) after the last frame. Each frame
ends with a delay until the next frame. The default is @code{-1}, which is a
special value to tell the muxer to re-use the previous delay. In case of a
loop, you might want to customize this value to mark a pause for instance.
@end table
For example, to encode a gif looping 10 times, with a 5 seconds delay between
@subsection Example
Encode a gif looping 10 times, with a 5 seconds delay between
the loops:
@example
ffmpeg -i INPUT -loop 10 -final_delay 500 out.gif
@ -1758,8 +1776,17 @@ force the @ref{image2} muxer:
ffmpeg -i INPUT -c:v gif -f image2 "out%d.gif"
@end example
Note 2: the GIF format has a very large time base: the delay between two frames
can therefore not be smaller than one centi second.
@section gxf
General eXchange Format (GXF) muxer.
GXF was developed by Grass Valley Group, then standardized by SMPTE as SMPTE
360M and was extended in SMPTE RDD 14-2007 to include high-definition video
resolutions.
It accepts at most one video stream with codec @samp{mjpeg}, or
@samp{mpeg1video}, or @samp{mpeg2video}, or @samp{dvvideo} with resolution
@samp{512x480} or @samp{608x576}, and several audio streams with rate 48000Hz
and codec @samp{pcm16_le}.
@anchor{hash}
@section hash
@ -1806,6 +1833,45 @@ ffmpeg -i INPUT -f hash -hash md5 -
See also the @ref{framehash} muxer.
@anchor{hds}
@section hds
HTTP Dynamic Streaming (HDS) muxer.
HTTP dynamic streaming, or HDS, is an adaptive bitrate streaming method
developed by Adobe. HDS delivers MP4 video content over HTTP connections. HDS
can be used for on-demand streaming or live streaming.
This muxer creates an .f4m (Adobe Flash Media Manifest File) manifest, an .abst
(Adobe Bootstrap File) for each stream, and segment files in a directory
specified as the output.
These need to be accessed by an HDS player through HTTPS for it to be able to
perform playback on the generated stream.
@subsection Options
@table @option
@item extra_window_size @var{int}
number of fragments kept outside of the manifest before removing from disk
@item min_frag_duration @var{microseconds}
minimum fragment duration (in microseconds), default value is 1 second
(@code{10000000})
@item remove_at_exit @var{bool}
remove all fragments when finished when set to @code{true}
@item window_size @var{int}
number of fragments kept in the manifest, if set to a value different from
@code{0}. By default all segments are kept in the output directory.
@end table
@subsection Example
Use @command{ffmpeg} to generate HDS files to the @file{output.hds} directory in
real-time rate:
@example
ffmpeg -re -i INPUT -f hds -b:v 200k output.hds
@end example
@anchor{hls}
@section hls

1
ffbuild/.gitignore vendored
View File

@ -5,3 +5,4 @@
/config.log
/config.mak
/config.sh
/config.out

View File

@ -1207,6 +1207,19 @@ static int dec_open(DecoderPriv *dp, AVDictionary **dec_opts,
return ret;
}
if (dp->dec_ctx->hw_device_ctx) {
// Update decoder extra_hw_frames option to account for the
// frames held in queues inside the ffmpeg utility. This is
// called after avcodec_open2() because the user-set value of
// extra_hw_frames becomes valid in there, and we need to add
// this on top of it.
int extra_frames = DEFAULT_FRAME_THREAD_QUEUE_SIZE;
if (dp->dec_ctx->extra_hw_frames >= 0)
dp->dec_ctx->extra_hw_frames += extra_frames;
else
dp->dec_ctx->extra_hw_frames = extra_frames;
}
ret = check_avoptions(*dec_opts);
if (ret < 0)
return ret;

View File

@ -246,6 +246,21 @@ int enc_open(void *opaque, const AVFrame *frame)
enc_ctx->colorspace = frame->colorspace;
enc_ctx->chroma_sample_location = frame->chroma_location;
for (int i = 0; i < frame->nb_side_data; i++) {
ret = av_frame_side_data_clone(
&enc_ctx->decoded_side_data, &enc_ctx->nb_decoded_side_data,
frame->side_data[i], AV_FRAME_SIDE_DATA_FLAG_UNIQUE);
if (ret < 0) {
av_frame_side_data_free(
&enc_ctx->decoded_side_data,
&enc_ctx->nb_decoded_side_data);
av_log(NULL, AV_LOG_ERROR,
"failed to configure video encoder: %s!\n",
av_err2str(ret));
return ret;
}
}
if (enc_ctx->flags & (AV_CODEC_FLAG_INTERLACED_DCT | AV_CODEC_FLAG_INTERLACED_ME) ||
(frame->flags & AV_FRAME_FLAG_INTERLACED)
#if FFMPEG_OPT_TOP
@ -631,7 +646,6 @@ static int encode_frame(OutputFile *of, OutputStream *ost, AVFrame *frame,
if (frame) {
FrameData *fd = frame_data(frame);
fd = frame_data(frame);
if (!fd)
return AVERROR(ENOMEM);

View File

@ -365,7 +365,21 @@ static int queue_alloc(ThreadQueue **ptq, unsigned nb_streams, unsigned queue_si
ThreadQueue *tq;
ObjPool *op;
queue_size = queue_size > 0 ? queue_size : 8;
if (queue_size <= 0) {
if (type == QUEUE_FRAMES)
queue_size = DEFAULT_FRAME_THREAD_QUEUE_SIZE;
else
queue_size = DEFAULT_PACKET_THREAD_QUEUE_SIZE;
}
if (type == QUEUE_FRAMES) {
// This queue length is used in the decoder code to ensure that
// there are enough entries in fixed-size frame pools to account
// for frames held in queues inside the ffmpeg utility. If this
// can ever dynamically change then the corresponding decode
// code needs to be updated as well.
av_assert0(queue_size == DEFAULT_FRAME_THREAD_QUEUE_SIZE);
}
op = (type == QUEUE_PACKETS) ? objpool_alloc_packets() :
objpool_alloc_frames();

View File

@ -233,6 +233,18 @@ int sch_add_filtergraph(Scheduler *sch, unsigned nb_inputs, unsigned nb_outputs,
*/
int sch_add_mux(Scheduler *sch, SchThreadFunc func, int (*init)(void *),
void *ctx, int sdp_auto, unsigned thread_queue_size);
/**
* Default size of a packet thread queue. For muxing this can be overridden by
* the thread_queue_size option as passed to a call to sch_add_mux().
*/
#define DEFAULT_PACKET_THREAD_QUEUE_SIZE 8
/**
* Default size of a frame thread queue.
*/
#define DEFAULT_FRAME_THREAD_QUEUE_SIZE 8
/**
* Add a muxed stream for a previously added muxer.
*

View File

@ -2040,6 +2040,8 @@ static int configure_audio_filters(VideoState *is, const char *afilters, int for
goto end;
if (force_output_format) {
av_bprint_clear(&bp);
av_channel_layout_describe_bprint(&is->audio_tgt.ch_layout, &bp);
sample_rates [0] = is->audio_tgt.freq;
if ((ret = av_opt_set_int(filt_asink, "all_channel_counts", 0, AV_OPT_SEARCH_CHILDREN)) < 0)
goto end;

View File

@ -2402,22 +2402,41 @@ static void print_ambient_viewing_environment(WriterContext *w,
static void print_film_grain_params(WriterContext *w,
const AVFilmGrainParams *fgp)
{
const char *color_range, *color_primaries, *color_trc, *color_space;
const char *const film_grain_type_names[] = {
[AV_FILM_GRAIN_PARAMS_NONE] = "none",
[AV_FILM_GRAIN_PARAMS_AV1] = "av1",
[AV_FILM_GRAIN_PARAMS_H274] = "h274",
};
AVBPrint pbuf;
if (!fgp)
if (!fgp || fgp->type >= FF_ARRAY_ELEMS(film_grain_type_names))
return;
color_range = av_color_range_name(fgp->color_range);
color_primaries = av_color_primaries_name(fgp->color_primaries);
color_trc = av_color_transfer_name(fgp->color_trc);
color_space = av_color_space_name(fgp->color_space);
av_bprint_init(&pbuf, 1, AV_BPRINT_SIZE_UNLIMITED);
print_str("type", film_grain_type_names[fgp->type]);
print_fmt("seed", "%"PRIu64, fgp->seed);
print_int("width", fgp->width);
print_int("height", fgp->height);
print_int("subsampling_x", fgp->subsampling_x);
print_int("subsampling_y", fgp->subsampling_y);
print_str("color_range", color_range ? color_range : "unknown");
print_str("color_primaries", color_primaries ? color_primaries : "unknown");
print_str("color_trc", color_trc ? color_trc : "unknown");
print_str("color_space", color_space ? color_space : "unknown");
switch (fgp->type) {
case AV_FILM_GRAIN_PARAMS_NONE:
print_str("type", "none");
break;
case AV_FILM_GRAIN_PARAMS_AV1: {
const AVFilmGrainAOMParams *aom = &fgp->codec.aom;
const int num_ar_coeffs_y = 2 * aom->ar_coeff_lag * (aom->ar_coeff_lag + 1);
const int num_ar_coeffs_uv = num_ar_coeffs_y + !!aom->num_y_points;
print_str("type", "av1");
print_fmt("seed", "%"PRIu64, fgp->seed);
print_int("chroma_scaling_from_luma", aom->chroma_scaling_from_luma);
print_int("scaling_shift", aom->scaling_shift);
print_int("ar_coeff_lag", aom->ar_coeff_lag);
@ -2431,6 +2450,7 @@ static void print_film_grain_params(WriterContext *w,
if (aom->num_y_points) {
writer_print_section_header(w, NULL, SECTION_ID_FRAME_SIDE_DATA_COMPONENT);
print_int("bit_depth_luma", fgp->bit_depth_luma);
print_list_fmt("y_points_value", "%"PRIu8, aom->num_y_points, 1, aom->y_points[idx][0]);
print_list_fmt("y_points_scaling", "%"PRIu8, aom->num_y_points, 1, aom->y_points[idx][1]);
print_list_fmt("ar_coeffs_y", "%"PRId8, num_ar_coeffs_y, 1, aom->ar_coeffs_y[idx]);
@ -2445,6 +2465,7 @@ static void print_film_grain_params(WriterContext *w,
writer_print_section_header(w, NULL, SECTION_ID_FRAME_SIDE_DATA_COMPONENT);
print_int("bit_depth_chroma", fgp->bit_depth_chroma);
print_list_fmt("uv_points_value", "%"PRIu8, aom->num_uv_points[uv], 1, aom->uv_points[uv][idx][0]);
print_list_fmt("uv_points_scaling", "%"PRIu8, aom->num_uv_points[uv], 1, aom->uv_points[uv][idx][1]);
print_list_fmt("ar_coeffs_uv", "%"PRId8, num_ar_coeffs_uv, 1, aom->ar_coeffs_uv[uv][idx]);
@ -2462,17 +2483,7 @@ static void print_film_grain_params(WriterContext *w,
}
case AV_FILM_GRAIN_PARAMS_H274: {
const AVFilmGrainH274Params *h274 = &fgp->codec.h274;
const char *color_range_str = av_color_range_name(h274->color_range);
const char *color_primaries_str = av_color_primaries_name(h274->color_primaries);
const char *color_trc_str = av_color_transfer_name(h274->color_trc);
const char *color_space_str = av_color_space_name(h274->color_space);
print_str("type", "h274");
print_fmt("seed", "%"PRIu64, fgp->seed);
print_int("model_id", h274->model_id);
print_str("color_range", color_range_str ? color_range_str : "unknown");
print_str("color_primaries", color_primaries_str ? color_primaries_str : "unknown");
print_str("color_trc", color_trc_str ? color_trc_str : "unknown");
print_str("color_space", color_space_str ? color_space_str : "unknown");
print_int("blending_mode_id", h274->blending_mode_id);
print_int("log2_scale_factor", h274->log2_scale_factor);
@ -2483,7 +2494,7 @@ static void print_film_grain_params(WriterContext *w,
continue;
writer_print_section_header(w, NULL, SECTION_ID_FRAME_SIDE_DATA_COMPONENT);
print_int(c ? "bit_depth_chroma" : "bit_depth_luma", c ? h274->bit_depth_chroma : h274->bit_depth_luma);
print_int(c ? "bit_depth_chroma" : "bit_depth_luma", c ? fgp->bit_depth_chroma : fgp->bit_depth_luma);
writer_print_section_header(w, NULL, SECTION_ID_FRAME_SIDE_DATA_PIECE_LIST);
for (int i = 0; i < h274->num_intensity_intervals[c]; i++) {

View File

@ -105,7 +105,7 @@ OBJS-$(CONFIG_H264_SEI) += h264_sei.o h2645_sei.o
OBJS-$(CONFIG_HEVCPARSE) += hevc_parse.o hevc_ps.o hevc_data.o \
h2645data.o h2645_parse.o h2645_vui.o
OBJS-$(CONFIG_HEVC_SEI) += hevc_sei.o h2645_sei.o \
dynamic_hdr_vivid.o
dynamic_hdr_vivid.o aom_film_grain.o
OBJS-$(CONFIG_HPELDSP) += hpeldsp.o
OBJS-$(CONFIG_HUFFMAN) += huffman.o
OBJS-$(CONFIG_HUFFYUVDSP) += huffyuvdsp.o
@ -432,7 +432,7 @@ OBJS-$(CONFIG_HDR_ENCODER) += hdrenc.o
OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \
hevc_cabac.o hevc_refs.o hevcpred.o \
hevcdsp.o hevc_filter.o hevc_data.o \
h274.o
h274.o aom_film_grain.o
OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o
OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o
OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o

View File

@ -43,7 +43,7 @@ const uint8_t ff_aac_channel_layout_map[16][16][3] = {
{ { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_SCE, 1, AAC_CHANNEL_BACK }, },
{ { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_BACK }, },
{ { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE }, },
{ { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_FRONT }, { TYPE_CPE, 2, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE }, },
{ { TYPE_SCE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 0, AAC_CHANNEL_FRONT }, { TYPE_CPE, 1, AAC_CHANNEL_SIDE }, { TYPE_CPE, 2, AAC_CHANNEL_BACK }, { TYPE_LFE, 0, AAC_CHANNEL_LFE }, },
{ { 0, } },
{ { 0, } },
{ { 0, } },

View File

@ -577,7 +577,7 @@ static ChannelElement *get_che(AACDecContext *ac, int type, int elem_id)
{
/* For PCE based channel configurations map the channels solely based
* on tags. */
if (!ac->oc[1].m4ac.chan_config) {
if (!ac->oc[1].m4ac.chan_config || ac->oc[1].m4ac.pce) {
return ac->tag_che_map[type][elem_id];
}
// Allow single CPE stereo files to be signalled with mono configuration.
@ -3219,7 +3219,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, AVFrame *frame,
} else {
err = output_configure(ac, layout_map, tags, OC_TRIAL_PCE, 1);
if (!err)
ac->oc[1].m4ac.chan_config = 0;
ac->oc[1].m4ac.pce = 1;
pce_found = 1;
}
break;

548
libavcodec/aom_film_grain.c Normal file
View File

@ -0,0 +1,548 @@
/*
* AOM film grain synthesis
* Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* AOM film grain synthesis.
* @author Niklas Haas <ffmpeg@haasn.xyz>
*/
#include "libavutil/avassert.h"
#include "libavutil/imgutils.h"
#include "aom_film_grain.h"
#include "get_bits.h"
// Common/shared helpers (not dependent on BIT_DEPTH)
/**
 * Advance the 16-bit LFSR pseudo-random generator used by AV1 film grain
 * and return its top `bits` bits.
 *
 * The feedback bit is the XOR of taps 0, 1, 3 and 12 of the current state;
 * the state shifts right by one with the feedback bit inserted at bit 15.
 */
static inline int get_random_number(const int bits, unsigned *const state) {
    const unsigned prev     = *state;
    const unsigned feedback = (prev ^ (prev >> 1) ^ (prev >> 3) ^ (prev >> 12)) & 1;
    const unsigned next     = (prev >> 1) | (feedback << 15);

    *state = next;
    return (next >> (16 - bits)) & ((1u << bits) - 1);
}
/**
 * Divide x by 2^shift with rounding to nearest (bias of half the divisor
 * added before the arithmetic shift).
 */
static inline int round2(const int x, const uint64_t shift) {
    const int bias = (1 << shift) >> 1;
    return (x + bias) >> shift;
}
// Fixed dimensions of the intermediate grain templates and the processing
// block size used by the AV1 film grain synthesis process.
enum {
    GRAIN_WIDTH = 82,      // luma grain template width
    GRAIN_HEIGHT = 73,     // luma grain template height
    SUB_GRAIN_WIDTH = 44,  // chroma template width when horizontally subsampled
    SUB_GRAIN_HEIGHT = 38, // chroma template height when vertically subsampled
    FG_BLOCK_SIZE = 32,    // grain is applied in 32x32 (luma) blocks
};
static const int16_t gaussian_sequence[2048];
#define BIT_DEPTH 16
#include "aom_film_grain_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 8
#include "aom_film_grain_template.c"
#undef BIT_DEPTH
/**
 * Synthesize AV1 film grain on top of `in` and store the result in `out`.
 *
 * `out` must already be allocated with the same size and pixel format as
 * `in`. Planes whose scaling function has no points receive no grain and
 * are copied through unmodified.
 *
 * @return 0 on success, AVERROR_INVALIDDATA for pixel formats the AV1 spec
 *         defines no film grain synthesis for.
 */
int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
                            const AVFilmGrainParams *params)
{
    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
    int subx, suby, pxstep, depth;

    av_assert0(out->format == in->format);
    av_assert0(params->type == AV_FILM_GRAIN_PARAMS_AV1);

    /* Validate the pixel format before touching any planes: the AV1 spec
     * only defines film grain synthesis for 8-12 bit planar YUV/gray, and
     * the plane copies below would otherwise be performed for formats whose
     * chroma plane pointers may be NULL (e.g. packed formats). */
    switch (in->format) {
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_YUV420P:
    case AV_PIX_FMT_YUV422P:
    case AV_PIX_FMT_YUV444P:
    case AV_PIX_FMT_YUVJ420P:
    case AV_PIX_FMT_YUVJ422P:
    case AV_PIX_FMT_YUVJ444P:
        depth = 8;
        break;
    case AV_PIX_FMT_GRAY9:
    case AV_PIX_FMT_YUV420P9:
    case AV_PIX_FMT_YUV422P9:
    case AV_PIX_FMT_YUV444P9:
        depth = 9;
        break;
    case AV_PIX_FMT_GRAY10:
    case AV_PIX_FMT_YUV420P10:
    case AV_PIX_FMT_YUV422P10:
    case AV_PIX_FMT_YUV444P10:
        depth = 10;
        break;
    case AV_PIX_FMT_GRAY12:
    case AV_PIX_FMT_YUV420P12:
    case AV_PIX_FMT_YUV422P12:
    case AV_PIX_FMT_YUV444P12:
        depth = 12;
        break;
    default:
        return AVERROR_INVALIDDATA;
    }
    if (!desc)
        return AVERROR_INVALIDDATA;

    subx   = desc->log2_chroma_w;
    suby   = desc->log2_chroma_h;
    pxstep = desc->comp[0].step;

    // Copy over the non-modified planes (no scaling points => no grain).
    if (!data->num_y_points) {
        av_image_copy_plane(out->data[0], out->linesize[0],
                            in->data[0], in->linesize[0],
                            out->width * pxstep, out->height);
    }
    for (int uv = 0; uv < 2; uv++) {
        if (!data->num_uv_points[uv]) {
            av_image_copy_plane(out->data[1+uv], out->linesize[1+uv],
                                in->data[1+uv], in->linesize[1+uv],
                                AV_CEIL_RSHIFT(out->width, subx) * pxstep,
                                AV_CEIL_RSHIFT(out->height, suby));
        }
    }

    if (depth == 8)
        return apply_film_grain_8(out, in, params);
    return apply_film_grain_16(out, in, params, depth);
}
/**
 * Parse AFGS1 film grain parameter sets from an ITU-T T.35 metadata payload.
 *
 * Decodes the global enable flag plus up to 8 parameter sets (addressed by a
 * 3-bit set index) into `s`. A set may transmit its scaling functions
 * directly, or predict them from a previously transmitted set (`ref`).
 *
 * @return 0 on success (including when the payload disables film grain);
 *         on any bitstream inconsistency, `s` is fully zeroed and
 *         AVERROR_INVALIDDATA is returned; a negative error code may also
 *         come from bitreader initialization.
 */
int ff_aom_parse_film_grain_sets(AVFilmGrainAFGS1Params *s,
                                 const uint8_t *payload, int payload_size)
{
    GetBitContext gbc, *gb = &gbc;
    AVFilmGrainAOMParams *aom;
    AVFilmGrainParams *fgp, *ref = NULL;
    int ret, num_sets, n, i, uv, num_y_coeffs, update_grain, luma_only;

    ret = init_get_bits8(gb, payload, payload_size);
    if (ret < 0)
        return ret;

    s->enable = get_bits1(gb);
    if (!s->enable)
        return 0;

    skip_bits(gb, 4); // reserved
    num_sets = get_bits(gb, 3) + 1;
    for (n = 0; n < num_sets; n++) {
        int payload_4byte, payload_size, set_idx, apply_units_log2, vsc_flag;
        int predict_scaling, predict_y_scaling, predict_uv_scaling[2];
        int payload_bits, start_position;

        start_position = get_bits_count(gb);
        payload_4byte = get_bits1(gb);
        payload_size = get_bits(gb, payload_4byte ? 2 : 8);
        set_idx = get_bits(gb, 3); // which of the 8 stored sets this updates
        fgp = &s->sets[set_idx];
        aom = &fgp->codec.aom;
        fgp->type = get_bits1(gb) ? AV_FILM_GRAIN_PARAMS_AV1 : AV_FILM_GRAIN_PARAMS_NONE;
        if (!fgp->type)
            continue;

        fgp->seed = get_bits(gb, 16);
        update_grain = get_bits1(gb);
        if (!update_grain)
            continue; // keep the previously stored parameters for this set

        // Target dimensions, coded in units of 2^apply_units_log2 pixels.
        apply_units_log2 = get_bits(gb, 4);
        fgp->width = get_bits(gb, 12) << apply_units_log2;
        fgp->height = get_bits(gb, 12) << apply_units_log2;
        luma_only = get_bits1(gb);
        if (luma_only) {
            fgp->subsampling_x = fgp->subsampling_y = 0;
        } else {
            fgp->subsampling_x = get_bits1(gb);
            fgp->subsampling_y = get_bits1(gb);
        }

        // Reset video signal characteristics; optionally overridden below.
        fgp->bit_depth_luma = fgp->bit_depth_chroma = 0;
        fgp->color_primaries = AVCOL_PRI_UNSPECIFIED;
        fgp->color_trc = AVCOL_TRC_UNSPECIFIED;
        fgp->color_space = AVCOL_SPC_UNSPECIFIED;
        fgp->color_range = AVCOL_RANGE_UNSPECIFIED;

        vsc_flag = get_bits1(gb); // video_signal_characteristics_flag
        if (vsc_flag) {
            int cicp_flag;
            fgp->bit_depth_luma = get_bits(gb, 3) + 8;
            if (!luma_only)
                fgp->bit_depth_chroma = fgp->bit_depth_luma;
            cicp_flag = get_bits1(gb);
            if (cicp_flag) {
                // Reject reserved / out-of-range CICP values outright.
                fgp->color_primaries = get_bits(gb, 8);
                fgp->color_trc = get_bits(gb, 8);
                fgp->color_space = get_bits(gb, 8);
                fgp->color_range = get_bits1(gb) ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
                if (fgp->color_primaries > AVCOL_PRI_NB ||
                    fgp->color_primaries == AVCOL_PRI_RESERVED ||
                    fgp->color_primaries == AVCOL_PRI_RESERVED0 ||
                    fgp->color_trc > AVCOL_TRC_NB ||
                    fgp->color_trc == AVCOL_TRC_RESERVED ||
                    fgp->color_trc == AVCOL_TRC_RESERVED0 ||
                    fgp->color_space > AVCOL_SPC_NB ||
                    fgp->color_space == AVCOL_SPC_RESERVED)
                    goto error;
            }
        }

        predict_scaling = get_bits1(gb);
        if (predict_scaling && (!ref || ref == fgp))
            goto error; // prediction must be from valid, different set

        // Luma scaling function: either predicted from `ref` (scaled, offset
        // and optionally refined by a coded residual) ...
        predict_y_scaling = predict_scaling ? get_bits1(gb) : 0;
        if (predict_y_scaling) {
            int y_scale, y_offset, bits_res;
            y_scale = get_bits(gb, 9) - 256;
            y_offset = get_bits(gb, 9) - 256;
            bits_res = get_bits(gb, 3);
            if (bits_res) {
                int res[14], pred, granularity;
                aom->num_y_points = ref->codec.aom.num_y_points;
                for (i = 0; i < aom->num_y_points; i++)
                    res[i] = get_bits(gb, bits_res);
                granularity = get_bits(gb, 3);
                for (i = 0; i < aom->num_y_points; i++) {
                    pred = ref->codec.aom.y_points[i][1];
                    pred = ((pred * y_scale + 8) >> 4) + y_offset;
                    pred += (res[i] - (1 << (bits_res - 1))) * granularity;
                    aom->y_points[i][0] = ref->codec.aom.y_points[i][0];
                    aom->y_points[i][1] = av_clip_uint8(pred);
                }
            }
        } else {
            // ... or transmitted directly as delta-coded (value, scaling) pairs.
            aom->num_y_points = get_bits(gb, 4);
            if (aom->num_y_points > 14) {
                goto error;
            } else if (aom->num_y_points) {
                int bits_inc, bits_scaling;
                int y_value = 0;
                bits_inc = get_bits(gb, 3) + 1;
                bits_scaling = get_bits(gb, 2) + 5;
                for (i = 0; i < aom->num_y_points; i++) {
                    y_value += get_bits(gb, bits_inc);
                    if (y_value > UINT8_MAX)
                        goto error;
                    aom->y_points[i][0] = y_value;
                    aom->y_points[i][1] = get_bits(gb, bits_scaling);
                }
            }
        }

        // Chroma scaling functions (absent for luma-only grain).
        if (luma_only) {
            aom->chroma_scaling_from_luma = 0;
            aom->num_uv_points[0] = aom->num_uv_points[1] = 0;
        } else {
            aom->chroma_scaling_from_luma = get_bits1(gb);
            if (aom->chroma_scaling_from_luma) {
                aom->num_uv_points[0] = aom->num_uv_points[1] = 0;
            } else {
                for (uv = 0; uv < 2; uv++) {
                    // Per-plane: predicted from `ref` or coded directly,
                    // mirroring the luma path above.
                    predict_uv_scaling[uv] = predict_scaling ? get_bits1(gb) : 0;
                    if (predict_uv_scaling[uv]) {
                        int uv_scale, uv_offset, bits_res;
                        uv_scale = get_bits(gb, 9) - 256;
                        uv_offset = get_bits(gb, 9) - 256;
                        bits_res = get_bits(gb, 3);
                        aom->uv_mult[uv] = ref->codec.aom.uv_mult[uv];
                        aom->uv_mult_luma[uv] = ref->codec.aom.uv_mult_luma[uv];
                        aom->uv_offset[uv] = ref->codec.aom.uv_offset[uv];
                        if (bits_res) {
                            int res[10], pred, granularity;
                            aom->num_uv_points[uv] = ref->codec.aom.num_uv_points[uv];
                            for (i = 0; i < aom->num_uv_points[uv]; i++)
                                res[i] = get_bits(gb, bits_res);
                            granularity = get_bits(gb, 3);
                            for (i = 0; i < aom->num_uv_points[uv]; i++) {
                                pred = ref->codec.aom.uv_points[uv][i][1];
                                pred = ((pred * uv_scale + 8) >> 4) + uv_offset;
                                pred += (res[i] - (1 << (bits_res - 1))) * granularity;
                                aom->uv_points[uv][i][0] = ref->codec.aom.uv_points[uv][i][0];
                                aom->uv_points[uv][i][1] = av_clip_uint8(pred);
                            }
                        }
                    } else {
                        int bits_inc, bits_scaling, uv_offset;
                        int uv_value = 0;
                        aom->num_uv_points[uv] = get_bits(gb, 4);
                        if (aom->num_uv_points[uv] > 10)
                            goto error;
                        bits_inc = get_bits(gb, 3) + 1;
                        bits_scaling = get_bits(gb, 2) + 5;
                        uv_offset = get_bits(gb, 8);
                        for (i = 0; i < aom->num_uv_points[uv]; i++) {
                            uv_value += get_bits(gb, bits_inc);
                            if (uv_value > UINT8_MAX)
                                goto error;
                            aom->uv_points[uv][i][0] = uv_value;
                            aom->uv_points[uv][i][1] = get_bits(gb, bits_scaling) + uv_offset;
                        }
                    }
                }
            }
        }

        // Auto-regressive filter coefficients for the grain templates.
        aom->scaling_shift = get_bits(gb, 2) + 8;
        aom->ar_coeff_lag = get_bits(gb, 2);
        num_y_coeffs = 2 * aom->ar_coeff_lag * (aom->ar_coeff_lag + 1);
        if (aom->num_y_points) {
            int ar_bits = get_bits(gb, 2) + 5;
            for (i = 0; i < num_y_coeffs; i++)
                aom->ar_coeffs_y[i] = get_bits(gb, ar_bits) - (1 << (ar_bits - 1));
        }
        for (uv = 0; uv < 2; uv++) {
            if (aom->chroma_scaling_from_luma || aom->num_uv_points[uv]) {
                int ar_bits = get_bits(gb, 2) + 5;
                // Chroma gets one extra coefficient (for the luma sample)
                // when a luma scaling function exists.
                for (i = 0; i < num_y_coeffs + !!aom->num_y_points; i++)
                    aom->ar_coeffs_uv[uv][i] = get_bits(gb, ar_bits) - (1 << (ar_bits - 1));
            }
        }
        aom->ar_coeff_shift = get_bits(gb, 2) + 6;
        aom->grain_scale_shift = get_bits(gb, 2);
        for (uv = 0; uv < 2; uv++) {
            // NOTE: predict_uv_scaling[uv] is only assigned when chroma
            // scaling was coded per-plane above; in every other path
            // num_uv_points[uv] is 0, so the short-circuit keeps this read
            // well-defined.
            if (aom->num_uv_points[uv] && !predict_uv_scaling[uv]) {
                aom->uv_mult[uv] = get_bits(gb, 8) - 128;
                aom->uv_mult_luma[uv] = get_bits(gb, 8) - 128;
                aom->uv_offset[uv] = get_bits(gb, 9) - 256;
            }
        }
        aom->overlap_flag = get_bits1(gb);
        aom->limit_output_range = get_bits1(gb);

        // use first set as reference only if it was fully transmitted
        if (n == 0)
            ref = fgp;

        // Consume any trailing padding up to the declared payload size.
        payload_bits = get_bits_count(gb) - start_position;
        if (payload_bits > payload_size * 8)
            goto error;
        skip_bits(gb, payload_size * 8 - payload_bits);
    }

    return 0;

error:
    memset(s, 0, sizeof(*s));
    return AVERROR_INVALIDDATA;
}
/**
 * Attach every valid AFGS1 film grain parameter set in `s` to `frame` as
 * film grain side data.
 *
 * Sets whose type is not AV_FILM_GRAIN_PARAMS_AV1 are skipped.
 *
 * @return 0 on success (including when film grain is disabled altogether),
 *         AVERROR(ENOMEM) if side data allocation fails.
 */
int ff_aom_attach_film_grain_sets(const AVFilmGrainAFGS1Params *s, AVFrame *frame)
{
    if (!s->enable)
        return 0;

    for (size_t idx = 0; idx < FF_ARRAY_ELEMS(s->sets); idx++) {
        const AVFilmGrainParams *src = &s->sets[idx];
        AVFilmGrainParams *dst;

        if (src->type != AV_FILM_GRAIN_PARAMS_AV1)
            continue;

        dst = av_film_grain_params_create_side_data(frame);
        if (!dst)
            return AVERROR(ENOMEM);
        memcpy(dst, src, sizeof(*dst));
    }
    return 0;
}
// Taken from the AV1 spec. Range is [-2048, 2047], mean is 0 and stddev is 512
static const int16_t gaussian_sequence[2048] = {
56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
-584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
-792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
-376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
-456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
-164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
-1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
-1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
-88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
-528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
-100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
-332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
-1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
-1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
-28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
-672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
-152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
-524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
-68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
-1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
-528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
-88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
-260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
-28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
-1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
-784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
-1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
-212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
-224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
-280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
-196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
-644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
-116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
-388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
-16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
-196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
-220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
-32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
-816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
-140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
-284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
-604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
-1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
-8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
-436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
-692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
-668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
-832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
-456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
-44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
-48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
-116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
-600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
-316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
-376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
-492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
-1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
-480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
-68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
-184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
-556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
-364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
-816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
-696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
-136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
-684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
-1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
-724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
-228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
-56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
-796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
-624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
-84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
-1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
-120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
-4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
-512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
-212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
-448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
-212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
-820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
-664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
-672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
-496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
-176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
-232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
-412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
-136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
-276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
-300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
-676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
428, -484
};

View File

@ -0,0 +1,51 @@
/*
* AOM film grain synthesis
* Copyright (c) 2021 Niklas Haas <ffmpeg@haasn.xyz>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* AOM film grain synthesis.
* @author Niklas Haas <ffmpeg@haasn.xyz>
*/
#ifndef AVCODEC_AOM_FILM_GRAIN_H
#define AVCODEC_AOM_FILM_GRAIN_H
#include "libavutil/film_grain_params.h"
// Decoded state of all AFGS1 film grain parameter sets carried in a stream.
typedef struct AVFilmGrainAFGS1Params {
    int enable;                // whether film grain synthesis is enabled at all
    AVFilmGrainParams sets[8]; // parameter sets, indexed by the 3-bit AFGS1 set index
} AVFilmGrainAFGS1Params;
// Synthesizes film grain on top of `in` and stores the result to `out`. `out`
// must already have been allocated and set to the same size and format as `in`.
int ff_aom_apply_film_grain(AVFrame *out, const AVFrame *in,
const AVFilmGrainParams *params);
// Parse AFGS1 parameter sets from an ITU-T T.35 payload. Returns 0 on success,
// or a negative error code.
int ff_aom_parse_film_grain_sets(AVFilmGrainAFGS1Params *s,
const uint8_t *payload, int payload_size);
// Attach all valid film grain param sets to `frame`.
int ff_aom_attach_film_grain_sets(const AVFilmGrainAFGS1Params *s, AVFrame *frame);
#endif /* AVCODEC_AOM_FILM_GRAIN_H */

View File

@ -0,0 +1,577 @@
/*
* AOM film grain synthesis
* Copyright (c) 2023 Niklas Haas <ffmpeg@haasn.xyz>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* Copyright © 2018, Niklas Haas
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bit_depth_template.c"
#undef entry
#undef bitdepth
#undef bitdepth_max
#undef HBD_DECL
#undef HBD_CALL
#undef SCALING_SIZE
#if BIT_DEPTH > 8
# define entry int16_t
# define bitdepth_max ((1 << bitdepth) - 1)
# define HBD_DECL , const int bitdepth
# define HBD_CALL , bitdepth
# define SCALING_SIZE 4096
#else
# define entry int8_t
# define bitdepth 8
# define bitdepth_max UINT8_MAX
# define HBD_DECL
# define HBD_CALL
# define SCALING_SIZE 256
#endif
/**
 * Generate the GRAIN_WIDTH x GRAIN_HEIGHT luma grain template.
 *
 * First fills the buffer with white noise drawn from gaussian_sequence via
 * the seeded LFSR (scaled down by `shift`), then runs the auto-regressive
 * filter described by ar_coeffs_y over the causal neighbourhood, clamping
 * each result to the signed grain range for this bit depth.
 */
static void FUNC(generate_grain_y_c)(entry buf[][GRAIN_WIDTH],
                                     const AVFilmGrainParams *const params
                                     HBD_DECL)
{
    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    const int bitdepth_min_8 = bitdepth - 8;
    unsigned seed = params->seed;
    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

    const int ar_pad = 3; // border wide enough for the maximum AR lag
    const int ar_lag = data->ar_coeff_lag;

    for (int y = 0; y < GRAIN_HEIGHT; y++) {
        for (int x = 0; x < GRAIN_WIDTH; x++) {
            const int value = get_random_number(11, &seed);
            buf[y][x] = round2(gaussian_sequence[ value ], shift);
        }
    }

    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
            const int8_t *coeff = data->ar_coeffs_y;
            int sum = 0, grain;
            // Causal neighbourhood: full rows above, pixels to the left on
            // the current row; stops when reaching (x, y) itself.
            for (int dy = -ar_lag; dy <= 0; dy++) {
                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
                    if (!dx && !dy)
                        break;
                    sum += *(coeff++) * buf[y + dy][x + dx];
                }
            }

            grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
            buf[y][x] = av_clip(grain, grain_min, grain_max);
        }
    }
}
/**
 * Generate a chroma grain template, optionally correlated with luma.
 *
 * Same procedure as generate_grain_y_c, but with a per-plane perturbed
 * seed, a template size that shrinks with chroma subsampling, and — when a
 * luma scaling function exists — the current pixel's AR contribution
 * additionally mixes in the averaged co-located luma grain.
 */
static void
FUNC(generate_grain_uv_c)(entry buf[][GRAIN_WIDTH],
                          const entry buf_y[][GRAIN_WIDTH],
                          const AVFilmGrainParams *const params, const intptr_t uv,
                          const int subx, const int suby HBD_DECL)
{
    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    const int bitdepth_min_8 = bitdepth - 8;
    // Distinct fixed perturbation per chroma plane (Cb vs Cr).
    unsigned seed = params->seed ^ (uv ? 0x49d8 : 0xb524);
    const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;

    const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
    const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
    const int ar_pad = 3; // border wide enough for the maximum AR lag
    const int ar_lag = data->ar_coeff_lag;

    for (int y = 0; y < chromaH; y++) {
        for (int x = 0; x < chromaW; x++) {
            const int value = get_random_number(11, &seed);
            buf[y][x] = round2(gaussian_sequence[ value ], shift);
        }
    }

    for (int y = ar_pad; y < chromaH; y++) {
        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
            const int8_t *coeff = data->ar_coeffs_uv[uv];
            int sum = 0, grain;
            for (int dy = -ar_lag; dy <= 0; dy++) {
                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
                    // For the final (current) pixel, we need to add in the
                    // contribution from the luma grain texture
                    if (!dx && !dy) {
                        const int lumaX = ((x - ar_pad) << subx) + ar_pad;
                        const int lumaY = ((y - ar_pad) << suby) + ar_pad;
                        int luma = 0;
                        if (!data->num_y_points)
                            break;
                        // Average the (up to 2x2) co-located luma samples.
                        for (int i = 0; i <= suby; i++) {
                            for (int j = 0; j <= subx; j++) {
                                luma += buf_y[lumaY + i][lumaX + j];
                            }
                        }
                        luma = round2(luma, subx + suby);
                        sum += luma * (*coeff);
                        break;
                    }

                    sum += *(coeff++) * buf[y + dy][x + dx];
                }
            }

            grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
            buf[y][x] = av_clip(grain, grain_min, grain_max);
        }
    }
}
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
static inline entry FUNC(sample_lut)(const entry grain_lut[][GRAIN_WIDTH],
                                     const int offsets[2][2],
                                     const int subx, const int suby,
                                     const int bx, const int by,
                                     const int x, const int y)
{
    // 8-bit random value for this block: high nibble -> x, low nibble -> y.
    const int rnd  = offsets[bx][by];
    const int offx = 3 + (2 >> subx) * (3 + (rnd >> 4));
    const int offy = 3 + (2 >> suby) * (3 + (rnd & 0xF));
    const int row  = offy + y + (FG_BLOCK_SIZE >> suby) * by;
    const int col  = offx + x + (FG_BLOCK_SIZE >> subx) * bx;

    return grain_lut[row][col];
}
/**
 * Apply luma film grain to one row of FG_BLOCK_SIZE-high blocks.
 *
 * For each 32-wide block a pseudo-random offset into the grain template is
 * drawn from the per-row seeds; where overlap is enabled, the first two
 * columns/rows of a block blend grain from the neighbouring block with the
 * 27/17 weight pairs before the grain is scaled by the luma scaling LUT and
 * added to the source pixels (clipped to min_value..max_value).
 */
static void FUNC(fgy_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
                                const ptrdiff_t stride,
                                const AVFilmGrainParams *const params, const size_t pw,
                                const uint8_t scaling[SCALING_SIZE],
                                const entry grain_lut[][GRAIN_WIDTH],
                                const int bh, const int row_num HBD_DECL)
{
    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    const int rows = 1 + (data->overlap_flag && row_num > 0);
    const int bitdepth_min_8 = bitdepth - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    unsigned seed[2];
    int offsets[2 /* col offset */][2 /* row offset */];
    int min_value, max_value;

    // Output is clamped to studio range when limit_output_range is set.
    if (data->limit_output_range) {
        min_value = 16 << bitdepth_min_8;
        max_value = 235 << bitdepth_min_8;
    } else {
        min_value = 0;
        max_value = bitdepth_max;
    }

    // seed[0] contains the current row, seed[1] contains the previous
    for (int i = 0; i < rows; i++) {
        seed[i] = params->seed;
        seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
    }

    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);

    // process this row in FG_BLOCK_SIZE^2 blocks
    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
        const int bw = FFMIN(FG_BLOCK_SIZE, (int) pw - bx);
        const pixel *src;
        pixel *dst;
        int noise;

        // x/y block offsets to compensate for overlapped regions
        const int ystart = data->overlap_flag && row_num ? FFMIN(2, bh) : 0;
        const int xstart = data->overlap_flag && bx ? FFMIN(2, bw) : 0;

        // Overlap blending weight pairs (sum to 44, normalized by >> 5 after
        // multiplication).
        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };

        if (data->overlap_flag && bx) {
            // shift previous offsets left
            for (int i = 0; i < rows; i++)
                offsets[1][i] = offsets[0][i];
        }

        // update current offsets
        for (int i = 0; i < rows; i++)
            offsets[0][i] = get_random_number(8, &seed[i]);

// Look up the source pixel, scale the grain by the scaling LUT and write the
// clipped sum to dst; uses the enclosing scope's src/dst/noise temporaries.
#define add_noise_y(x, y, grain)                                                 \
        src = (const pixel*)((const char*)src_row + (y) * stride) + (x) + bx;    \
        dst = (pixel*)((char*)dst_row + (y) * stride) + (x) + bx;                \
        noise = round2(scaling[ *src ] * (grain), data->scaling_shift);          \
        *dst = av_clip(*src + noise, min_value, max_value);

        for (int y = ystart; y < bh; y++) {
            // Non-overlapped image region (straightforward)
            for (int x = xstart; x < bw; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
                add_noise_y(x, y, grain);
            }

            // Special case for overlapped column
            for (int x = 0; x < xstart; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
                int old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
                grain = round2(old * w[x][0] + grain * w[x][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }
        }

        for (int y = 0; y < ystart; y++) {
            // Special case for overlapped row (sans corner)
            for (int x = xstart; x < bw; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
                int old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
                grain = round2(old * w[y][0] + grain * w[y][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }

            // Special case for doubly-overlapped corner
            for (int x = 0; x < xstart; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 0, x, y);
                int top = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 0, 1, x, y);
                int old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 1, x, y);

                // Blend the top pixel with the top left block
                top = round2(old * w[x][0] + top * w[x][1], 5);
                top = av_clip(top, grain_min, grain_max);

                // Blend the current pixel with the left block
                old = FUNC(sample_lut)(grain_lut, offsets, 0, 0, 1, 0, x, y);
                grain = round2(old * w[x][0] + grain * w[x][1], 5);
                grain = av_clip(grain, grain_min, grain_max);

                // Mix the row rows together and apply grain
                grain = round2(top * w[y][0] + grain * w[y][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }
        }
    }
}
/**
 * Apply AV1 film grain to one row of FG_BLOCK_SIZE-tall blocks of a chroma
 * plane (bit-depth templated: "pixel", "bitdepth", "bitdepth_max" and the
 * HBD_DECL/HBD_CALL plumbing come from the including file).
 *
 * @param dst_row     output chroma row (may alias src_row for in-place use)
 * @param src_row     input chroma row
 * @param stride      chroma stride in bytes
 * @param params      film grain parameters (AOM codec payload is used)
 * @param pw          plane width in pixels (already chroma-subsampled)
 * @param scaling     scaling LUT, indexed by the (possibly luma-blended) value
 * @param grain_lut   pre-generated grain pattern for this plane
 * @param bh          block height for this row (subsampled)
 * @param row_num     index of this block row, drives the per-row grain seed
 * @param luma_row    corresponding luma row, used for chroma-from-luma blending
 * @param luma_stride luma stride in bytes
 * @param uv          chroma plane index (0 = U, 1 = V)
 * @param is_id       nonzero for RGB ("identity") content; raises the clip max
 * @param sx, sy      chroma subsampling shifts (log2) in x and y
 */
static void
FUNC(fguv_32x32xn_c)(pixel *const dst_row, const pixel *const src_row,
                     const ptrdiff_t stride, const AVFilmGrainParams *const params,
                     const size_t pw, const uint8_t scaling[SCALING_SIZE],
                     const entry grain_lut[][GRAIN_WIDTH], const int bh,
                     const int row_num, const pixel *const luma_row,
                     const ptrdiff_t luma_stride, const int uv, const int is_id,
                     const int sx, const int sy HBD_DECL)
{
    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    // two seeds only when blending with the previous block row
    const int rows = 1 + (data->overlap_flag && row_num > 0);
    const int bitdepth_min_8 = bitdepth - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    unsigned seed[2];
    int offsets[2 /* col offset */][2 /* row offset */];

    // Output range: limited (studio) swing when requested, full range otherwise.
    // For limited range the chroma max is 240 except for "identity" (RGB) content.
    int min_value, max_value;
    if (data->limit_output_range) {
        min_value = 16 << bitdepth_min_8;
        max_value = (is_id ? 235 : 240) << bitdepth_min_8;
    } else {
        min_value = 0;
        max_value = bitdepth_max;
    }

    // seed[0] contains the current row, seed[1] contains the previous
    for (int i = 0; i < rows; i++) {
        seed[i] = params->seed;
        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
    }

    av_assert1(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);

    // process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
        const int bw = FFMIN(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
        int val, lx, ly, noise;
        const pixel *src, *luma;
        pixel *dst, avg;

        // x/y block offsets to compensate for overlapped regions
        const int ystart = data->overlap_flag && row_num ? FFMIN(2 >> sy, bh) : 0;
        const int xstart = data->overlap_flag && bx      ? FFMIN(2 >> sx, bw) : 0;

        // overlap blend weights; first index selects subsampled vs. full
        static const int w[2 /* sub */][2 /* off */][2] = {
            { { 27, 17 }, { 17, 27 } },
            { { 23, 22 } },
        };

        if (data->overlap_flag && bx) {
            // shift previous offsets left
            for (int i = 0; i < rows; i++)
                offsets[1][i] = offsets[0][i];
        }

        // update current offsets
        for (int i = 0; i < rows; i++)
            offsets[0][i] = get_random_number(8, &seed[i]);

        // Computes the noisy pixel at (x, y) from src/dst rows and `grain`:
        // averages the co-located luma sample(s), optionally mixes it with the
        // chroma sample via uv_mult/uv_offset, looks the result up in the
        // scaling LUT and adds the scaled grain, clipped to [min, max].
        // NOTE: multi-statement macro relying on the local declarations above.
#define add_noise_uv(x, y, grain)                                            \
            lx = (bx + x) << sx;                                             \
            ly = y << sy;                                                    \
            luma = (const pixel*)((const char*)luma_row + ly * luma_stride) + lx;\
            avg = luma[0];                                                   \
            if (sx)                                                          \
                avg = (avg + luma[1] + 1) >> 1;                              \
            src = (const pixel*)((const char *)src_row + (y) * stride) + bx + (x);\
            dst = (pixel *) ((char *) dst_row + (y) * stride) + bx + (x);    \
            val = avg;                                                       \
            if (!data->chroma_scaling_from_luma) {                           \
                const int combined = avg * data->uv_mult_luma[uv] +          \
                                     *src * data->uv_mult[uv];               \
                val = av_clip( (combined >> 6) +                             \
                               (data->uv_offset[uv] * (1 << bitdepth_min_8)), \
                               0, bitdepth_max );                            \
            }                                                                \
            noise = round2(scaling[ val ] * (grain), data->scaling_shift);   \
            *dst = av_clip(*src + noise, min_value, max_value);

        for (int y = ystart; y < bh; y++) {
            // Non-overlapped image region (straightforward)
            for (int x = xstart; x < bw; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
                add_noise_uv(x, y, grain);
            }

            // Special case for overlapped column
            for (int x = 0; x < xstart; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }
        }

        for (int y = 0; y < ystart; y++) {
            // Special case for overlapped row (sans corner)
            for (int x = xstart; x < bw; x++) {
                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);
                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
                grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }

            // Special case for doubly-overlapped corner
            for (int x = 0; x < xstart; x++) {
                int top   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 1, x, y);
                int old   = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 1, x, y);
                int grain = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 0, 0, x, y);

                // Blend the top pixel with the top left block
                top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
                top = av_clip(top, grain_min, grain_max);

                // Blend the current pixel with the left block
                old = FUNC(sample_lut)(grain_lut, offsets, sx, sy, 1, 0, x, y);
                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
                grain = av_clip(grain, grain_min, grain_max);

                // Mix the two rows together and apply to the image
                grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
                grain = av_clip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }
        }
    }
}
/**
 * Build the piecewise-linear film grain scaling LUT from the signalled
 * control points (bit-depth templated; "bitdepth" comes from the includer).
 *
 * @param points  control points; points[i][0] = x (8-bit domain),
 *                points[i][1] = y (scaling value). x must be strictly
 *                increasing between consecutive points.
 * @param num     number of control points; 0 yields an all-zero LUT
 * @param scaling output LUT with one entry per representable pixel value
 */
static void FUNC(generate_scaling)(const uint8_t points[][2], const int num,
                                   uint8_t scaling[SCALING_SIZE] HBD_DECL)
{
    const int shift_x = bitdepth - 8;
    const int scaling_size = 1 << bitdepth;
    av_assert0(scaling_size <= SCALING_SIZE);

    if (num == 0) {
        memset(scaling, 0, scaling_size);
        return;
    }

    // Evaluated only after the num == 0 check above: for num == 0 this
    // expression would read points[-1], i.e. out of bounds.
    const int max_value = points[num - 1][0] << shift_x;

    // Fill up the preceding entries with the initial value
    memset(scaling, points[0][1], points[0][0] << shift_x);

    // Linearly interpolate the values in the middle
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0];
        const int by = points[i][1];
        const int ex = points[i+1][0];
        const int ey = points[i+1][1];
        const int dx = ex - bx;
        const int dy = ey - by;
        // assert before the division below so a zero dx is caught in debug
        // builds instead of dividing by zero
        av_assert1(dx > 0);
        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
        for (int x = 0, d = 0x8000; x < dx; x++) {
            scaling[(bx + x) << shift_x] = by + (d >> 16);
            d += delta;
        }
    }

    // Fill up the remaining entries with the final value
    memset(&scaling[max_value], points[num - 1][1], scaling_size - max_value);

#if BIT_DEPTH != 8
    // For bit depths above 8, linearly fill the in-between entries created by
    // shifting the 8-bit control-point positions up to the wider range
    for (int i = 0; i < num - 1; i++) {
        const int pad = 1 << shift_x, rnd = pad >> 1;
        const int bx = points[i][0] << shift_x;
        const int ex = points[i+1][0] << shift_x;
        const int dx = ex - bx;
        for (int x = 0; x < dx; x += pad) {
            const int range = scaling[bx + x + pad] - scaling[bx + x];
            for (int n = 1, r = rnd; n < pad; n++) {
                r += range;
                scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
            }
        }
    }
#endif
}
/**
 * Synthesize film grain for one FG_BLOCK_SIZE-tall row of blocks across all
 * planes of the frame, dispatching to the luma and chroma grain kernels.
 *
 * @param out       destination frame (grain applied)
 * @param in        source frame (clean decode)
 * @param ss_x/ss_y chroma subsampling shifts (log2)
 * @param scaling   per-plane scaling LUTs (index 0 = luma)
 * @param grain_lut per-plane pre-generated grain patterns
 * @param params    film grain parameters (AOM codec payload)
 * @param row       block-row index being processed
 */
static av_always_inline void
FUNC(apply_grain_row)(AVFrame *out, const AVFrame *in,
                      const int ss_x, const int ss_y,
                      const uint8_t scaling[3][SCALING_SIZE],
                      const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
                      const AVFilmGrainParams *params,
                      const int row HBD_DECL)
{
    const AVFilmGrainAOMParams *const aom = &params->codec.aom;
    const int is_id    = out->colorspace == AVCOL_SPC_RGB;
    const int luma_h   = FFMIN(out->height - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
    const int chroma_w = (out->width + ss_x) >> ss_x;
    const int chroma_h = (luma_h + ss_y) >> ss_y;
    const ptrdiff_t chroma_off = row * FG_BLOCK_SIZE * out->linesize[1] >> ss_y;
    // luma source for this block row; also consumed by the chroma kernel for
    // chroma-from-luma scaling
    pixel *const luma_src = (pixel *)
        ((char *) in->data[0] + row * FG_BLOCK_SIZE * in->linesize[0]);

    // Luma plane
    if (aom->num_y_points) {
        const ptrdiff_t luma_off = row * FG_BLOCK_SIZE * out->linesize[0];
        FUNC(fgy_32x32xn_c)((pixel *) ((char *) out->data[0] + luma_off), luma_src,
                            out->linesize[0], params, out->width, scaling[0],
                            grain_lut[0], luma_h, row HBD_CALL);
    }

    // Done unless at least one chroma plane needs grain
    if (!(aom->num_uv_points[0] || aom->num_uv_points[1] ||
          aom->chroma_scaling_from_luma))
        return;

    // Replicate the last luma column for odd widths so the horizontal luma
    // averaging in the chroma kernel never reads past the row end
    if (out->width & ss_x) {
        pixel *line = luma_src;
        for (int y = 0; y < chroma_h; y++) {
            line[out->width] = line[out->width - 1];
            line = (pixel *) ((char *) line + (in->linesize[0] << ss_y));
        }
    }

    // Chroma planes: with chroma-from-luma all planes use the luma scaling
    // LUT; otherwise each plane uses its own LUT and is skipped when it has
    // no scaling points.
    for (int pl = 0; pl < 2; pl++) {
        const uint8_t *plane_scaling;
        if (aom->chroma_scaling_from_luma)
            plane_scaling = scaling[0];
        else if (aom->num_uv_points[pl])
            plane_scaling = scaling[1 + pl];
        else
            continue;
        FUNC(fguv_32x32xn_c)((pixel *) ((char *) out->data[1 + pl] + chroma_off),
                             (const pixel *) ((const char *) in->data[1 + pl] + chroma_off),
                             in->linesize[1], params, chroma_w, plane_scaling,
                             grain_lut[1 + pl], chroma_h, row, luma_src,
                             in->linesize[0], pl, is_id, ss_x, ss_y HBD_CALL);
    }
}
/**
 * Top-level entry point: generate the grain and scaling LUTs required by the
 * signalled parameters, then apply grain to the frame row by row.
 *
 * @param out_frame destination frame; must match in_frame's geometry/format
 * @param in_frame  clean source frame
 * @param params    film grain parameters (AOM codec payload)
 * @return 0 on success, a negative AVERROR code on failure
 */
static int FUNC(apply_film_grain)(AVFrame *out_frame, const AVFrame *in_frame,
                                  const AVFilmGrainParams *params HBD_DECL)
{
    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
    uint8_t scaling[3][SCALING_SIZE];

    const AVFilmGrainAOMParams *const data = &params->codec.aom;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out_frame->format);
    const int rows = AV_CEIL_RSHIFT(out_frame->height, 5); /* log2(FG_BLOCK_SIZE) */

    // Guard against an unrecognized pixel format instead of dereferencing
    // a NULL descriptor below
    if (!desc)
        return AVERROR(EINVAL);

    const int subx = desc->log2_chroma_w, suby = desc->log2_chroma_h;

    // Generate grain LUTs as needed; chroma grain is derived from the luma LUT
    FUNC(generate_grain_y_c)(grain_lut[0], params HBD_CALL);
    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
        FUNC(generate_grain_uv_c)(grain_lut[1], grain_lut[0], params, 0, subx, suby HBD_CALL);
    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
        FUNC(generate_grain_uv_c)(grain_lut[2], grain_lut[0], params, 1, subx, suby HBD_CALL);

    // Generate scaling LUTs as needed (luma LUT also serves chroma when
    // chroma_scaling_from_luma is set)
    if (data->num_y_points || data->chroma_scaling_from_luma)
        FUNC(generate_scaling)(data->y_points, data->num_y_points, scaling[0] HBD_CALL);
    if (data->num_uv_points[0])
        FUNC(generate_scaling)(data->uv_points[0], data->num_uv_points[0], scaling[1] HBD_CALL);
    if (data->num_uv_points[1])
        FUNC(generate_scaling)(data->uv_points[1], data->num_uv_points[1], scaling[2] HBD_CALL);

    for (int row = 0; row < rows; row++) {
        FUNC(apply_grain_row)(out_frame, in_frame, subx, suby, scaling, grain_lut,
                              params, row HBD_CALL);
    }

    return 0;
}

View File

@ -34,6 +34,7 @@
#include "decode.h"
#include "hwaccel_internal.h"
#include "internal.h"
#include "itut35.h"
#include "hwconfig.h"
#include "profiles.h"
#include "refstruct.h"
@ -951,7 +952,7 @@ static int export_itut_t35(AVCodecContext *avctx, AVFrame *frame,
provider_code = bytestream2_get_be16(&gb);
switch (provider_code) {
case 0x31: { // atsc_provider_code
case ITU_T_T35_PROVIDER_CODE_ATSC: {
uint32_t user_identifier = bytestream2_get_be32(&gb);
switch (user_identifier) {
case MKBETAG('G', 'A', '9', '4'): { // closed captions
@ -975,12 +976,12 @@ static int export_itut_t35(AVCodecContext *avctx, AVFrame *frame,
}
break;
}
case 0x3C: { // smpte_provider_code
case ITU_T_T35_PROVIDER_CODE_SMTPE: {
AVDynamicHDRPlus *hdrplus;
int provider_oriented_code = bytestream2_get_be16(&gb);
int application_identifier = bytestream2_get_byte(&gb);
if (itut_t35->itu_t_t35_country_code != 0xB5 ||
if (itut_t35->itu_t_t35_country_code != ITU_T_T35_COUNTRY_CODE_US ||
provider_oriented_code != 1 || application_identifier != 4)
break;
@ -994,9 +995,10 @@ static int export_itut_t35(AVCodecContext *avctx, AVFrame *frame,
return ret;
break;
}
case 0x3B: { // dolby_provider_code
case ITU_T_T35_PROVIDER_CODE_DOLBY: {
int provider_oriented_code = bytestream2_get_be32(&gb);
if (itut_t35->itu_t_t35_country_code != 0xB5 || provider_oriented_code != 0x800)
if (itut_t35->itu_t_t35_country_code != ITU_T_T35_COUNTRY_CODE_US ||
provider_oriented_code != 0x800)
break;
ret = ff_dovi_rpu_parse(&s->dovi, gb.buffer, gb.buffer_end - gb.buffer);
@ -1072,9 +1074,11 @@ static int export_film_grain(AVCodecContext *avctx, AVFrame *frame)
{
AV1DecContext *s = avctx->priv_data;
const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain;
const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(frame->format);
AVFilmGrainParams *fgp;
AVFilmGrainAOMParams *aom;
av_assert0(pixdesc);
if (!film_grain->apply_grain)
return 0;
@ -1084,6 +1088,14 @@ static int export_film_grain(AVCodecContext *avctx, AVFrame *frame)
fgp->type = AV_FILM_GRAIN_PARAMS_AV1;
fgp->seed = film_grain->grain_seed;
fgp->width = frame->width;
fgp->height = frame->height;
fgp->color_range = frame->color_range;
fgp->color_primaries = frame->color_primaries;
fgp->color_trc = frame->color_trc;
fgp->color_space = frame->colorspace;
fgp->subsampling_x = pixdesc->log2_chroma_w;
fgp->subsampling_y = pixdesc->log2_chroma_h;
aom = &fgp->codec.aom;
aom->chroma_scaling_from_luma = film_grain->chroma_scaling_from_luma;
@ -1138,7 +1150,7 @@ static int set_output_frame(AVCodecContext *avctx, AVFrame *frame)
// TODO: all layers
if (s->operating_point_idc &&
av_log2(s->operating_point_idc >> 8) > s->cur_frame.spatial_id)
return 0;
return AVERROR(EAGAIN);
ret = av_frame_ref(frame, srcframe);
if (ret < 0)
@ -1333,7 +1345,7 @@ static int av1_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
if (s->cur_frame.f->buf[0]) {
ret = set_output_frame(avctx, frame);
if (ret < 0)
if (ret < 0 && ret != AVERROR(EAGAIN))
av_log(avctx, AV_LOG_ERROR, "Set output frame error.\n");
}
@ -1445,11 +1457,13 @@ static int av1_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
if (s->raw_frame_header->show_frame && s->cur_frame.f->buf[0]) {
ret = set_output_frame(avctx, frame);
if (ret < 0) {
if (ret < 0 && ret != AVERROR(EAGAIN)) {
av_log(avctx, AV_LOG_ERROR, "Set output frame error\n");
goto end;
}
}
} else if (show_frame)
ret = AVERROR_INVALIDDATA;
raw_tile_group = NULL;
s->raw_frame_header = NULL;
if (show_frame) {

View File

@ -247,8 +247,10 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
&& !(codec->capabilities & AV_CODEC_CAP_CHANNEL_CONF)) {
av_log(avctx, AV_LOG_ERROR, "%s requires channel layout to be set\n",
av_codec_is_decoder(codec) ? "Decoder" : "Encoder");
ret = AVERROR(EINVAL);
goto free_and_end;
if (!av_codec_is_decoder(codec)) {
ret = AVERROR(EINVAL);
goto free_and_end;
}
}
if (avctx->ch_layout.nb_channels && !av_channel_layout_check(&avctx->ch_layout)) {
av_log(avctx, AV_LOG_ERROR, "Invalid channel layout\n");

View File

@ -1665,6 +1665,8 @@ typedef struct AVCodecContext {
#define FF_PROFILE_DTS_ES 30
#define FF_PROFILE_DTS_96_24 40
#define FF_PROFILE_DTS_HD_HRA 50
#define FF_PROFILE_DTS_HD_HRA_X 51
#define FF_PROFILE_DTS_HD_HRA_X_IMAX 52
#define FF_PROFILE_DTS_HD_MA 60
#define FF_PROFILE_DTS_EXPRESS 70
#define FF_PROFILE_DTS_HD_MA_X 61
@ -1696,11 +1698,13 @@ typedef struct AVCodecContext {
#define FF_PROFILE_H264_HIGH_422 122
#define FF_PROFILE_H264_HIGH_422_INTRA (122|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_STEREO_HIGH 128
#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
#define FF_PROFILE_H264_HIGH_444 144
#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
#define FF_PROFILE_H264_CAVLC_444 44
#define FF_PROFILE_VC1_SIMPLE 0
#define FF_PROFILE_VC1_MAIN 1
#define FF_PROFILE_VC1_COMPLEX 2
@ -2029,6 +2033,13 @@ typedef struct AVCodecContext {
*/
int64_t frame_num;
/**
* Is the stream completely progressive?
* - decoding: set by avcodec
* - encoding: unused
*/
int progressive_sequence;
/**
* Decoding only. May be set by the caller before avcodec_open2() to an
* av_malloc()'ed array (or via AVOptions). Owned and freed by the decoder
@ -2062,6 +2073,19 @@ typedef struct AVCodecContext {
* Number of entries in side_data_prefer_packet.
*/
unsigned nb_side_data_prefer_packet;
/**
* Array containing static side data, such as HDR10 CLL / MDCV structures.
* Side data entries should be allocated by usage of helpers defined in
* libavutil/frame.h.
*
* - encoding: may be set by user before calling avcodec_open2() for
* encoder configuration. Afterwards owned and freed by the
* encoder.
* - decoding: unused
*/
AVFrameSideData **decoded_side_data;
int nb_decoded_side_data;
} AVCodecContext;
/**
@ -2211,6 +2235,12 @@ typedef struct AVSubtitleRect {
char *ass;
} AVSubtitleRect;
typedef struct AVSubtitleDVDPalette {
uint32_t start_display_time;
uint8_t colormap[4];
uint8_t alpha[4];
} AVSubtitleDVDPalette;
typedef struct AVSubtitle {
uint16_t format; /* 0 = graphics */
uint32_t start_display_time; /* relative to packet pts, in ms */
@ -2218,6 +2248,9 @@ typedef struct AVSubtitle {
unsigned num_rects;
AVSubtitleRect **rects;
int64_t pts; ///< Same as packet pts, in AV_TIME_BASE
unsigned num_dvd_palette;
AVSubtitleDVDPalette **dvd_palette;
} AVSubtitle;
/**
@ -2730,6 +2763,7 @@ typedef struct AVCodecParserContext {
/// Set if the parser has a valid file offset
#define PARSER_FLAG_FETCHED_OFFSET 0x0004
#define PARSER_FLAG_USE_CODEC_TS 0x1000
#define PARSER_FLAG_NO_TIMESTAMP_MANGLING 0x2000
int64_t offset; ///< byte offset from starting packet start
int64_t cur_frame_end[AV_PARSER_PTS_NB];

View File

@ -166,10 +166,10 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
VVC_VPS_NUT, VVC_SPS_NUT, VVC_PPS_NUT,
};
static const int extradata_nal_types_hevc[] = {
HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS, HEVC_NAL_SEI_PREFIX, HEVC_NAL_SEI_SUFFIX,
};
static const int extradata_nal_types_h264[] = {
H264_NAL_SPS, H264_NAL_PPS,
H264_NAL_SPS, H264_NAL_SUB_SPS, H264_NAL_PPS, H264_NAL_SEI,
};
ExtractExtradataContext *s = ctx->priv_data;
@ -206,7 +206,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
if (nal->type == HEVC_NAL_SPS) has_sps = 1;
if (nal->type == HEVC_NAL_VPS) has_vps = 1;
} else {
if (nal->type == H264_NAL_SPS) has_sps = 1;
if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SUB_SPS) has_sps = 1;
}
} else if (s->remove) {
filtered_size += nal->raw_size + 3;
@ -216,7 +216,8 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
if (extradata_size &&
((ctx->par_in->codec_id == AV_CODEC_ID_VVC && has_sps) ||
(ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
(ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
(ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps) ||
(ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC && has_sps))) {
AVBufferRef *filtered_buf = NULL;
PutByteContext pb_filtered_data, pb_extradata;
uint8_t *extradata;
@ -368,6 +369,7 @@ static const struct {
{ AV_CODEC_ID_AVS3, extract_extradata_mpeg4 },
{ AV_CODEC_ID_CAVS, extract_extradata_mpeg4 },
{ AV_CODEC_ID_H264, extract_extradata_h2645 },
{ AV_CODEC_ID_H264_MVC, extract_extradata_h2645 },
{ AV_CODEC_ID_HEVC, extract_extradata_h2645 },
{ AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 },
{ AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 },
@ -438,6 +440,7 @@ static const enum AVCodecID codec_ids[] = {
AV_CODEC_ID_AVS3,
AV_CODEC_ID_CAVS,
AV_CODEC_ID_H264,
AV_CODEC_ID_H264_MVC,
AV_CODEC_ID_HEVC,
AV_CODEC_ID_MPEG1VIDEO,
AV_CODEC_ID_MPEG2VIDEO,

View File

@ -76,7 +76,7 @@ static int h264_split(const uint8_t *buf, int buf_size)
if ((state & 0xFFFFFF00) != 0x100)
break;
nalu_type = state & 0x1F;
if (nalu_type == H264_NAL_SPS) {
if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SUB_SPS) {
has_sps = 1;
} else if (nalu_type == H264_NAL_PPS)
has_pps = 1;
@ -204,6 +204,7 @@ static int remove_extradata(AVBSFContext *ctx, AVPacket *pkt)
i = mpeg4video_split(pkt->data, pkt->size);
break;
case AV_CODEC_ID_H264:
case AV_CODEC_ID_H264_MVC:
i = h264_split(pkt->data, pkt->size);
break;
case AV_CODEC_ID_HEVC:

View File

@ -2072,6 +2072,8 @@ static int FUNC(pps) (CodedBitstreamContext *ctx, RWContext *rw,
tile_x = tile_idx % current->num_tile_columns;
tile_y = tile_idx / current->num_tile_columns;
if (tile_y >= current->num_tile_rows)
return AVERROR_INVALIDDATA;
ctu_x = 0, ctu_y = 0;
for (j = 0; j < tile_x; j++) {

View File

@ -1959,6 +1959,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
.id = AV_CODEC_ID_H264_MVC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "h264_mvc",
.long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
.props = AV_CODEC_PROP_LOSSY,
},
/* various PCM "codecs" */
{
.id = AV_CODEC_ID_PCM_S16LE,

View File

@ -323,6 +323,8 @@ enum AVCodecID {
AV_CODEC_ID_VMIX,
AV_CODEC_ID_LEAD,
AV_CODEC_ID_H264_MVC,
/* various PCM "codecs" */
AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
AV_CODEC_ID_PCM_S16LE = 0x10000,

View File

@ -2369,9 +2369,14 @@ int ff_dca_core_filter_frame(DCACoreDecoder *s, AVFrame *frame)
return ret;
// Set profile, bit rate, etc
if (s->ext_audio_mask & DCA_EXSS_MASK)
avctx->profile = AV_PROFILE_DTS_HD_HRA;
else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
if (s->ext_audio_mask & DCA_EXSS_MASK) {
if (dca->exss.x_syncword_present)
avctx->profile = FF_PROFILE_DTS_HD_HRA_X;
else if (dca->exss.x_imax_syncword_present)
avctx->profile = FF_PROFILE_DTS_HD_HRA_X_IMAX;
else
avctx->profile = AV_PROFILE_DTS_HD_HRA;
} else if (s->ext_audio_mask & (DCA_CSS_XXCH | DCA_CSS_XCH))
avctx->profile = AV_PROFILE_DTS_ES;
else if (s->ext_audio_mask & DCA_CSS_X96)
avctx->profile = AV_PROFILE_DTS_96_24;

View File

@ -19,6 +19,7 @@
*/
#include "dcadec.h"
#include "dca_syncwords.h"
static void parse_xll_parameters(DCAExssParser *s, DCAExssAsset *asset)
{
@ -510,5 +511,17 @@ int ff_dca_exss_parse(DCAExssParser *s, const uint8_t *data, int size)
return AVERROR_INVALIDDATA;
}
// Check for extradata extensions
if ((s->exss_size - offset) > 10) {
if (AV_RB32(data + offset) == 0x3a429b0a) {
unsigned int extradata_syncword = AV_RB32(data + offset + 6);
if (extradata_syncword == DCA_SYNCWORD_XLL_X) {
s->x_syncword_present = 1;
} else if ((extradata_syncword >> 1) == (DCA_SYNCWORD_XLL_X_IMAX >> 1)) {
s->x_imax_syncword_present = 1;
}
}
}
return 0;
}

View File

@ -84,6 +84,9 @@ typedef struct DCAExssParser {
int nmixoutconfigs; ///< Number of mixing configurations
int nmixoutchs[4]; ///< Speaker layout mask for mixer output channels
int x_syncword_present; ///< DTS:X extension syncword detected
int x_imax_syncword_present;///< DTS:X IMAX extension syncword detected
DCAExssAsset assets[1]; ///< Audio asset descriptors
} DCAExssParser;

View File

@ -1326,8 +1326,8 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
goto try_again;
}
if (hw_config->hwaccel) {
av_log(avctx, AV_LOG_DEBUG, "Format %s requires hwaccel "
"initialisation.\n", desc->name);
av_log(avctx, AV_LOG_DEBUG, "Format %s requires hwaccel %s "
"initialisation.\n", desc->name, hw_config->hwaccel->p.name);
err = hwaccel_init(avctx, hw_config->hwaccel);
if (err < 0)
goto try_again;

View File

@ -87,6 +87,8 @@
#define AV_PROFILE_DTS_ES 30
#define AV_PROFILE_DTS_96_24 40
#define AV_PROFILE_DTS_HD_HRA 50
#define AV_PROFILE_DTS_HD_HRA_X 51
#define AV_PROFILE_DTS_HD_HRA_X_IMAX 52
#define AV_PROFILE_DTS_HD_MA 60
#define AV_PROFILE_DTS_EXPRESS 70
#define AV_PROFILE_DTS_HD_MA_X 61

View File

@ -68,7 +68,7 @@ void ff_dovi_ctx_replace(DOVIContext *s, const DOVIContext *s0)
s->mapping = s0->mapping;
s->color = s0->color;
s->dv_profile = s0->dv_profile;
for (int i = 0; i < DOVI_MAX_DM_ID; i++)
for (int i = 0; i <= DOVI_MAX_DM_ID; i++)
ff_refstruct_replace(&s->vdr[i], s0->vdr[i]);
}
@ -145,7 +145,7 @@ static inline uint64_t get_ue_coef(GetBitContext *gb, const AVDOVIRpuDataHeader
case RPU_COEFF_FIXED:
ipart = get_ue_golomb_long(gb);
fpart.u32 = get_bits_long(gb, hdr->coef_log2_denom);
return (ipart << hdr->coef_log2_denom) + fpart.u32;
return (ipart << hdr->coef_log2_denom) | fpart.u32;
case RPU_COEFF_FLOAT:
fpart.u32 = get_bits_long(gb, 32);
@ -164,7 +164,7 @@ static inline int64_t get_se_coef(GetBitContext *gb, const AVDOVIRpuDataHeader *
case RPU_COEFF_FIXED:
ipart = get_se_golomb_long(gb);
fpart.u32 = get_bits_long(gb, hdr->coef_log2_denom);
return ipart * (1LL << hdr->coef_log2_denom) + fpart.u32;
return ipart * (1LL << hdr->coef_log2_denom) | fpart.u32;
case RPU_COEFF_FLOAT:
fpart.u32 = get_bits_long(gb, 32);

View File

@ -146,16 +146,12 @@ static void guess_palette(DVDSubContext* ctx,
uint32_t *rgba_palette,
uint32_t subtitle_color)
{
static const uint8_t level_map[4][4] = {
static const uint8_t level_map[4] = {
// this configuration (full range, lowest to highest) in tests
// seemed most common, so assume this
{0xff},
{0x00, 0xff},
{0x00, 0x80, 0xff},
{0x00, 0x55, 0xaa, 0xff},
0x00, 0xe0, 0x80, 0x20
};
uint8_t color_used[16] = { 0 };
int nb_opaque_colors, i, level, j, r, g, b;
int i, level, r, g, b;
uint8_t *colormap = ctx->colormap, *alpha = ctx->alpha;
if(ctx->has_palette) {
@ -168,33 +164,13 @@ static void guess_palette(DVDSubContext* ctx,
for(i = 0; i < 4; i++)
rgba_palette[i] = 0;
nb_opaque_colors = 0;
for(i = 0; i < 4; i++) {
if (alpha[i] != 0 && !color_used[colormap[i]]) {
color_used[colormap[i]] = 1;
nb_opaque_colors++;
}
}
if (nb_opaque_colors == 0)
return;
j = 0;
memset(color_used, 0, 16);
for(i = 0; i < 4; i++) {
if (alpha[i] != 0) {
if (!color_used[colormap[i]]) {
level = level_map[nb_opaque_colors - 1][j];
r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
color_used[colormap[i]] = (i + 1);
j++;
} else {
rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
((alpha[i] * 17U) << 24);
}
level = level_map[i];
r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
}
}
}
@ -348,7 +324,7 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
case 0xff:
goto the_end;
default:
ff_dlog(NULL, "unrecognised subpicture command 0x%x\n", cmd);
av_log(ctx, AV_LOG_WARNING, "unrecognised subpicture command 0x%x\n", cmd);
goto the_end;
}
}
@ -356,6 +332,14 @@ static int decode_dvd_subtitles(DVDSubContext *ctx, AVSubtitle *sub_header,
if (offset1 >= buf_size || offset2 >= buf_size)
goto fail;
/* store dvd palette info in subtitle struct for use by caller */
i = sub_header->num_dvd_palette++;
sub_header->dvd_palette = av_realloc(sub_header->dvd_palette, sizeof(AVSubtitleDVDPalette *) * (i+1));
sub_header->dvd_palette[i] = av_mallocz(sizeof(AVSubtitleDVDPalette));
sub_header->dvd_palette[i]->start_display_time = (date << 10) / 90;
memcpy(sub_header->dvd_palette[i]->colormap, colormap, 4);
memcpy(sub_header->dvd_palette[i]->alpha, alpha, 4);
/* parse rle subtitles */
if (offset1 >= 0 && offset2 >= 0) {
int w, h;
uint8_t *bitmap;

View File

@ -754,14 +754,13 @@ static void *get_surface(const AVCodecContext *avctx, const AVFrame *frame)
{
#if CONFIG_D3D11VA
if (frame->format == AV_PIX_FMT_D3D11) {
FFDXVASharedContext *sctx = DXVA_SHARED_CONTEXT(avctx);
AVDXVAContext *ctx = DXVA_CONTEXT(avctx);
intptr_t index = (intptr_t)frame->data[1];
if (index < 0 || index >= sctx->nb_d3d11_views ||
sctx->d3d11_texture != (ID3D11Texture2D *)frame->data[0]) {
if (index < 0 || index >= D3D11VA_CONTEXT(ctx)->surface_count) {
av_log((void *)avctx, AV_LOG_ERROR, "get_buffer frame is invalid!\n");
return NULL;
}
return sctx->d3d11_views[index];
return D3D11VA_CONTEXT(ctx)->surface[index];
}
#endif
return frame->data[3];

View File

@ -29,6 +29,10 @@
#include "av1dec.h"
#include "hwaccel_internal.h"
#if !HAVE_DXVA_PICPARAMS_AV1
#include "compat/windows/dxva_av1.h"
#endif
#define MAX_TILES 256
struct AV1DXVAContext {

View File

@ -29,6 +29,10 @@
#include "hevcdec.h"
#include "hwaccel_internal.h"
#if !HAVE_DXVA_PICPARAMS_HEVC
#include "compat/windows/dxva_hevc.h"
#endif
#define MAX_SLICES 256
struct hevc_dxva2_picture_context {
@ -164,7 +168,7 @@ void ff_dxva2_hevc_fill_picture_parameters(const AVCodecContext *avctx, AVDXVACo
for (i = 0, j = 0; i < FF_ARRAY_ELEMS(pp->RefPicList); i++) {
const HEVCFrame *frame = NULL;
while (!frame && j < FF_ARRAY_ELEMS(h->DPB)) {
if (&h->DPB[j] != current_picture && (h->DPB[j].flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF)))
if (&h->DPB[j] != current_picture && (h->DPB[j].flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF)) && !h->DPB[j].missing)
frame = &h->DPB[j];
j++;
}

View File

@ -38,6 +38,7 @@ struct dxva2_picture_context {
const uint8_t *bitstream;
unsigned bitstream_size;
int frame_start;
};
void ff_dxva2_mpeg2_fill_picture_parameters(AVCodecContext *avctx,
@ -272,6 +273,7 @@ static int dxva2_mpeg2_start_frame(AVCodecContext *avctx,
ctx_pic->slice_count = 0;
ctx_pic->bitstream_size = 0;
ctx_pic->bitstream = NULL;
ctx_pic->frame_start = 1;
return 0;
}
@ -305,8 +307,9 @@ static int dxva2_mpeg2_end_frame(AVCodecContext *avctx)
s->current_picture_ptr->hwaccel_picture_private;
int ret;
if (ctx_pic->slice_count <= 0 || ctx_pic->bitstream_size <= 0)
if (ctx_pic->slice_count <= 0 || ctx_pic->bitstream_size <= 0 || !ctx_pic->frame_start)
return -1;
ctx_pic->frame_start = 0;
ret = ff_dxva2_common_end_frame(avctx, s->current_picture_ptr->f,
&ctx_pic->pp, sizeof(ctx_pic->pp),
&ctx_pic->qm, sizeof(ctx_pic->qm),

View File

@ -29,6 +29,10 @@
#include "hwaccel_internal.h"
#include "vp9shared.h"
#if !HAVE_DXVA_PICPARAMS_VP9
#include "compat/windows/dxva_vpx.h"
#endif
struct vp9_dxva2_picture_context {
DXVA_PicParams_VP9 pp;
DXVA_Slice_VPx_Short slice;

View File

@ -236,17 +236,9 @@ done:
av_free(name);
av_free(message);
if (class_class) {
(*env)->DeleteLocalRef(env, class_class);
}
if (exception_class) {
(*env)->DeleteLocalRef(env, exception_class);
}
if (string) {
(*env)->DeleteLocalRef(env, string);
}
(*env)->DeleteLocalRef(env, class_class);
(*env)->DeleteLocalRef(env, exception_class);
(*env)->DeleteLocalRef(env, string);
return ret;
}

View File

@ -24,6 +24,7 @@
#define AVCODEC_FFJNI_H
#include <jni.h>
#include <stddef.h>
/*
* Attach permanently a JNI environment to the current thread and retrieve it.
@ -105,7 +106,7 @@ struct FFJniField {
const char *method;
const char *signature;
enum FFJniFieldType type;
int offset;
size_t offset;
int mandatory;
};

View File

@ -42,6 +42,7 @@
#include "bswapdsp.h"
#include "codec_internal.h"
#include "thread.h"
#include "threadframe.h"
#define FPS_TAG MKTAG('F', 'P', 'S', 'x')
#define VLC_BITS 11
@ -52,10 +53,15 @@
typedef struct FrapsContext {
AVCodecContext *avctx;
BswapDSPContext bdsp;
int cur_index, prev_index;
int next_cur_index, next_prev_index;
ThreadFrame frames[2];
uint8_t *tmpbuf;
int tmpbuf_size;
} FrapsContext;
static av_cold int decode_end(AVCodecContext *avctx);
/**
* initializes decoder
* @param avctx codec context
@ -64,12 +70,46 @@ typedef struct FrapsContext {
static av_cold int decode_init(AVCodecContext *avctx)
{
FrapsContext * const s = avctx->priv_data;
int i;
s->prev_index = 0;
s->cur_index = 1;
s->avctx = avctx;
s->tmpbuf = NULL;
ff_bswapdsp_init(&s->bdsp);
for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
s->frames[i].f = av_frame_alloc();
if (!s->frames[i].f) {
decode_end(avctx);
return AVERROR(ENOMEM);
}
}
return 0;
}
static int update_thread_context(AVCodecContext *avctx, const AVCodecContext *avctx_from)
{
FrapsContext *dst = avctx->priv_data, *src = avctx_from->priv_data;
int i, ret;
if (avctx == avctx_from) return 0;
dst->cur_index = src->next_cur_index;
dst->prev_index = src->next_prev_index;
for (i = 0; i < FF_ARRAY_ELEMS(dst->frames); i++) {
ff_thread_release_ext_buffer(&dst->frames[i]);
if (src->frames[i].f->data[0]) {
ret = ff_thread_ref_frame(&dst->frames[i], &src->frames[i]);
if (ret < 0)
return ret;
}
}
return 0;
}
@ -132,18 +172,52 @@ static int fraps2_decode_plane(FrapsContext *s, uint8_t *dst, int stride, int w,
return 0;
}
static int decode_frame(AVCodecContext *avctx, AVFrame *f,
static void frame_copy(FrapsContext *s,
uint8_t *dst_data[3], const int dst_linesizes[3],
uint8_t *src_data[3], const int src_linesizes[3],
unsigned int version, int width, int height)
{
int i, k, h, bwidth;
uint8_t *src, *dst;
int planes = (version & 1) ? 1 : 3;
for (i = 0; i < planes; i++) {
dst = dst_data[i];
src = src_data[i];
if (version & 1) {
/* RGB data */
h = height;
bwidth = width * 3;
} else {
/* YUV 4:2:0 data */
h = i ? height >> 1 : height;
bwidth = i ? width >> 1 : width;
}
ff_thread_await_progress(&s->frames[s->prev_index], i, 0);
for (k = 0; k < h; k++) {
memcpy(dst, src, bwidth);
dst += dst_linesizes[i];
src += src_linesizes[i];
}
ff_thread_report_progress(&s->frames[s->cur_index], i, 0);
}
}
static int decode_frame(AVCodecContext *avctx, AVFrame *fout,
int *got_frame, AVPacket *avpkt)
{
FrapsContext * const s = avctx->priv_data;
const uint8_t *buf = avpkt->data;
int buf_size = avpkt->size;
ThreadFrame *frame, *prev_frame;
AVFrame *f;
uint32_t header;
unsigned int version,header_size;
const uint32_t *buf32;
uint32_t *luma1,*luma2,*cb,*cr;
uint32_t offs[4];
int i, j, ret, is_chroma;
int i, j, ret, is_chroma, is_Pframe;
const int planes = 3;
int is_pal;
uint8_t *out;
@ -153,6 +227,10 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
return AVERROR_INVALIDDATA;
}
frame = &s->frames[s->cur_index];
prev_frame = &s->frames[s->prev_index];
f = frame->f;
header = AV_RL32(buf);
version = header & 0xff;
is_pal = buf[1] == 2 && version == 1;
@ -179,22 +257,16 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
if (version == 0) needed_size /= 2;
needed_size += header_size;
/* bit 31 means same as previous pic */
if (header & (1U<<31)) {
*got_frame = 0;
return buf_size;
}
if (buf_size != needed_size) {
is_Pframe = (header & (1U<<31)) ? 1 : 0;
if (!is_Pframe && buf_size != needed_size) {
av_log(avctx, AV_LOG_ERROR,
"Invalid frame length %d (should be %d)\n",
buf_size, needed_size);
return AVERROR_INVALIDDATA;
}
} else {
/* skip frame */
if (buf_size == 8) {
*got_frame = 0;
return buf_size;
}
is_Pframe = buf_size == 8 ? 1 : 0;
if (!is_Pframe) {
if (AV_RL32(buf) != FPS_TAG || buf_size < planes*1024 + 24) {
av_log(avctx, AV_LOG_ERROR, "error in data stream\n");
return AVERROR_INVALIDDATA;
@ -212,19 +284,43 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
if (!s->tmpbuf)
return AVERROR(ENOMEM);
}
}
}
f->pict_type = AV_PICTURE_TYPE_I;
f->flags |= AV_FRAME_FLAG_KEY;
if (is_Pframe && !prev_frame->f->data[0]) {
av_log(avctx, AV_LOG_ERROR, "decoding must start with keyframe\n");
return AVERROR_INVALIDDATA;
}
ff_thread_release_ext_buffer(frame);
f->pict_type = is_Pframe ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I;
f->flags |= is_Pframe ? 0 : AV_FRAME_FLAG_KEY;;
avctx->pix_fmt = version & 1 ? is_pal ? AV_PIX_FMT_PAL8 : AV_PIX_FMT_BGR24 : AV_PIX_FMT_YUVJ420P;
avctx->color_range = version & 1 ? AVCOL_RANGE_UNSPECIFIED
: AVCOL_RANGE_JPEG;
avctx->colorspace = version & 1 ? AVCOL_SPC_UNSPECIFIED : AVCOL_SPC_BT709;
if ((ret = ff_thread_get_buffer(avctx, f, 0)) < 0)
if ((ret = ff_thread_get_ext_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
return ret;
s->next_prev_index = s->cur_index;
s->next_cur_index = (s->cur_index - 1) & 1;
ff_thread_finish_setup(avctx);
/* Copy previous frame */
if (is_Pframe) {
frame_copy(s,
frame->f->data,
frame->f->linesize,
prev_frame->f->data,
prev_frame->f->linesize,
version, avctx->width, avctx->height);
goto end;
}
switch (version) {
case 0:
default:
@ -250,6 +346,7 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
*cb++ = *buf32++;
}
}
ff_thread_report_progress(frame, INT_MAX, 0);
break;
case 1:
@ -272,6 +369,7 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
&buf[y * avctx->width * 3],
3 * avctx->width);
}
ff_thread_report_progress(frame, INT_MAX, 0);
break;
case 2:
@ -288,8 +386,13 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
buf + offs[i], offs[i + 1] - offs[i],
is_chroma, 1)) < 0) {
av_log(avctx, AV_LOG_ERROR, "Error decoding plane %i\n", i);
return ret;
}
if (avctx->active_thread_type & FF_THREAD_FRAME) {
ff_thread_report_progress(frame, INT_MAX, 0);
break;
} else
return ret;
} else
ff_thread_report_progress(frame, i, 0);
}
break;
case 3:
@ -300,7 +403,10 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
-f->linesize[0], avctx->width, avctx->height,
buf + offs[i], offs[i + 1] - offs[i], 0, 3)) < 0) {
av_log(avctx, AV_LOG_ERROR, "Error decoding plane %i\n", i);
return ret;
if (avctx->active_thread_type & FF_THREAD_FRAME)
break;
else
return ret;
}
}
out = f->data[0];
@ -314,11 +420,21 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
}
out += f->linesize[0] - 3*avctx->width;
}
ff_thread_report_progress(frame, INT_MAX, 0);
break;
}
end:
if ((ret = av_frame_ref(fout, frame->f)) < 0)
return ret;
*got_frame = 1;
s->prev_index = s->next_prev_index;
s->cur_index = s->next_cur_index;
/* Only release frames that aren't used anymore */
ff_thread_release_ext_buffer(&s->frames[s->cur_index]);
return buf_size;
}
@ -330,8 +446,16 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *f,
static av_cold int decode_end(AVCodecContext *avctx)
{
FrapsContext *s = avctx->priv_data;
int i;
av_freep(&s->tmpbuf);
for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
if (s->frames[i].f)
ff_thread_release_ext_buffer(&s->frames[i]);
av_frame_free(&s->frames[i].f);
}
return 0;
}
@ -345,4 +469,6 @@ const FFCodec ff_fraps_decoder = {
.close = decode_end,
FF_CODEC_DECODE_CB(decode_frame),
.p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
.caps_internal = FF_CODEC_CAP_ALLOCATE_PROGRESS,
UPDATE_THREAD_CONTEXT(update_thread_context),
};

View File

@ -40,6 +40,7 @@
#include "get_bits.h"
#include "golomb.h"
#include "h2645_sei.h"
#include "itut35.h"
#define IS_H264(codec_id) (CONFIG_H264_SEI && CONFIG_HEVC_SEI ? codec_id == AV_CODEC_ID_H264 : CONFIG_H264_SEI)
#define IS_HEVC(codec_id) (CONFIG_H264_SEI && CONFIG_HEVC_SEI ? codec_id == AV_CODEC_ID_HEVC : CONFIG_HEVC_SEI)
@ -140,7 +141,8 @@ static int decode_registered_user_data(H2645SEI *h, GetByteContext *gb,
bytestream2_skipu(gb, 1); // itu_t_t35_country_code_extension_byte
}
if (country_code != 0xB5 && country_code != 0x26) { // usa_country_code and cn_country_code
if (country_code != ITU_T_T35_COUNTRY_CODE_US &&
country_code != ITU_T_T35_COUNTRY_CODE_CN) {
av_log(logctx, AV_LOG_VERBOSE,
"Unsupported User Data Registered ITU-T T35 SEI message (country_code = %d)\n",
country_code);
@ -151,7 +153,7 @@ static int decode_registered_user_data(H2645SEI *h, GetByteContext *gb,
provider_code = bytestream2_get_be16u(gb);
switch (provider_code) {
case 0x31: { // atsc_provider_code
case ITU_T_T35_PROVIDER_CODE_ATSC: {
uint32_t user_identifier;
if (bytestream2_get_bytes_left(gb) < 4)
@ -172,7 +174,7 @@ static int decode_registered_user_data(H2645SEI *h, GetByteContext *gb,
break;
}
#if CONFIG_HEVC_SEI
case 0x04: { // cuva_provider_code
case ITU_T_T35_PROVIDER_CODE_CUVA: {
const uint16_t cuva_provider_oriented_code = 0x0005;
uint16_t provider_oriented_code;
@ -188,7 +190,7 @@ static int decode_registered_user_data(H2645SEI *h, GetByteContext *gb,
}
break;
}
case 0x3C: { // smpte_provider_code
case ITU_T_T35_PROVIDER_CODE_SMTPE: {
// A/341 Amendment - 2094-40
const uint16_t smpte2094_40_provider_oriented_code = 0x0001;
const uint8_t smpte2094_40_application_identifier = 0x04;
@ -209,6 +211,24 @@ static int decode_registered_user_data(H2645SEI *h, GetByteContext *gb,
}
break;
}
case 0x5890: { // aom_provider_code
const uint16_t aom_grain_provider_oriented_code = 0x0001;
uint16_t provider_oriented_code;
if (!IS_HEVC(codec_id))
goto unsupported_provider_code;
if (bytestream2_get_bytes_left(gb) < 2)
return AVERROR_INVALIDDATA;
provider_oriented_code = bytestream2_get_byteu(gb);
if (provider_oriented_code == aom_grain_provider_oriented_code) {
return ff_aom_parse_film_grain_sets(&h->aom_film_grain,
gb->buffer,
bytestream2_get_bytes_left(gb));
}
break;
}
unsupported_provider_code:
#endif
default:
@ -641,35 +661,45 @@ int ff_h2645_sei_to_frame(AVFrame *frame, H2645SEI *sei,
h274 = &fgp->codec.h274;
fgp->seed = seed;
fgp->width = frame->width;
fgp->height = frame->height;
/* H.274 mandates film grain be applied to 4:4:4 frames */
fgp->subsampling_x = fgp->subsampling_y = 0;
h274->model_id = fgc->model_id;
if (fgc->separate_colour_description_present_flag) {
h274->bit_depth_luma = fgc->bit_depth_luma;
h274->bit_depth_chroma = fgc->bit_depth_chroma;
h274->color_range = fgc->full_range + 1;
h274->color_primaries = fgc->color_primaries;
h274->color_trc = fgc->transfer_characteristics;
h274->color_space = fgc->matrix_coeffs;
fgp->bit_depth_luma = fgc->bit_depth_luma;
fgp->bit_depth_chroma = fgc->bit_depth_chroma;
fgp->color_range = fgc->full_range + 1;
fgp->color_primaries = fgc->color_primaries;
fgp->color_trc = fgc->transfer_characteristics;
fgp->color_space = fgc->matrix_coeffs;
} else {
h274->bit_depth_luma = bit_depth_luma;
h274->bit_depth_chroma = bit_depth_chroma;
fgp->bit_depth_luma = bit_depth_luma;
fgp->bit_depth_chroma = bit_depth_chroma;
if (vui->video_signal_type_present_flag)
h274->color_range = vui->video_full_range_flag + 1;
else
h274->color_range = AVCOL_RANGE_UNSPECIFIED;
fgp->color_range = vui->video_full_range_flag + 1;
if (vui->colour_description_present_flag) {
h274->color_primaries = vui->colour_primaries;
h274->color_trc = vui->transfer_characteristics;
h274->color_space = vui->matrix_coeffs;
} else {
h274->color_primaries = AVCOL_PRI_UNSPECIFIED;
h274->color_trc = AVCOL_TRC_UNSPECIFIED;
h274->color_space = AVCOL_SPC_UNSPECIFIED;
fgp->color_primaries = vui->colour_primaries;
fgp->color_trc = vui->transfer_characteristics;
fgp->color_space = vui->matrix_coeffs;
}
}
h274->blending_mode_id = fgc->blending_mode_id;
h274->log2_scale_factor = fgc->log2_scale_factor;
#if FF_API_H274_FILM_GRAIN_VCS
FF_DISABLE_DEPRECATION_WARNINGS
h274->bit_depth_luma = fgp->bit_depth_luma;
h274->bit_depth_chroma = fgp->bit_depth_chroma;
h274->color_range = fgp->color_range;
h274->color_primaries = fgp->color_primaries;
h274->color_trc = fgp->color_trc;
h274->color_space = fgp->color_space;
FF_ENABLE_DEPRECATION_WARNINGS
#endif
memcpy(&h274->component_model_present, &fgc->comp_model_present_flag,
sizeof(h274->component_model_present));
memcpy(&h274->num_intensity_intervals, &fgc->num_intensity_intervals,
@ -692,6 +722,12 @@ int ff_h2645_sei_to_frame(AVFrame *frame, H2645SEI *sei,
avctx->properties |= FF_CODEC_PROPERTY_FILM_GRAIN;
}
#if CONFIG_HEVC_SEI
ret = ff_aom_attach_film_grain_sets(&sei->aom_film_grain, frame);
if (ret < 0)
return ret;
#endif
if (sei->ambient_viewing_environment.present) {
H2645SEIAmbientViewingEnvironment *env =
&sei->ambient_viewing_environment;
@ -788,4 +824,5 @@ void ff_h2645_sei_reset(H2645SEI *s)
s->ambient_viewing_environment.present = 0;
s->mastering_display.present = 0;
s->content_light.present = 0;
s->aom_film_grain.enable = 0;
}

View File

@ -23,7 +23,9 @@
#include "libavutil/buffer.h"
#include "libavutil/frame.h"
#include "libavutil/film_grain_params.h"
#include "aom_film_grain.h"
#include "avcodec.h"
#include "bytestream.h"
#include "codec_id.h"
@ -132,6 +134,7 @@ typedef struct H2645SEI {
H2645SEIAmbientViewingEnvironment ambient_viewing_environment;
H2645SEIMasteringDisplay mastering_display;
H2645SEIContentLight content_light;
AVFilmGrainAFGS1Params aom_film_grain;
} H2645SEI;
enum {

View File

@ -60,11 +60,13 @@ typedef struct H264ParseContext {
int nal_length_size;
int got_first;
int picture_structure;
uint8_t parse_history[6];
uint8_t parse_history[9];
int parse_history_count;
int parse_last_mb;
int64_t reference_dts;
int last_frame_num, last_picture_structure;
int is_mvc;
int slice_ext;
} H264ParseContext;
static int find_start_code(const uint8_t *buf, int buf_size,
@ -122,14 +124,17 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
} else if (state <= 5) {
int nalu_type = buf[i] & 0x1F;
if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
nalu_type == H264_NAL_SUB_SPS) {
if (pc->frame_start_found) {
i++;
goto found;
}
} else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
nalu_type == H264_NAL_IDR_SLICE) {
nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_EXTEN_SLICE)) {
state += 8;
p->slice_ext = (nalu_type == H264_NAL_EXTEN_SLICE);
continue;
}
state = 7;
@ -138,20 +143,22 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
GetBitContext gb;
p->parse_history[p->parse_history_count++] = buf[i];
init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
mb= get_ue_golomb_long(&gb);
if (get_bits_left(&gb) > 0 || p->parse_history_count > 5) {
p->parse_last_mb = mb;
if (pc->frame_start_found) {
if (mb <= last_mb) {
i -= p->parse_history_count - 1;
p->parse_history_count = 0;
goto found;
}
} else
pc->frame_start_found = 1;
p->parse_history_count = 0;
state = 7;
if (!p->slice_ext || p->parse_history_count > 3) {
init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
mb= get_ue_golomb_long(&gb);
if (get_bits_left(&gb) > 0 || p->parse_history_count > (5 + 3*p->slice_ext)) {
p->parse_last_mb = mb;
if (pc->frame_start_found) {
if (mb <= last_mb) {
i -= p->parse_history_count - 1;
p->parse_history_count = 0;
goto found;
}
} else
pc->frame_start_found = 1;
p->parse_history_count = 0;
state = 7;
}
}
}
}
@ -605,6 +612,9 @@ static int h264_parse(AVCodecParserContext *s,
} else {
next = h264_find_frame_end(p, buf, buf_size, avctx);
if (next == END_NOT_FOUND && pc->frame_start_found == 0)
s->fetch_timestamp = 1;
if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
*poutbuf = NULL;
*poutbuf_size = 0;
@ -617,7 +627,8 @@ static int h264_parse(AVCodecParserContext *s,
}
}
parse_nal_units(s, avctx, buf, buf_size);
if (!p->is_mvc)
parse_nal_units(s, avctx, buf, buf_size);
if (avctx->framerate.num)
time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){2, 1}));
@ -688,3 +699,22 @@ const AVCodecParser ff_h264_parser = {
.parser_parse = h264_parse,
.parser_close = h264_close,
};
static av_cold int init_mvc(AVCodecParserContext *s)
{
H264ParseContext *p = s->priv_data;
int ret = init(s);
if (ret < 0)
return ret;
p->is_mvc = 1;
return 0;
}
AVCodecParser ff_h264_mvc_parser = {
.codec_ids = { AV_CODEC_ID_H264_MVC },
.priv_data_size = sizeof(H264ParseContext),
.parser_init = init_mvc,
.parser_parse = h264_parse,
.parser_close = h264_close,
};

View File

@ -470,6 +470,7 @@ int ff_h264_update_thread_context_for_user(AVCodecContext *dst,
h->is_avc = h1->is_avc;
h->nal_length_size = h1->nal_length_size;
h->x264_build = h1->x264_build;
return 0;
}
@ -866,14 +867,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
if (CHROMA444(h)) {
if (h->avctx->colorspace == AVCOL_SPC_RGB)
*fmt++ = AV_PIX_FMT_GBRP;
else if (h->avctx->color_range == AVCOL_RANGE_JPEG)
*fmt++ = AV_PIX_FMT_YUVJ444P;
else
*fmt++ = AV_PIX_FMT_YUV444P;
} else if (CHROMA422(h)) {
if (h->avctx->color_range == AVCOL_RANGE_JPEG)
*fmt++ = AV_PIX_FMT_YUVJ422P;
else
*fmt++ = AV_PIX_FMT_YUV422P;
} else {
#if CONFIG_H264_DXVA2_HWACCEL
@ -889,9 +885,6 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
#if CONFIG_H264_VAAPI_HWACCEL
*fmt++ = AV_PIX_FMT_VAAPI;
#endif
if (h->avctx->color_range == AVCOL_RANGE_JPEG)
*fmt++ = AV_PIX_FMT_YUVJ420P;
else
*fmt++ = AV_PIX_FMT_YUV420P;
}
break;
@ -909,6 +902,11 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback)
return ff_get_format(h->avctx, pix_fmts);
}
enum AVPixelFormat ff_h264_get_pixel_format(H264Context *h)
{
return get_pixel_format(h, 1);
}
/* export coded and cropped frame dimensions to AVCodecContext */
static void init_dimensions(H264Context *h)
{
@ -1086,6 +1084,7 @@ static int h264_init_ps(H264Context *h, const H264SliceContext *sl, int first_sl
h->avctx->profile = ff_h264_get_profile(sps);
h->avctx->level = sps->level_idc;
h->avctx->refs = sps->ref_frame_count;
h->avctx->progressive_sequence = sps->frame_mbs_only_flag;
h->mb_width = sps->mb_width;
h->mb_height = sps->mb_height;
@ -1185,6 +1184,7 @@ static int h264_export_frame_props(H264Context *h)
const H264SEIPictureTiming *pt = &h->sei.picture_timing;
switch (pt->pic_struct) {
case H264_SEI_PIC_STRUCT_FRAME:
interlaced_frame = FIELD_OR_MBAFF_PICTURE(h);
break;
case H264_SEI_PIC_STRUCT_TOP_FIELD:
case H264_SEI_PIC_STRUCT_BOTTOM_FIELD:

View File

@ -417,6 +417,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
ret = 0;
}
}
/* activate the first SPS to determine basic stream information */
if (!h->ps.sps) {
int i;
for (i = 0; i < FF_ARRAY_ELEMS(h->ps.pps_list) && !h->ps.sps; i++) {
if (h->ps.pps_list[i]) {
ff_refstruct_replace(&h->ps.pps, h->ps.pps_list[i]);
h->ps.sps = h->ps.pps->sps;
}
}
}
}
if (h->ps.sps) {
h->avctx->colorspace = h->ps.sps->vui.matrix_coeffs;
h->avctx->pix_fmt = ff_h264_get_pixel_format(h);
if (h->avctx->pix_fmt < 0)
h->avctx->pix_fmt = AV_PIX_FMT_NONE;
h->avctx->profile = ff_h264_get_profile(h->ps.sps);
h->avctx->level = h->ps.sps->level_idc;
h->avctx->refs = h->ps.sps->ref_frame_count;
}
if (h->ps.sps && h->ps.sps->bitstream_restriction_flag &&
@ -497,9 +519,6 @@ static void h264_decode_flush(AVCodecContext *avctx)
h->mb_y = 0;
h->non_gray = 0;
ff_h264_free_tables(h);
h->context_initialized = 0;
if (FF_HW_HAS_CB(avctx, flush))
FF_HW_SIMPLE_CALL(avctx, flush);
}

View File

@ -58,7 +58,7 @@
* The maximum number of slices supported by the decoder.
* must be a power of 2
*/
#define MAX_SLICES 32
#define MAX_SLICES 256
#ifdef ALLOW_INTERLACE
#define MB_MBAFF(h) (h)->mb_mbaff
@ -699,4 +699,6 @@ void ff_h264_free_tables(H264Context *h);
void ff_h264_set_erpic(ERPicture *dst, const H264Picture *src);
enum AVPixelFormat ff_h264_get_pixel_format(H264Context *h);
#endif /* AVCODEC_H264DEC_H */

View File

@ -370,7 +370,7 @@ static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb,
par->bit_rate_du_value_minus1[i] = get_ue_golomb_long(gb);
}
par->cbr_flag = get_bits1(gb);
par->cbr_flag |= get_bits1(gb) << i;
}
}
@ -378,24 +378,24 @@ static int decode_hrd(GetBitContext *gb, int common_inf_present,
HEVCHdrParams *hdr, int max_sublayers)
{
if (common_inf_present) {
hdr->flags.nal_hrd_parameters_present_flag = get_bits1(gb);
hdr->flags.vcl_hrd_parameters_present_flag = get_bits1(gb);
hdr->nal_hrd_parameters_present_flag = get_bits1(gb);
hdr->vcl_hrd_parameters_present_flag = get_bits1(gb);
if (hdr->flags.nal_hrd_parameters_present_flag ||
hdr->flags.vcl_hrd_parameters_present_flag) {
hdr->flags.sub_pic_hrd_params_present_flag = get_bits1(gb);
if (hdr->nal_hrd_parameters_present_flag ||
hdr->vcl_hrd_parameters_present_flag) {
hdr->sub_pic_hrd_params_present_flag = get_bits1(gb);
if (hdr->flags.sub_pic_hrd_params_present_flag) {
if (hdr->sub_pic_hrd_params_present_flag) {
hdr->tick_divisor_minus2 = get_bits(gb, 8);
hdr->du_cpb_removal_delay_increment_length_minus1 = get_bits(gb, 5);
hdr->flags.sub_pic_cpb_params_in_pic_timing_sei_flag = get_bits1(gb);
hdr->sub_pic_cpb_params_in_pic_timing_sei_flag = get_bits1(gb);
hdr->dpb_output_delay_du_length_minus1 = get_bits(gb, 5);
}
hdr->bit_rate_scale = get_bits(gb, 4);
hdr->cpb_size_scale = get_bits(gb, 4);
if (hdr->flags.sub_pic_hrd_params_present_flag)
if (hdr->sub_pic_hrd_params_present_flag)
hdr->cpb_size_du_scale = get_bits(gb, 4);
hdr->initial_cpb_removal_delay_length_minus1 = get_bits(gb, 5);
@ -405,18 +405,22 @@ static int decode_hrd(GetBitContext *gb, int common_inf_present,
}
for (int i = 0; i < max_sublayers; i++) {
hdr->flags.fixed_pic_rate_general_flag = get_bits1(gb);
unsigned fixed_pic_rate_general_flag = get_bits1(gb);
unsigned fixed_pic_rate_within_cvs_flag = 0;
unsigned low_delay_hrd_flag = 0;
hdr->flags.fixed_pic_rate_general_flag |= fixed_pic_rate_general_flag << i;
if (!hdr->flags.fixed_pic_rate_general_flag)
hdr->flags.fixed_pic_rate_within_cvs_flag = get_bits1(gb);
if (!fixed_pic_rate_general_flag)
fixed_pic_rate_within_cvs_flag = get_bits1(gb);
hdr->flags.fixed_pic_rate_within_cvs_flag |= fixed_pic_rate_within_cvs_flag << i;
if (hdr->flags.fixed_pic_rate_within_cvs_flag ||
hdr->flags.fixed_pic_rate_general_flag)
if (fixed_pic_rate_within_cvs_flag || fixed_pic_rate_general_flag)
hdr->elemental_duration_in_tc_minus1[i] = get_ue_golomb_long(gb);
else
hdr->flags.low_delay_hrd_flag = get_bits1(gb);
low_delay_hrd_flag = get_bits1(gb);
hdr->flags.low_delay_hrd_flag |= low_delay_hrd_flag << i;
if (!hdr->flags.low_delay_hrd_flag) {
if (!low_delay_hrd_flag) {
unsigned cpb_cnt_minus1 = get_ue_golomb_long(gb);
if (cpb_cnt_minus1 > 31) {
av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n",
@ -426,25 +430,32 @@ static int decode_hrd(GetBitContext *gb, int common_inf_present,
hdr->cpb_cnt_minus1[i] = cpb_cnt_minus1;
}
if (hdr->flags.nal_hrd_parameters_present_flag)
if (hdr->nal_hrd_parameters_present_flag)
decode_sublayer_hrd(gb, hdr->cpb_cnt_minus1[i]+1, &hdr->nal_params[i],
hdr->flags.sub_pic_hrd_params_present_flag);
hdr->sub_pic_hrd_params_present_flag);
if (hdr->flags.vcl_hrd_parameters_present_flag)
if (hdr->vcl_hrd_parameters_present_flag)
decode_sublayer_hrd(gb, hdr->cpb_cnt_minus1[i]+1, &hdr->vcl_params[i],
hdr->flags.sub_pic_hrd_params_present_flag);
hdr->sub_pic_hrd_params_present_flag);
}
return 0;
}
static void uninit_vps(FFRefStructOpaque opaque, void *obj)
{
HEVCVPS *vps = obj;
av_freep(&vps->hdr);
}
int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
HEVCParamSets *ps)
{
int i,j;
int vps_id = 0;
ptrdiff_t nal_size;
HEVCVPS *vps = ff_refstruct_allocz(sizeof(*vps));
HEVCVPS *vps = ff_refstruct_alloc_ext(sizeof(*vps), 0, NULL, uninit_vps);
if (!vps)
return AVERROR(ENOMEM);
@ -533,6 +544,11 @@ int ff_hevc_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
"vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
goto err;
}
vps->hdr = av_calloc(vps->vps_num_hrd_parameters, sizeof(*vps->hdr));
if (!vps->hdr)
goto err;
for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
int common_inf_present = 1;
@ -577,8 +593,6 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
ff_h2645_decode_common_vui_params(gb, &sps->vui.common, avctx);
if (vui->common.video_signal_type_present_flag) {
if (vui->common.video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
if (vui->common.colour_description_present_flag) {
if (vui->common.matrix_coeffs == AVCOL_SPC_RGB) {
switch (sps->pix_fmt) {

View File

@ -39,18 +39,19 @@ typedef struct HEVCSublayerHdrParams {
uint32_t cbr_flag;
} HEVCSublayerHdrParams;
// flags in bitmask form
typedef struct HEVCHdrFlagParams {
uint32_t nal_hrd_parameters_present_flag;
uint32_t vcl_hrd_parameters_present_flag;
uint32_t sub_pic_hrd_params_present_flag;
uint32_t sub_pic_cpb_params_in_pic_timing_sei_flag;
uint32_t fixed_pic_rate_general_flag;
uint32_t fixed_pic_rate_within_cvs_flag;
uint32_t low_delay_hrd_flag;
uint8_t fixed_pic_rate_general_flag;
uint8_t fixed_pic_rate_within_cvs_flag;
uint8_t low_delay_hrd_flag;
} HEVCHdrFlagParams;
typedef struct HEVCHdrParams {
HEVCHdrFlagParams flags;
uint8_t nal_hrd_parameters_present_flag;
uint8_t vcl_hrd_parameters_present_flag;
uint8_t sub_pic_hrd_params_present_flag;
uint8_t sub_pic_cpb_params_in_pic_timing_sei_flag;
uint8_t tick_divisor_minus2;
uint8_t du_cpb_removal_delay_increment_length_minus1;
@ -152,7 +153,7 @@ typedef struct PTL {
typedef struct HEVCVPS {
unsigned int vps_id;
HEVCHdrParams hdr[HEVC_MAX_LAYER_SETS];
HEVCHdrParams *hdr;
uint8_t vps_temporal_id_nesting_flag;
int vps_max_layers;

View File

@ -50,6 +50,8 @@ void ff_hevc_unref_frame(HEVCFrame *frame, int flags)
frame->refPicList = NULL;
ff_refstruct_unref(&frame->hwaccel_picture_private);
frame->missing = 0;
}
}
@ -440,6 +442,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
frame->poc = poc;
frame->sequence = HEVC_SEQUENCE_COUNTER_INVALID;
frame->flags = 0;
frame->missing = 1;
if (s->threads_type == FF_THREAD_FRAME)
ff_thread_report_progress(&frame->tf, INT_MAX, 0);

View File

@ -29,12 +29,14 @@
#include "libavutil/avstring.h"
#include "libavutil/common.h"
#include "libavutil/film_grain_params.h"
#include "libavutil/hdr_dynamic_metadata.h"
#include "libavutil/internal.h"
#include "libavutil/md5.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/timecode.h"
#include "aom_film_grain.h"
#include "bswapdsp.h"
#include "cabac_functions.h"
#include "codec_internal.h"
@ -388,7 +390,8 @@ static int export_stream_params_from_sei(HEVCContext *s)
avctx->color_trc = s->sei.common.alternative_transfer.preferred_transfer_characteristics;
}
if (s->sei.common.film_grain_characteristics.present)
if (s->sei.common.film_grain_characteristics.present ||
s->sei.common.aom_film_grain.enable)
avctx->properties |= FF_CODEC_PROPERTY_FILM_GRAIN;
return 0;
@ -2815,7 +2818,30 @@ static int set_side_data(HEVCContext *s)
}
if (s->sei.common.dynamic_hdr_plus.info) {
AVBufferRef *info_ref = av_buffer_ref(s->sei.common.dynamic_hdr_plus.info);
AVBufferRef *info_ref;
AVDynamicHDRPlus *metadata = (AVDynamicHDRPlus*)s->sei.common.dynamic_hdr_plus.info->data;
// fill in window 0 (full frame) and convert to relative coordinates
if (metadata->params[0].window_lower_right_corner_x.num == 0)
{
// ensure the buffer is writable
av_buffer_make_writable(&s->sei.common.dynamic_hdr_plus.info);
metadata = (AVDynamicHDRPlus*)s->sei.common.dynamic_hdr_plus.info->data;
// Convert coordinates to relative coordinate in [0, 1].
metadata->params[0].window_upper_left_corner_x.num = 0;
metadata->params[0].window_upper_left_corner_y.num = 0;
metadata->params[0].window_lower_right_corner_x.num = out->width - 1;
metadata->params[0].window_lower_right_corner_y.num = out->height - 1;
for (int w = 0; w < metadata->num_windows; w++) {
metadata->params[w].window_upper_left_corner_x.den = out->width - 1;
metadata->params[w].window_upper_left_corner_y.den = out->height - 1;
metadata->params[w].window_lower_right_corner_x.den = out->width - 1;
metadata->params[w].window_lower_right_corner_y.den = out->height - 1;
}
}
info_ref = av_buffer_ref(s->sei.common.dynamic_hdr_plus.info);
if (!info_ref)
return AVERROR(ENOMEM);
@ -2885,11 +2911,13 @@ static int hevc_frame_start(HEVCContext *s)
else
s->ref->frame->flags &= ~AV_FRAME_FLAG_KEY;
s->ref->needs_fg = s->sei.common.film_grain_characteristics.present &&
s->ref->needs_fg = (s->sei.common.film_grain_characteristics.present ||
s->sei.common.aom_film_grain.enable) &&
!(s->avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) &&
!s->avctx->hwaccel;
if (s->ref->needs_fg &&
s->sei.common.film_grain_characteristics.present &&
!ff_h274_film_grain_params_supported(s->sei.common.film_grain_characteristics.model_id,
s->ref->frame->format)) {
av_log_once(s->avctx, AV_LOG_WARNING, AV_LOG_DEBUG, &s->film_grain_warning_shown,
@ -2934,14 +2962,24 @@ fail:
static int hevc_frame_end(HEVCContext *s)
{
HEVCFrame *out = s->ref;
const AVFrameSideData *sd;
const AVFilmGrainParams *fgp;
av_unused int ret;
if (out->needs_fg) {
sd = av_frame_get_side_data(out->frame, AV_FRAME_DATA_FILM_GRAIN_PARAMS);
av_assert0(out->frame_grain->buf[0] && sd);
ret = ff_h274_apply_film_grain(out->frame_grain, out->frame, &s->h274db,
(AVFilmGrainParams *) sd->data);
av_assert0(out->frame_grain->buf[0]);
fgp = av_film_grain_params_select(out->frame);
switch (fgp->type) {
case AV_FILM_GRAIN_PARAMS_NONE:
av_assert0(0);
return AVERROR_BUG;
case AV_FILM_GRAIN_PARAMS_H274:
ret = ff_h274_apply_film_grain(out->frame_grain, out->frame,
&s->h274db, fgp);
break;
case AV_FILM_GRAIN_PARAMS_AV1:
ret = ff_aom_apply_film_grain(out->frame_grain, out->frame, fgp);
break;
}
av_assert1(ret >= 0);
}
@ -3596,6 +3634,7 @@ static int hevc_update_thread_context(AVCodecContext *dst,
s->sei.common.alternative_transfer = s0->sei.common.alternative_transfer;
s->sei.common.mastering_display = s0->sei.common.mastering_display;
s->sei.common.content_light = s0->sei.common.content_light;
s->sei.common.aom_film_grain = s0->sei.common.aom_film_grain;
ret = export_stream_params_from_sei(s);
if (ret < 0)
@ -3639,7 +3678,8 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
if (avctx->extradata_size > 0 && avctx->extradata) {
ret = hevc_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
if (ret < 0) {
return ret;
s->is_nalff = 0;
av_log(avctx, AV_LOG_ERROR, "Invalid extradata ignored\n");
}
}

View File

@ -377,6 +377,11 @@ typedef struct HEVCFrame {
* A combination of HEVC_FRAME_FLAG_*
*/
uint8_t flags;
/**
* 1 - a dummy frame generated in place of a missing frame
*/
int missing;
} HEVCFrame;
typedef struct HEVCLocalContext {

View File

@ -78,4 +78,7 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
#if ARCH_MIPS
ff_hevc_pred_init_mips(hpc, bit_depth);
#endif
#if ARCH_X86
ff_hevc_pred_init_x86(hpc, bit_depth);
#endif
}

View File

@ -42,5 +42,6 @@ typedef struct HEVCPredContext {
void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth);
#endif /* AVCODEC_HEVCPRED_H */

View File

@ -64,7 +64,7 @@ void ff_hwaccel_uninit(AVCodecContext *avctx);
#define HWACCEL_DXVA2(codec) \
HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD, DXVA2, ff_ ## codec ## _dxva2_hwaccel)
#define HWACCEL_D3D11VA2(codec) \
HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
HW_CONFIG_HWACCEL(1, 1, 1, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
#define HWACCEL_NVDEC(codec) \
HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
#define HWACCEL_VAAPI(codec) \

30
libavcodec/itut35.h Normal file
View File

@ -0,0 +1,30 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ITUT35_H
#define AVCODEC_ITUT35_H
#define ITU_T_T35_COUNTRY_CODE_CN 0x26
#define ITU_T_T35_COUNTRY_CODE_US 0xB5
#define ITU_T_T35_PROVIDER_CODE_ATSC 0x31
#define ITU_T_T35_PROVIDER_CODE_CUVA 0x04
#define ITU_T_T35_PROVIDER_CODE_DOLBY 0x3B
#define ITU_T_T35_PROVIDER_CODE_SMTPE 0x3C
#endif /* AVCODEC_ITUT35_H */

View File

@ -35,6 +35,7 @@
#include "ffjni.h"
static void *java_vm;
static void *android_app_ctx;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
int av_jni_set_java_vm(void *vm, void *log_ctx)
@ -77,3 +78,45 @@ void *av_jni_get_java_vm(void *log_ctx)
}
#endif
#if defined(__ANDROID__)
/**
 * Store the Android application context for later retrieval via
 * av_jni_get_android_app_ctx().
 *
 * @param app_ctx JNI global reference to the application context; it is
 *                validated with GetObjectRefType() because only a global
 *                reference stays valid outside the caller's local frame
 * @param log_ctx context for logging, may be NULL
 * @return 0 on success, AVERROR(EINVAL) if no JNIEnv is available or the
 *         reference is not global, AVERROR(ENOSYS) when built without JNI
 */
int av_jni_set_android_app_ctx(void *app_ctx, void *log_ctx)
{
#if CONFIG_JNI
    JNIEnv *env = ff_jni_get_env(log_ctx);
    if (!env)
        return AVERROR(EINVAL);

    jobjectRefType type = (*env)->GetObjectRefType(env, app_ctx);
    if (type != JNIGlobalRefType) {
        // FFmpeg log messages are newline-terminated by convention.
        av_log(log_ctx, AV_LOG_ERROR, "Application context must be passed as a global reference\n");
        return AVERROR(EINVAL);
    }

    // Serialize against av_jni_get_android_app_ctx() on the shared global.
    pthread_mutex_lock(&lock);
    android_app_ctx = app_ctx;
    pthread_mutex_unlock(&lock);

    return 0;
#else
    return AVERROR(ENOSYS);
#endif
}
/**
 * Fetch the Android application context previously stored with
 * av_jni_set_android_app_ctx().
 *
 * @return the stored context, or NULL if none was set or JNI is disabled
 */
void *av_jni_get_android_app_ctx(void)
{
#if CONFIG_JNI
    pthread_mutex_lock(&lock);
    void *app_ctx = android_app_ctx;
    pthread_mutex_unlock(&lock);

    return app_ctx;
#else
    return NULL;
#endif
}
#endif

View File

@ -43,4 +43,25 @@ int av_jni_set_java_vm(void *vm, void *log_ctx);
*/
void *av_jni_get_java_vm(void *log_ctx);
/*
* Set the Android application context which will be used to retrieve the Android
* content resolver to handle content URIs.
*
* This function is only available on Android.
*
* @param app_ctx global JNI reference to the Android application context
* @return 0 on success, < 0 otherwise
*/
int av_jni_set_android_app_ctx(void *app_ctx, void *log_ctx);
/*
* Get the Android application context that has been set with
* av_jni_set_android_app_ctx.
*
* This function is only available on Android.
*
* @return a pointer to the Android application context
*/
void *av_jni_get_android_app_ctx(void);
#endif /* AVCODEC_JNI_H */

View File

@ -4,6 +4,10 @@ LIBAVCODEC_MAJOR {
avcodec_*;
avpriv_*;
avsubtitle_free;
#LAV usage
ff_vc1_pixel_aspect;
ff_crop_tab;
ff_flac_is_extradata_valid;
local:
*;
};

View File

@ -37,6 +37,7 @@
#include "decode.h"
#include "dovi_rpu.h"
#include "internal.h"
#include "itut35.h"
#define FF_DAV1D_VERSION_AT_LEAST(x,y) \
(DAV1D_API_VERSION_MAJOR > (x) || DAV1D_API_VERSION_MAJOR == (x) && DAV1D_API_VERSION_MINOR >= (y))
@ -304,10 +305,6 @@ static void libdav1d_flush(AVCodecContext *c)
dav1d_flush(dav1d->c);
}
typedef struct OpaqueData {
void *pkt_orig_opaque;
} OpaqueData;
static void libdav1d_data_free(const uint8_t *data, void *opaque) {
AVBufferRef *buf = opaque;
@ -317,7 +314,6 @@ static void libdav1d_data_free(const uint8_t *data, void *opaque) {
static void libdav1d_user_data_free(const uint8_t *data, void *opaque) {
AVPacket *pkt = opaque;
av_assert0(data == opaque);
av_free(pkt->opaque);
av_packet_free(&pkt);
}
@ -340,8 +336,6 @@ static int libdav1d_receive_frame_internal(AVCodecContext *c, Dav1dPicture *p)
}
if (pkt->size) {
OpaqueData *od = NULL;
res = dav1d_data_wrap(data, pkt->data, pkt->size,
libdav1d_data_free, pkt->buf);
if (res < 0) {
@ -351,21 +345,9 @@ static int libdav1d_receive_frame_internal(AVCodecContext *c, Dav1dPicture *p)
pkt->buf = NULL;
if (pkt->opaque && (c->flags & AV_CODEC_FLAG_COPY_OPAQUE)) {
od = av_mallocz(sizeof(*od));
if (!od) {
av_packet_free(&pkt);
dav1d_data_unref(data);
return AVERROR(ENOMEM);
}
od->pkt_orig_opaque = pkt->opaque;
}
pkt->opaque = od;
res = dav1d_data_wrap_user_data(data, (const uint8_t *)pkt,
libdav1d_user_data_free, pkt);
if (res < 0) {
av_free(pkt->opaque);
av_packet_free(&pkt);
dav1d_data_unref(data);
return res;
@ -404,7 +386,6 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
Libdav1dContext *dav1d = c->priv_data;
Dav1dPicture pic = { 0 }, *p = &pic;
AVPacket *pkt;
OpaqueData *od = NULL;
#if FF_DAV1D_VERSION_AT_LEAST(5,1)
enum Dav1dEventFlags event_flags = 0;
#endif
@ -459,16 +440,9 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
ff_set_sar(c, frame->sample_aspect_ratio);
pkt = (AVPacket *)p->m.user_data.data;
od = pkt->opaque;
// restore the original user opaque value for
// ff_decode_frame_props_from_pkt()
pkt->opaque = od ? od->pkt_orig_opaque : NULL;
av_freep(&od);
// match timestamps and packet size
res = ff_decode_frame_props_from_pkt(c, frame, pkt);
pkt->opaque = NULL;
if (res < 0)
goto fail;
@ -542,7 +516,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
provider_code = bytestream2_get_be16(&gb);
switch (provider_code) {
case 0x31: { // atsc_provider_code
case ITU_T_T35_PROVIDER_CODE_ATSC: {
uint32_t user_identifier = bytestream2_get_be32(&gb);
switch (user_identifier) {
case MKBETAG('G', 'A', '9', '4'): { // closed captions
@ -566,12 +540,12 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
}
break;
}
case 0x3C: { // smpte_provider_code
case ITU_T_T35_PROVIDER_CODE_SMTPE: {
AVDynamicHDRPlus *hdrplus;
int provider_oriented_code = bytestream2_get_be16(&gb);
int application_identifier = bytestream2_get_byte(&gb);
if (itut_t35->country_code != 0xB5 ||
if (itut_t35->country_code != ITU_T_T35_COUNTRY_CODE_US ||
provider_oriented_code != 1 || application_identifier != 4)
break;
@ -587,9 +561,10 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
goto fail;
break;
}
case 0x3B: { // dolby_provider_code
case ITU_T_T35_PROVIDER_CODE_DOLBY: {
int provider_oriented_code = bytestream2_get_be32(&gb);
if (itut_t35->country_code != 0xB5 || provider_oriented_code != 0x800)
if (itut_t35->country_code != ITU_T_T35_COUNTRY_CODE_US ||
provider_oriented_code != 0x800)
break;
res = ff_dovi_rpu_parse(&dav1d->dovi, gb.buffer, gb.buffer_end - gb.buffer);
@ -613,6 +588,8 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
if (p->frame_hdr->film_grain.present && (!dav1d->apply_grain ||
(c->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN))) {
AVFilmGrainParams *fgp = av_film_grain_params_create_side_data(frame);
const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(frame->format);
av_assert0(pixdesc);
if (!fgp) {
res = AVERROR(ENOMEM);
goto fail;
@ -620,6 +597,14 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
fgp->type = AV_FILM_GRAIN_PARAMS_AV1;
fgp->seed = p->frame_hdr->film_grain.data.seed;
fgp->width = frame->width;
fgp->height = frame->height;
fgp->color_range = frame->color_range;
fgp->color_primaries = frame->color_primaries;
fgp->color_trc = frame->color_trc;
fgp->color_space = frame->colorspace;
fgp->subsampling_x = pixdesc->log2_chroma_w;
fgp->subsampling_y = pixdesc->log2_chroma_h;
fgp->codec.aom.num_y_points = p->frame_hdr->film_grain.data.num_y_points;
fgp->codec.aom.chroma_scaling_from_luma = p->frame_hdr->film_grain.data.chroma_scaling_from_luma;
fgp->codec.aom.scaling_shift = p->frame_hdr->film_grain.data.scaling_shift;

View File

@ -472,12 +472,8 @@ static int librav1e_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
fd->frame_opaque = frame->opaque;
ret = av_buffer_replace(&fd->frame_opaque_ref, frame->opaque_ref);
if (ret < 0) {
frame_data_free(fd);
av_frame_unref(frame);
return ret;
}
fd->frame_opaque_ref = frame->opaque_ref;
frame->opaque_ref = NULL;
}
rframe = rav1e_frame_new(ctx->ctx);

View File

@ -27,6 +27,8 @@
#include "libavutil/common.h"
#include "libavutil/frame.h"
#include "libavutil/imgutils.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mastering_display_metadata.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avassert.h"
@ -136,6 +138,69 @@ static int alloc_buffer(EbSvtAv1EncConfiguration *config, SvtContext *svt_enc)
}
/* Translate AVMasteringDisplayMetadata into SVT-AV1's mastering display
 * info. Chromaticities are rescaled to 0.16 fixed point (1/65536 units),
 * max luminance to 24.8 and min luminance to 18.14 fixed point.
 *
 * NOTE(review): values are written with AV_WB16/AV_WB32, i.e. byte-swapped
 * to big-endian inside the struct fields -- confirm the SVT-AV1 API expects
 * network byte order here rather than native-endian integers. */
static void handle_mdcv(struct EbSvtAv1MasteringDisplayInfo *dst,
                        const AVMasteringDisplayMetadata *mdcv)
{
    if (mdcv->has_primaries) {
        /* Destination chroma points, paired index-wise with
         * mdcv->display_primaries[0..2] below. */
        const struct EbSvtAv1ChromaPoints *const points[] = {
            &dst->r,
            &dst->g,
            &dst->b,
        };

        for (int i = 0; i < 3; i++) {
            /* deliberately shadows the outer dst for the loop body */
            const struct EbSvtAv1ChromaPoints *dst = points[i];
            const AVRational *src = mdcv->display_primaries[i];

            /* av_rescale_q(1, q, 1/65536) == q expressed in 1/65536 units */
            AV_WB16(&dst->x,
                    av_rescale_q(1, src[0], (AVRational){ 1, (1 << 16) }));
            AV_WB16(&dst->y,
                    av_rescale_q(1, src[1], (AVRational){ 1, (1 << 16) }));
        }

        AV_WB16(&dst->white_point.x,
                av_rescale_q(1, mdcv->white_point[0],
                             (AVRational){ 1, (1 << 16) }));
        AV_WB16(&dst->white_point.y,
                av_rescale_q(1, mdcv->white_point[1],
                             (AVRational){ 1, (1 << 16) }));
    }

    if (mdcv->has_luminance) {
        AV_WB32(&dst->max_luma,
                av_rescale_q(1, mdcv->max_luminance,
                             (AVRational){ 1, (1 << 8) }));
        AV_WB32(&dst->min_luma,
                av_rescale_q(1, mdcv->min_luminance,
                             (AVRational){ 1, (1 << 14) }));
    }
}
/* Propagate stream-level HDR side data -- content light level and mastering
 * display metadata -- from the codec context into the SVT-AV1 encoder
 * configuration. */
static void handle_side_data(AVCodecContext *avctx,
                             EbSvtAv1EncConfiguration *param)
{
    const AVFrameSideData *sd;

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
    if (sd) {
        const AVContentLightMetadata *cll = (AVContentLightMetadata *)sd->data;

        AV_WB16(&param->content_light_level.max_cll, cll->MaxCLL);
        AV_WB16(&param->content_light_level.max_fall, cll->MaxFALL);
    }

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
    if (sd)
        handle_mdcv(&param->mastering_display,
                    (AVMasteringDisplayMetadata *)sd->data);
}
static int config_enc_params(EbSvtAv1EncConfiguration *param,
AVCodecContext *avctx)
{
@ -254,6 +319,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
/* 2 = IDR, closed GOP, 1 = CRA, open GOP */
param->intra_refresh_type = avctx->flags & AV_CODEC_FLAG_CLOSED_GOP ? 2 : 1;
handle_side_data(avctx, param);
#if SVT_AV1_CHECK_VERSION(0, 9, 1)
while ((en = av_dict_get(svt_enc->svtav1_opts, "", en, AV_DICT_IGNORE_SUFFIX))) {
EbErrorType ret = svt_av1_enc_parse_parameter(param, en->key, en->value);

View File

@ -25,6 +25,7 @@
#include "libavutil/eval.h"
#include "libavutil/internal.h"
#include "libavutil/opt.h"
#include "libavutil/mastering_display_metadata.h"
#include "libavutil/mem.h"
#include "libavutil/pixdesc.h"
#include "libavutil/stereo3d.h"
@ -38,6 +39,7 @@
#include "packet_internal.h"
#include "atsc_a53.h"
#include "sei.h"
#include "golomb.h"
#include <x264.h>
#include <float.h>
@ -847,12 +849,224 @@ static int convert_pix_fmt(enum AVPixelFormat pix_fmt)
return 0;
}
/* Stash the x264 SEI header NAL so it can be emitted with the first output
 * packet instead of being stored in extradata.
 *
 * Returns 0 on success, AVERROR(ENOMEM) on allocation failure.
 *
 * NOTE(review): the "+ 25" offset skips the NAL/SEI header bytes so only the
 * human-readable x264 options string is logged -- verify the offset against
 * x264's SEI payload layout if the log output ever looks truncated. */
static int save_sei(AVCodecContext *avctx, x264_nal_t *nal)
{
    X264Context *x4 = avctx->priv_data;

    av_log(avctx, AV_LOG_INFO, "%s\n", nal->p_payload + 25);

    /* keep a private copy; nal->p_payload is owned by x264 */
    x4->sei_size = nal->i_payload;
    x4->sei = av_malloc(x4->sei_size);
    if (!x4->sei)
        return AVERROR(ENOMEM);

    memcpy(x4->sei, nal->p_payload, nal->i_payload);

    return 0;
}
#if CONFIG_LIBX264_ENCODER
/* Build avcC (AVCDecoderConfigurationRecord) extradata from the encoder's
 * header NALs; used when b_annexb is disabled (length-prefixed output for
 * mp4/mov-style muxers). SEI NALs are diverted to save_sei() instead.
 *
 * Returns 0 on success, AVERROR_EXTERNAL if SPS/PPS are missing,
 * AVERROR(ENOMEM) on allocation failure. */
static int set_avcc_extradata(AVCodecContext *avctx, x264_nal_t *nal, int nnal)
{
    x264_nal_t *sps_nal = NULL;
    x264_nal_t *pps_nal = NULL;
    uint8_t *p, *sps;
    int ret;

    /* We know it's in the order of SPS/PPS/SEI, but it's not documented in x264 API.
     * The x264 param i_sps_id implies there is a single pair of SPS/PPS.
     */
    for (int i = 0; i < nnal; i++) {
        switch (nal[i].i_type) {
        case NAL_SPS:
            sps_nal = &nal[i];
            break;
        case NAL_PPS:
            pps_nal = &nal[i];
            break;
        case NAL_SEI:
            /* kept out of extradata, attached to the first packet instead */
            ret = save_sei(avctx, &nal[i]);
            if (ret < 0)
                return ret;
            break;
        }
    }
    if (!sps_nal || !pps_nal)
        return AVERROR_EXTERNAL;

    /* 7 fixed avcC bytes plus the SPS/PPS payloads; may be trimmed below once
     * the actual end pointer is known. */
    avctx->extradata_size = sps_nal->i_payload + pps_nal->i_payload + 7;
    avctx->extradata = av_mallocz(avctx->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
    if (!avctx->extradata)
        return AVERROR(ENOMEM);

    // Now create AVCDecoderConfigurationRecord
    p = avctx->extradata;
    // Skip size part
    sps = sps_nal->p_payload + 4;
    *p++ = 1; // version
    *p++ = sps[1]; // AVCProfileIndication
    *p++ = sps[2]; // profile_compatibility
    *p++ = sps[3]; // AVCLevelIndication
    *p++ = 0xFF; // 6 bits reserved (111111) + lengthSizeMinusOne == 3 (4-byte lengths)
    *p++ = 0xE0 | 0x01; // 3 bits reserved (111) + 5 bits number of sps

    /* +2 keeps the low 16 bits of the size prefix as the NAL length field */
    memcpy(p, sps_nal->p_payload + 2, sps_nal->i_payload - 2);
    // Make sps has AV_INPUT_BUFFER_PADDING_SIZE padding, so it can be used
    // with GetBitContext
    sps = p + 2;
    p += sps_nal->i_payload - 2;
    *p++ = 1; // number of pps
    memcpy(p, pps_nal->p_payload + 2, pps_nal->i_payload - 2);
    p += pps_nal->i_payload - 2;

    /* profiles other than Baseline(66)/Main(77)/Extended(88) carry the
     * chroma-format / bit-depth extension block in avcC */
    if (sps[3] != 66 && sps[3] != 77 && sps[3] != 88) {
        GetBitContext gbc;
        int chroma_format_idc;
        int bit_depth_luma_minus8, bit_depth_chroma_minus8;

        /* It's not possible to have emulation prevention byte before
         * bit_depth_chroma_minus8 due to the range of sps id, chroma_format_idc
         * and so on. So we can read directly without need to escape emulation
         * prevention byte.
         *
         * +4 to skip until sps id.
         */
        init_get_bits8(&gbc, sps + 4, sps_nal->i_payload - 4 - 4);

        // Skip sps id
        get_ue_golomb_31(&gbc);
        chroma_format_idc = get_ue_golomb_31(&gbc);
        if (chroma_format_idc == 3)
            skip_bits1(&gbc); // separate_colour_plane_flag
        bit_depth_luma_minus8 = get_ue_golomb_31(&gbc);
        bit_depth_chroma_minus8 = get_ue_golomb_31(&gbc);

        *p++ = 0xFC | chroma_format_idc;
        *p++ = 0xF8 | bit_depth_luma_minus8;
        *p++ = 0xF8 | bit_depth_chroma_minus8;
        *p++ = 0; // number of sps ext
    }

    av_assert2(avctx->extradata + avctx->extradata_size >= p);
    /* shrink to the bytes actually written */
    avctx->extradata_size = p - avctx->extradata;

    return 0;
}
#endif
/* Populate avctx->extradata with the encoder's global headers. With Annex B
 * disabled this delegates to set_avcc_extradata(); otherwise all header NALs
 * except SEI (which is saved for the first packet) are concatenated.
 *
 * Returns 0 on success or a negative AVERROR code. */
static int set_extradata(AVCodecContext *avctx)
{
    X264Context *x4 = avctx->priv_data;
    x264_nal_t *nals;
    uint8_t *dst;
    int nb_nals, size;

    size = x264_encoder_headers(x4->enc, &nals, &nb_nals);
    if (size < 0)
        return AVERROR_EXTERNAL;

#if CONFIG_LIBX264_ENCODER
    /* length-prefixed output wants avcC-style extradata */
    if (!x4->params.b_annexb)
        return set_avcc_extradata(avctx, nals, nb_nals);
#endif

    avctx->extradata = dst = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
    if (!dst)
        return AVERROR(ENOMEM);

    for (int i = 0; i < nb_nals; i++) {
        /* Don't put the SEI in extradata. */
        if (nals[i].i_type == NAL_SEI) {
            int ret = save_sei(avctx, &nals[i]);
            if (ret < 0)
                return ret;
            continue;
        }
        memcpy(dst, nals[i].p_payload, nals[i].i_payload);
        dst += nals[i].i_payload;
    }
    avctx->extradata_size = dst - avctx->extradata;

    return 0;
}
/* Parse the string option x4->var through x264_param_parse() under the x264
 * option name 'name'; logs and returns AVERROR(EINVAL) from the enclosing
 * function on failure. NOTE(review): expands to a bare if (no do/while(0)
 * wrapper), so do not use it as the body of an unbraced if/else. */
#define PARSE_X264_OPT(name, var)\
if (x4->var && x264_param_parse(&x4->params, name, x4->var) < 0) {\
av_log(avctx, AV_LOG_ERROR, "Error parsing option '%s' with value '%s'.\n", name, x4->var);\
return AVERROR(EINVAL);\
}
#if CONFIG_LIBX264_HDR10
/* Fill x264's mastering display parameters from AVMasteringDisplayMetadata.
 * Chromaticities are expressed in 1/50000 units and luminance in 1/10000
 * units, matching the H.264 mastering display SEI convention. */
static void handle_mdcv(x264_param_t *params,
                        const AVMasteringDisplayMetadata *mdcv)
{
    if (!mdcv->has_primaries && !mdcv->has_luminance)
        return;

    params->mastering_display.b_mastering_display = 1;

    if (mdcv->has_primaries) {
        /* destination x/y pairs, matched index-wise with display_primaries */
        int *const coords[3][2] = {
            { &params->mastering_display.i_red_x,
              &params->mastering_display.i_red_y },
            { &params->mastering_display.i_green_x,
              &params->mastering_display.i_green_y },
            { &params->mastering_display.i_blue_x,
              &params->mastering_display.i_blue_y },
        };

        for (int i = 0; i < 3; i++) {
            const AVRational *primary = mdcv->display_primaries[i];

            *coords[i][0] = av_rescale_q(1, primary[0], (AVRational){ 1, 50000 });
            *coords[i][1] = av_rescale_q(1, primary[1], (AVRational){ 1, 50000 });
        }

        params->mastering_display.i_white_x =
            av_rescale_q(1, mdcv->white_point[0], (AVRational){ 1, 50000 });
        params->mastering_display.i_white_y =
            av_rescale_q(1, mdcv->white_point[1], (AVRational){ 1, 50000 });
    }

    if (mdcv->has_luminance) {
        params->mastering_display.i_display_max =
            av_rescale_q(1, mdcv->max_luminance, (AVRational){ 1, 10000 });
        params->mastering_display.i_display_min =
            av_rescale_q(1, mdcv->min_luminance, (AVRational){ 1, 10000 });
    }
}
#endif // CONFIG_LIBX264_HDR10
/* Apply stream-level HDR side data (content light level and mastering
 * display) from the codec context to the x264 parameter set. Compiles to a
 * no-op unless libx264 was built with HDR10 metadata support. */
static void handle_side_data(AVCodecContext *avctx, x264_param_t *params)
{
#if CONFIG_LIBX264_HDR10
    const AVFrameSideData *sd;

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
    if (sd) {
        const AVContentLightMetadata *cll = (AVContentLightMetadata *)sd->data;

        params->content_light_level.i_max_cll  = cll->MaxCLL;
        params->content_light_level.i_max_fall = cll->MaxFALL;
        params->content_light_level.b_cll      = 1;
    }

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
    if (sd)
        handle_mdcv(params, (AVMasteringDisplayMetadata *)sd->data);
#endif // CONFIG_LIBX264_HDR10
}
static av_cold int X264_init(AVCodecContext *avctx)
{
X264Context *x4 = avctx->priv_data;
@ -1153,6 +1367,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (avctx->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED)
x4->params.vui.i_chroma_loc = avctx->chroma_sample_location - 1;
handle_side_data(avctx, &x4->params);
if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)
x4->params.b_repeat_headers = 0;
@ -1215,30 +1431,9 @@ FF_ENABLE_DEPRECATION_WARNINGS
return AVERROR_EXTERNAL;
if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) {
x264_nal_t *nal;
uint8_t *p;
int nnal, s, i;
s = x264_encoder_headers(x4->enc, &nal, &nnal);
avctx->extradata = p = av_mallocz(s + AV_INPUT_BUFFER_PADDING_SIZE);
if (!p)
return AVERROR(ENOMEM);
for (i = 0; i < nnal; i++) {
/* Don't put the SEI in extradata. */
if (nal[i].i_type == NAL_SEI) {
av_log(avctx, AV_LOG_INFO, "%s\n", nal[i].p_payload+25);
x4->sei_size = nal[i].i_payload;
x4->sei = av_malloc(x4->sei_size);
if (!x4->sei)
return AVERROR(ENOMEM);
memcpy(x4->sei, nal[i].p_payload, nal[i].i_payload);
continue;
}
memcpy(p, nal[i].p_payload, nal[i].i_payload);
p += nal[i].i_payload;
}
avctx->extradata_size = p - avctx->extradata;
ret = set_extradata(avctx);
if (ret < 0)
return ret;
}
cpb_props = ff_encode_add_cpb_side_data(avctx);

View File

@ -30,13 +30,12 @@
#include "libavutil/avassert.h"
#include "libavutil/buffer.h"
#include "libavutil/internal.h"
#include "libavutil/common.h"
#include "libavutil/mastering_display_metadata.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "avcodec.h"
#include "codec_internal.h"
#include "encode.h"
#include "internal.h"
#include "packet_internal.h"
#include "atsc_a53.h"
#include "sei.h"
@ -176,6 +175,68 @@ static av_cold int libx265_param_parse_int(AVCodecContext *avctx,
return 0;
}
/* Serialize AVMasteringDisplayMetadata into x265's "master-display" string
 * option: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min), chromaticities in 1/50000
 * units and luminance in 1/10000 units.
 *
 * NOTE(review): display_primaries[] is read in index order 1, 2, 0 to
 * produce the G/B/R order the string requires -- this assumes the array is
 * stored R,G,B on the FFmpeg side; confirm against AVMasteringDisplayMetadata.
 *
 * Returns 0 on success, AVERROR(EINVAL) if x265 rejects the string. */
static int handle_mdcv(void *logctx, const x265_api *api,
                       x265_param *params,
                       const AVMasteringDisplayMetadata *mdcv)
{
    /* worst case: ten 64-bit decimal numbers plus the fixed punctuation */
    char buf[10 /* # of PRId64s */ * 20 /* max strlen for %PRId64 */ + sizeof("G(,)B(,)R(,)WP(,)L(,)")];

    // G(%hu,%hu)B(%hu,%hu)R(%hu,%hu)WP(%hu,%hu)L(%u,%u)
    snprintf(buf, sizeof(buf),
             "G(%"PRId64",%"PRId64")B(%"PRId64",%"PRId64")R(%"PRId64",%"PRId64")"
             "WP(%"PRId64",%"PRId64")L(%"PRId64",%"PRId64")",
             av_rescale_q(1, mdcv->display_primaries[1][0], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->display_primaries[1][1], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->display_primaries[2][0], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->display_primaries[2][1], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->display_primaries[0][0], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->display_primaries[0][1], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->white_point[0], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->white_point[1], (AVRational){ 1, 50000 }),
             av_rescale_q(1, mdcv->max_luminance, (AVRational){ 1, 10000 }),
             av_rescale_q(1, mdcv->min_luminance, (AVRational){ 1, 10000 }));

    if (api->param_parse(params, "master-display", buf) ==
        X265_PARAM_BAD_VALUE) {
        av_log(logctx, AV_LOG_ERROR,
               "Invalid value \"%s\" for param \"master-display\".\n",
               buf);
        return AVERROR(EINVAL);
    }

    return 0;
}
/* Copy stream-level HDR side data (content light level, mastering display)
 * from the codec context into the x265 parameter set.
 *
 * Returns 0 on success or a negative AVERROR code from handle_mdcv(). */
static int handle_side_data(AVCodecContext *avctx, const x265_api *api,
                            x265_param *params)
{
    const AVFrameSideData *sd;

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
    if (sd) {
        const AVContentLightMetadata *cll = (AVContentLightMetadata *)sd->data;

        params->maxCLL  = cll->MaxCLL;
        params->maxFALL = cll->MaxFALL;
    }

    sd = av_frame_side_data_get(avctx->decoded_side_data,
                                avctx->nb_decoded_side_data,
                                AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
    if (sd) {
        int ret = handle_mdcv(avctx, api, params,
                              (AVMasteringDisplayMetadata *)sd->data);
        if (ret < 0)
            return ret;
    }

    return 0;
}
static av_cold int libx265_encode_init(AVCodecContext *avctx)
{
libx265Context *ctx = avctx->priv_data;
@ -336,6 +397,13 @@ FF_ENABLE_DEPRECATION_WARNINGS
return AVERROR_BUG;
}
ret = handle_side_data(avctx, ctx->api, ctx->params);
if (ret < 0) {
av_log(avctx, AV_LOG_ERROR, "Failed handling side data! (%s)\n",
av_err2str(ret));
return ret;
}
if (ctx->crf >= 0) {
char crf[6];

View File

@ -60,31 +60,33 @@ struct JNIAMediaCodecListFields {
jfieldID level_id;
};
#define OFFSET(x) offsetof(struct JNIAMediaCodecListFields, x)
static const struct FFJniField jni_amediacodeclist_mapping[] = {
{ "android/media/MediaCodecList", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_list_class), 1 },
{ "android/media/MediaCodecList", "<init>", "(I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, init_id), 0 },
{ "android/media/MediaCodecList", "findDecoderForFormat", "(Landroid/media/MediaFormat;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, find_decoder_for_format_id), 0 },
{ "android/media/MediaCodecList", NULL, NULL, FF_JNI_CLASS, OFFSET(mediacodec_list_class), 1 },
{ "android/media/MediaCodecList", "<init>", "(I)V", FF_JNI_METHOD, OFFSET(init_id), 0 },
{ "android/media/MediaCodecList", "findDecoderForFormat", "(Landroid/media/MediaFormat;)Ljava/lang/String;", FF_JNI_METHOD, OFFSET(find_decoder_for_format_id), 0 },
{ "android/media/MediaCodecList", "getCodecCount", "()I", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_count_id), 1 },
{ "android/media/MediaCodecList", "getCodecInfoAt", "(I)Landroid/media/MediaCodecInfo;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_info_at_id), 1 },
{ "android/media/MediaCodecList", "getCodecCount", "()I", FF_JNI_STATIC_METHOD, OFFSET(get_codec_count_id), 1 },
{ "android/media/MediaCodecList", "getCodecInfoAt", "(I)Landroid/media/MediaCodecInfo;", FF_JNI_STATIC_METHOD, OFFSET(get_codec_info_at_id), 1 },
{ "android/media/MediaCodecInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, mediacodec_info_class), 1 },
{ "android/media/MediaCodecInfo", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_name_id), 1 },
{ "android/media/MediaCodecInfo", "getCapabilitiesForType", "(Ljava/lang/String;)Landroid/media/MediaCodecInfo$CodecCapabilities;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_codec_capabilities_id), 1 },
{ "android/media/MediaCodecInfo", "getSupportedTypes", "()[Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, get_supported_types_id), 1 },
{ "android/media/MediaCodecInfo", "isEncoder", "()Z", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, is_encoder_id), 1 },
{ "android/media/MediaCodecInfo", "isSoftwareOnly", "()Z", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecListFields, is_software_only_id), 0 },
{ "android/media/MediaCodecInfo", NULL, NULL, FF_JNI_CLASS, OFFSET(mediacodec_info_class), 1 },
{ "android/media/MediaCodecInfo", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, OFFSET(get_name_id), 1 },
{ "android/media/MediaCodecInfo", "getCapabilitiesForType", "(Ljava/lang/String;)Landroid/media/MediaCodecInfo$CodecCapabilities;", FF_JNI_METHOD, OFFSET(get_codec_capabilities_id), 1 },
{ "android/media/MediaCodecInfo", "getSupportedTypes", "()[Ljava/lang/String;", FF_JNI_METHOD, OFFSET(get_supported_types_id), 1 },
{ "android/media/MediaCodecInfo", "isEncoder", "()Z", FF_JNI_METHOD, OFFSET(is_encoder_id), 1 },
{ "android/media/MediaCodecInfo", "isSoftwareOnly", "()Z", FF_JNI_METHOD, OFFSET(is_software_only_id), 0 },
{ "android/media/MediaCodecInfo$CodecCapabilities", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_capabilities_class), 1 },
{ "android/media/MediaCodecInfo$CodecCapabilities", "colorFormats", "[I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, color_formats_id), 1 },
{ "android/media/MediaCodecInfo$CodecCapabilities", "profileLevels", "[Landroid/media/MediaCodecInfo$CodecProfileLevel;", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_levels_id), 1 },
{ "android/media/MediaCodecInfo$CodecCapabilities", NULL, NULL, FF_JNI_CLASS, OFFSET(codec_capabilities_class), 1 },
{ "android/media/MediaCodecInfo$CodecCapabilities", "colorFormats", "[I", FF_JNI_FIELD, OFFSET(color_formats_id), 1 },
{ "android/media/MediaCodecInfo$CodecCapabilities", "profileLevels", "[Landroid/media/MediaCodecInfo$CodecProfileLevel;", FF_JNI_FIELD, OFFSET(profile_levels_id), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecListFields, codec_profile_level_class), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", "profile", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, profile_id), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", "level", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecListFields, level_id), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", NULL, NULL, FF_JNI_CLASS, OFFSET(codec_profile_level_class), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", "profile", "I", FF_JNI_FIELD, OFFSET(profile_id), 1 },
{ "android/media/MediaCodecInfo$CodecProfileLevel", "level", "I", FF_JNI_FIELD, OFFSET(level_id), 1 },
{ NULL }
};
#undef OFFSET
struct JNIAMediaFormatFields {
@ -110,29 +112,31 @@ struct JNIAMediaFormatFields {
};
#define OFFSET(x) offsetof(struct JNIAMediaFormatFields, x)
static const struct FFJniField jni_amediaformat_mapping[] = {
{ "android/media/MediaFormat", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaFormatFields, mediaformat_class), 1 },
{ "android/media/MediaFormat", NULL, NULL, FF_JNI_CLASS, OFFSET(mediaformat_class), 1 },
{ "android/media/MediaFormat", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, init_id), 1 },
{ "android/media/MediaFormat", "<init>", "()V", FF_JNI_METHOD, OFFSET(init_id), 1 },
{ "android/media/MediaFormat", "containsKey", "(Ljava/lang/String;)Z", FF_JNI_METHOD,offsetof(struct JNIAMediaFormatFields, contains_key_id), 1 },
{ "android/media/MediaFormat", "containsKey", "(Ljava/lang/String;)Z", FF_JNI_METHOD, OFFSET(contains_key_id), 1 },
{ "android/media/MediaFormat", "getInteger", "(Ljava/lang/String;)I", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_integer_id), 1 },
{ "android/media/MediaFormat", "getLong", "(Ljava/lang/String;)J", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_long_id), 1 },
{ "android/media/MediaFormat", "getFloat", "(Ljava/lang/String;)F", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_float_id), 1 },
{ "android/media/MediaFormat", "getByteBuffer", "(Ljava/lang/String;)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_bytebuffer_id), 1 },
{ "android/media/MediaFormat", "getString", "(Ljava/lang/String;)Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, get_string_id), 1 },
{ "android/media/MediaFormat", "getInteger", "(Ljava/lang/String;)I", FF_JNI_METHOD, OFFSET(get_integer_id), 1 },
{ "android/media/MediaFormat", "getLong", "(Ljava/lang/String;)J", FF_JNI_METHOD, OFFSET(get_long_id), 1 },
{ "android/media/MediaFormat", "getFloat", "(Ljava/lang/String;)F", FF_JNI_METHOD, OFFSET(get_float_id), 1 },
{ "android/media/MediaFormat", "getByteBuffer", "(Ljava/lang/String;)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, OFFSET(get_bytebuffer_id), 1 },
{ "android/media/MediaFormat", "getString", "(Ljava/lang/String;)Ljava/lang/String;", FF_JNI_METHOD, OFFSET(get_string_id), 1 },
{ "android/media/MediaFormat", "setInteger", "(Ljava/lang/String;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_integer_id), 1 },
{ "android/media/MediaFormat", "setLong", "(Ljava/lang/String;J)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_long_id), 1 },
{ "android/media/MediaFormat", "setFloat", "(Ljava/lang/String;F)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_float_id), 1 },
{ "android/media/MediaFormat", "setByteBuffer", "(Ljava/lang/String;Ljava/nio/ByteBuffer;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_bytebuffer_id), 1 },
{ "android/media/MediaFormat", "setString", "(Ljava/lang/String;Ljava/lang/String;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, set_string_id), 1 },
{ "android/media/MediaFormat", "setInteger", "(Ljava/lang/String;I)V", FF_JNI_METHOD, OFFSET(set_integer_id), 1 },
{ "android/media/MediaFormat", "setLong", "(Ljava/lang/String;J)V", FF_JNI_METHOD, OFFSET(set_long_id), 1 },
{ "android/media/MediaFormat", "setFloat", "(Ljava/lang/String;F)V", FF_JNI_METHOD, OFFSET(set_float_id), 1 },
{ "android/media/MediaFormat", "setByteBuffer", "(Ljava/lang/String;Ljava/nio/ByteBuffer;)V", FF_JNI_METHOD, OFFSET(set_bytebuffer_id), 1 },
{ "android/media/MediaFormat", "setString", "(Ljava/lang/String;Ljava/lang/String;)V", FF_JNI_METHOD, OFFSET(set_string_id), 1 },
{ "android/media/MediaFormat", "toString", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaFormatFields, to_string_id), 1 },
{ "android/media/MediaFormat", "toString", "()Ljava/lang/String;", FF_JNI_METHOD, OFFSET(to_string_id), 1 },
{ NULL }
};
#undef OFFSET
static const AVClass amediaformat_class = {
.class_name = "amediaformat",
@ -202,57 +206,59 @@ struct JNIAMediaCodecFields {
};
#define OFFSET(x) offsetof(struct JNIAMediaCodecFields, x)
static const struct FFJniField jni_amediacodec_mapping[] = {
{ "android/media/MediaCodec", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediacodec_class), 1 },
{ "android/media/MediaCodec", NULL, NULL, FF_JNI_CLASS, OFFSET(mediacodec_class), 1 },
{ "android/media/MediaCodec", "INFO_TRY_AGAIN_LATER", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_try_again_later_id), 1 },
{ "android/media/MediaCodec", "INFO_OUTPUT_BUFFERS_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_buffers_changed_id), 1 },
{ "android/media/MediaCodec", "INFO_OUTPUT_FORMAT_CHANGED", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, info_output_format_changed_id), 1 },
{ "android/media/MediaCodec", "INFO_TRY_AGAIN_LATER", "I", FF_JNI_STATIC_FIELD, OFFSET(info_try_again_later_id), 1 },
{ "android/media/MediaCodec", "INFO_OUTPUT_BUFFERS_CHANGED", "I", FF_JNI_STATIC_FIELD, OFFSET(info_output_buffers_changed_id), 1 },
{ "android/media/MediaCodec", "INFO_OUTPUT_FORMAT_CHANGED", "I", FF_JNI_STATIC_FIELD, OFFSET(info_output_format_changed_id), 1 },
{ "android/media/MediaCodec", "BUFFER_FLAG_CODEC_CONFIG", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_codec_config_id), 1 },
{ "android/media/MediaCodec", "BUFFER_FLAG_END_OF_STREAM", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_end_of_stream_id), 1 },
{ "android/media/MediaCodec", "BUFFER_FLAG_KEY_FRAME", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, buffer_flag_key_frame_id), 0 },
{ "android/media/MediaCodec", "BUFFER_FLAG_CODEC_CONFIG", "I", FF_JNI_STATIC_FIELD, OFFSET(buffer_flag_codec_config_id), 1 },
{ "android/media/MediaCodec", "BUFFER_FLAG_END_OF_STREAM", "I", FF_JNI_STATIC_FIELD, OFFSET(buffer_flag_end_of_stream_id), 1 },
{ "android/media/MediaCodec", "BUFFER_FLAG_KEY_FRAME", "I", FF_JNI_STATIC_FIELD, OFFSET(buffer_flag_key_frame_id), 0 },
{ "android/media/MediaCodec", "CONFIGURE_FLAG_ENCODE", "I", FF_JNI_STATIC_FIELD, offsetof(struct JNIAMediaCodecFields, configure_flag_encode_id), 1 },
{ "android/media/MediaCodec", "CONFIGURE_FLAG_ENCODE", "I", FF_JNI_STATIC_FIELD, OFFSET(configure_flag_encode_id), 1 },
{ "android/media/MediaCodec", "createByCodecName", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_by_codec_name_id), 1 },
{ "android/media/MediaCodec", "createDecoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_decoder_by_type_id), 1 },
{ "android/media/MediaCodec", "createEncoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, offsetof(struct JNIAMediaCodecFields, create_encoder_by_type_id), 1 },
{ "android/media/MediaCodec", "createByCodecName", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, OFFSET(create_by_codec_name_id), 1 },
{ "android/media/MediaCodec", "createDecoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, OFFSET(create_decoder_by_type_id), 1 },
{ "android/media/MediaCodec", "createEncoderByType", "(Ljava/lang/String;)Landroid/media/MediaCodec;", FF_JNI_STATIC_METHOD, OFFSET(create_encoder_by_type_id), 1 },
{ "android/media/MediaCodec", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_name_id), 1 },
{ "android/media/MediaCodec", "getName", "()Ljava/lang/String;", FF_JNI_METHOD, OFFSET(get_name_id), 1 },
{ "android/media/MediaCodec", "configure", "(Landroid/media/MediaFormat;Landroid/view/Surface;Landroid/media/MediaCrypto;I)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, configure_id), 1 },
{ "android/media/MediaCodec", "start", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, start_id), 1 },
{ "android/media/MediaCodec", "flush", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, flush_id), 1 },
{ "android/media/MediaCodec", "stop", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, stop_id), 1 },
{ "android/media/MediaCodec", "release", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_id), 1 },
{ "android/media/MediaCodec", "configure", "(Landroid/media/MediaFormat;Landroid/view/Surface;Landroid/media/MediaCrypto;I)V", FF_JNI_METHOD, OFFSET(configure_id), 1 },
{ "android/media/MediaCodec", "start", "()V", FF_JNI_METHOD, OFFSET(start_id), 1 },
{ "android/media/MediaCodec", "flush", "()V", FF_JNI_METHOD, OFFSET(flush_id), 1 },
{ "android/media/MediaCodec", "stop", "()V", FF_JNI_METHOD, OFFSET(stop_id), 1 },
{ "android/media/MediaCodec", "release", "()V", FF_JNI_METHOD, OFFSET(release_id), 1 },
{ "android/media/MediaCodec", "getOutputFormat", "()Landroid/media/MediaFormat;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_format_id), 1 },
{ "android/media/MediaCodec", "getOutputFormat", "()Landroid/media/MediaFormat;", FF_JNI_METHOD, OFFSET(get_output_format_id), 1 },
{ "android/media/MediaCodec", "dequeueInputBuffer", "(J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_input_buffer_id), 1 },
{ "android/media/MediaCodec", "queueInputBuffer", "(IIIJI)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, queue_input_buffer_id), 1 },
{ "android/media/MediaCodec", "getInputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffer_id), 0 },
{ "android/media/MediaCodec", "getInputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_input_buffers_id), 1 },
{ "android/media/MediaCodec", "dequeueInputBuffer", "(J)I", FF_JNI_METHOD, OFFSET(dequeue_input_buffer_id), 1 },
{ "android/media/MediaCodec", "queueInputBuffer", "(IIIJI)V", FF_JNI_METHOD, OFFSET(queue_input_buffer_id), 1 },
{ "android/media/MediaCodec", "getInputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, OFFSET(get_input_buffer_id), 0 },
{ "android/media/MediaCodec", "getInputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, OFFSET(get_input_buffers_id), 1 },
{ "android/media/MediaCodec", "dequeueOutputBuffer", "(Landroid/media/MediaCodec$BufferInfo;J)I", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, dequeue_output_buffer_id), 1 },
{ "android/media/MediaCodec", "getOutputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffer_id), 0 },
{ "android/media/MediaCodec", "getOutputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, get_output_buffers_id), 1 },
{ "android/media/MediaCodec", "releaseOutputBuffer", "(IZ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_id), 1 },
{ "android/media/MediaCodec", "releaseOutputBuffer", "(IJ)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, release_output_buffer_at_time_id), 0 },
{ "android/media/MediaCodec", "dequeueOutputBuffer", "(Landroid/media/MediaCodec$BufferInfo;J)I", FF_JNI_METHOD, OFFSET(dequeue_output_buffer_id), 1 },
{ "android/media/MediaCodec", "getOutputBuffer", "(I)Ljava/nio/ByteBuffer;", FF_JNI_METHOD, OFFSET(get_output_buffer_id), 0 },
{ "android/media/MediaCodec", "getOutputBuffers", "()[Ljava/nio/ByteBuffer;", FF_JNI_METHOD, OFFSET(get_output_buffers_id), 1 },
{ "android/media/MediaCodec", "releaseOutputBuffer", "(IZ)V", FF_JNI_METHOD, OFFSET(release_output_buffer_id), 1 },
{ "android/media/MediaCodec", "releaseOutputBuffer", "(IJ)V", FF_JNI_METHOD, OFFSET(release_output_buffer_at_time_id), 0 },
{ "android/media/MediaCodec", "setInputSurface", "(Landroid/view/Surface;)V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, set_input_surface_id), 0 },
{ "android/media/MediaCodec", "signalEndOfInputStream", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, signal_end_of_input_stream_id), 0 },
{ "android/media/MediaCodec", "setInputSurface", "(Landroid/view/Surface;)V", FF_JNI_METHOD, OFFSET(set_input_surface_id), 0 },
{ "android/media/MediaCodec", "signalEndOfInputStream", "()V", FF_JNI_METHOD, OFFSET(signal_end_of_input_stream_id), 0 },
{ "android/media/MediaCodec$BufferInfo", NULL, NULL, FF_JNI_CLASS, offsetof(struct JNIAMediaCodecFields, mediainfo_class), 1 },
{ "android/media/MediaCodec$BufferInfo", NULL, NULL, FF_JNI_CLASS, OFFSET(mediainfo_class), 1 },
{ "android/media/MediaCodec.BufferInfo", "<init>", "()V", FF_JNI_METHOD, offsetof(struct JNIAMediaCodecFields, init_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "flags", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, flags_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "offset", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, offset_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "presentationTimeUs", "J", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, presentation_time_us_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "size", "I", FF_JNI_FIELD, offsetof(struct JNIAMediaCodecFields, size_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "<init>", "()V", FF_JNI_METHOD, OFFSET(init_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "flags", "I", FF_JNI_FIELD, OFFSET(flags_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "offset", "I", FF_JNI_FIELD, OFFSET(offset_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "presentationTimeUs", "J", FF_JNI_FIELD, OFFSET(presentation_time_us_id), 1 },
{ "android/media/MediaCodec.BufferInfo", "size", "I", FF_JNI_FIELD, OFFSET(size_id), 1 },
{ NULL }
};
#undef OFFSET
static const AVClass amediacodec_class = {
.class_name = "amediacodec",
@ -543,10 +549,8 @@ char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int e
goto done;
}
if (codec_name) {
(*env)->DeleteLocalRef(env, codec_name);
codec_name = NULL;
}
(*env)->DeleteLocalRef(env, codec_name);
codec_name = NULL;
/* Skip software decoders */
if (
@ -610,10 +614,8 @@ char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int e
found_codec = profile == supported_profile;
if (profile_level) {
(*env)->DeleteLocalRef(env, profile_level);
profile_level = NULL;
}
(*env)->DeleteLocalRef(env, profile_level);
profile_level = NULL;
if (found_codec) {
break;
@ -621,20 +623,14 @@ char *ff_AMediaCodecList_getCodecNameByType(const char *mime, int profile, int e
}
done_with_type:
if (profile_levels) {
(*env)->DeleteLocalRef(env, profile_levels);
profile_levels = NULL;
}
(*env)->DeleteLocalRef(env, profile_levels);
profile_levels = NULL;
if (capabilities) {
(*env)->DeleteLocalRef(env, capabilities);
capabilities = NULL;
}
(*env)->DeleteLocalRef(env, capabilities);
capabilities = NULL;
if (type) {
(*env)->DeleteLocalRef(env, type);
type = NULL;
}
(*env)->DeleteLocalRef(env, type);
type = NULL;
av_freep(&supported_type);
@ -644,15 +640,11 @@ done_with_type:
}
done_with_info:
if (info) {
(*env)->DeleteLocalRef(env, info);
info = NULL;
}
(*env)->DeleteLocalRef(env, info);
info = NULL;
if (types) {
(*env)->DeleteLocalRef(env, types);
types = NULL;
}
(*env)->DeleteLocalRef(env, types);
types = NULL;
if (found_codec) {
break;
@ -662,33 +654,13 @@ done_with_info:
}
done:
if (codec_name) {
(*env)->DeleteLocalRef(env, codec_name);
}
if (info) {
(*env)->DeleteLocalRef(env, info);
}
if (type) {
(*env)->DeleteLocalRef(env, type);
}
if (types) {
(*env)->DeleteLocalRef(env, types);
}
if (capabilities) {
(*env)->DeleteLocalRef(env, capabilities);
}
if (profile_level) {
(*env)->DeleteLocalRef(env, profile_level);
}
if (profile_levels) {
(*env)->DeleteLocalRef(env, profile_levels);
}
(*env)->DeleteLocalRef(env, codec_name);
(*env)->DeleteLocalRef(env, info);
(*env)->DeleteLocalRef(env, type);
(*env)->DeleteLocalRef(env, types);
(*env)->DeleteLocalRef(env, capabilities);
(*env)->DeleteLocalRef(env, profile_level);
(*env)->DeleteLocalRef(env, profile_levels);
av_freep(&supported_type);
@ -735,9 +707,7 @@ static FFAMediaFormat *mediaformat_jni_new(void)
}
fail:
if (object) {
(*env)->DeleteLocalRef(env, object);
}
(*env)->DeleteLocalRef(env, object);
if (!format->object) {
ff_jni_reset_jfields(env, &format->jfields, jni_amediaformat_mapping, 1, format);
@ -822,9 +792,7 @@ static char* mediaformat_jni_toString(FFAMediaFormat* ctx)
ret = ff_jni_jstring_to_utf_chars(env, description, format);
fail:
if (description) {
(*env)->DeleteLocalRef(env, description);
}
(*env)->DeleteLocalRef(env, description);
return ret;
}
@ -861,9 +829,7 @@ static int mediaformat_jni_getInt32(FFAMediaFormat* ctx, const char *name, int32
ret = 1;
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
return ret;
}
@ -900,9 +866,7 @@ static int mediaformat_jni_getInt64(FFAMediaFormat* ctx, const char *name, int64
ret = 1;
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
return ret;
}
@ -939,9 +903,7 @@ static int mediaformat_jni_getFloat(FFAMediaFormat* ctx, const char *name, float
ret = 1;
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
return ret;
}
@ -993,13 +955,8 @@ static int mediaformat_jni_getBuffer(FFAMediaFormat* ctx, const char *name, void
ret = 1;
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
if (result) {
(*env)->DeleteLocalRef(env, result);
}
(*env)->DeleteLocalRef(env, key);
(*env)->DeleteLocalRef(env, result);
return ret;
}
@ -1043,13 +1000,8 @@ static int mediaformat_jni_getString(FFAMediaFormat* ctx, const char *name, cons
ret = 1;
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
if (result) {
(*env)->DeleteLocalRef(env, result);
}
(*env)->DeleteLocalRef(env, key);
(*env)->DeleteLocalRef(env, result);
return ret;
}
@ -1075,9 +1027,7 @@ static void mediaformat_jni_setInt32(FFAMediaFormat* ctx, const char* name, int3
}
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
}
static void mediaformat_jni_setInt64(FFAMediaFormat* ctx, const char* name, int64_t value)
@ -1101,9 +1051,7 @@ static void mediaformat_jni_setInt64(FFAMediaFormat* ctx, const char* name, int6
}
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
}
static void mediaformat_jni_setFloat(FFAMediaFormat* ctx, const char* name, float value)
@ -1127,9 +1075,7 @@ static void mediaformat_jni_setFloat(FFAMediaFormat* ctx, const char* name, floa
}
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
(*env)->DeleteLocalRef(env, key);
}
static void mediaformat_jni_setString(FFAMediaFormat* ctx, const char* name, const char* value)
@ -1159,13 +1105,8 @@ static void mediaformat_jni_setString(FFAMediaFormat* ctx, const char* name, con
}
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
if (string) {
(*env)->DeleteLocalRef(env, string);
}
(*env)->DeleteLocalRef(env, key);
(*env)->DeleteLocalRef(env, string);
}
static void mediaformat_jni_setBuffer(FFAMediaFormat* ctx, const char* name, void* data, size_t size)
@ -1207,13 +1148,8 @@ static void mediaformat_jni_setBuffer(FFAMediaFormat* ctx, const char* name, voi
}
fail:
if (key) {
(*env)->DeleteLocalRef(env, key);
}
if (buffer) {
(*env)->DeleteLocalRef(env, buffer);
}
(*env)->DeleteLocalRef(env, key);
(*env)->DeleteLocalRef(env, buffer);
}
static int codec_init_static_fields(FFAMediaCodecJni *codec)
@ -1346,26 +1282,13 @@ static inline FFAMediaCodec *codec_create(int method, const char *arg)
ret = 0;
fail:
if (jarg) {
(*env)->DeleteLocalRef(env, jarg);
}
if (object) {
(*env)->DeleteLocalRef(env, object);
}
if (buffer_info) {
(*env)->DeleteLocalRef(env, buffer_info);
}
(*env)->DeleteLocalRef(env, jarg);
(*env)->DeleteLocalRef(env, object);
(*env)->DeleteLocalRef(env, buffer_info);
if (ret < 0) {
if (codec->object) {
(*env)->DeleteGlobalRef(env, codec->object);
}
if (codec->buffer_info) {
(*env)->DeleteGlobalRef(env, codec->buffer_info);
}
(*env)->DeleteGlobalRef(env, codec->object);
(*env)->DeleteGlobalRef(env, codec->buffer_info);
ff_jni_reset_jfields(env, &codec->jfields, jni_amediacodec_mapping, 1, codec);
av_freep(&codec);
@ -1686,13 +1609,8 @@ static uint8_t* mediacodec_jni_getInputBuffer(FFAMediaCodec* ctx, size_t idx, si
ret = (*env)->GetDirectBufferAddress(env, buffer);
*out_size = (*env)->GetDirectBufferCapacity(env, buffer);
fail:
if (buffer) {
(*env)->DeleteLocalRef(env, buffer);
}
if (input_buffers) {
(*env)->DeleteLocalRef(env, input_buffers);
}
(*env)->DeleteLocalRef(env, buffer);
(*env)->DeleteLocalRef(env, input_buffers);
return ret;
}
@ -1734,13 +1652,8 @@ static uint8_t* mediacodec_jni_getOutputBuffer(FFAMediaCodec* ctx, size_t idx, s
ret = (*env)->GetDirectBufferAddress(env, buffer);
*out_size = (*env)->GetDirectBufferCapacity(env, buffer);
fail:
if (buffer) {
(*env)->DeleteLocalRef(env, buffer);
}
if (output_buffers) {
(*env)->DeleteLocalRef(env, output_buffers);
}
(*env)->DeleteLocalRef(env, buffer);
(*env)->DeleteLocalRef(env, output_buffers);
return ret;
}
@ -1762,9 +1675,7 @@ static FFAMediaFormat* mediacodec_jni_getOutputFormat(FFAMediaCodec* ctx)
ret = mediaformat_jni_newFromObject(mediaformat);
fail:
if (mediaformat) {
(*env)->DeleteLocalRef(env, mediaformat);
}
(*env)->DeleteLocalRef(env, mediaformat);
return ret;
}

View File

@ -62,6 +62,13 @@
#define A53_MAX_CC_COUNT 2000
enum Mpeg2ClosedCaptionsFormat {
CC_FORMAT_AUTO,
CC_FORMAT_A53_PART4,
CC_FORMAT_SCTE20,
CC_FORMAT_DVD
};
typedef struct Mpeg1Context {
MpegEncContext mpeg_enc_ctx;
int mpeg_enc_ctx_allocated; /* true if decoding context allocated */
@ -70,12 +77,14 @@ typedef struct Mpeg1Context {
AVStereo3D stereo3d;
int has_stereo3d;
AVBufferRef *a53_buf_ref;
enum Mpeg2ClosedCaptionsFormat cc_format;
uint8_t afd;
int has_afd;
int slice_count;
unsigned aspect_ratio_info;
AVRational save_aspect;
int save_width, save_height, save_progressive_seq;
enum AVCodecID save_codec_id;
AVRational frame_rate_ext; /* MPEG-2 specific framerate modificator */
unsigned frame_rate_index;
int sync; /* Did we reach a sync point like a GOP/SEQ/KEYFrame? */
@ -787,9 +796,6 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
Mpeg1Context *s = avctx->priv_data;
MpegEncContext *s2 = &s->mpeg_enc_ctx;
if ( avctx->codec_tag != AV_RL32("VCR2")
&& avctx->codec_tag != AV_RL32("BW10"))
avctx->coded_width = avctx->coded_height = 0; // do not trust dimensions from input
ff_mpv_decode_init(s2, avctx);
ff_mpeg12_init_vlcs();
@ -960,6 +966,7 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
s1->save_height != s->height ||
av_cmp_q(s1->save_aspect, s->avctx->sample_aspect_ratio) ||
(s1->save_progressive_seq != s->progressive_sequence && FFALIGN(s->height, 16) != FFALIGN(s->height, 32)) ||
s1->save_codec_id != s->codec_id ||
0) {
if (s1->mpeg_enc_ctx_allocated) {
ff_mpv_common_end(s);
@ -981,6 +988,7 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
s1->save_width = s->width;
s1->save_height = s->height;
s1->save_progressive_seq = s->progressive_sequence;
s1->save_codec_id = s->codec_id;
/* low_delay may be forced, in this case we will have B-frames
* that behave like P-frames. */
@ -1013,7 +1021,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
case 1: avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; break;
case 2:
case 3: avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; break;
default: av_assert0(0);
}
} // MPEG-2
@ -1083,6 +1090,7 @@ static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
skip_bits(&s->gb, 1); /* profile and level esc*/
s->avctx->profile = get_bits(&s->gb, 3);
s->avctx->level = get_bits(&s->gb, 4);
s->avctx->progressive_sequence =
s->progressive_sequence = get_bits1(&s->gb); /* progressive_sequence */
s->chroma_format = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */
@ -1833,6 +1841,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
s->height = height;
/* We set MPEG-2 parameters so that it emulates MPEG-1. */
s->avctx->progressive_sequence =
s->progressive_sequence = 1;
s->progressive_frame = 1;
s->picture_structure = PICT_FRAME;
@ -1886,6 +1895,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
s->chroma_inter_matrix[j] = v;
}
s->avctx->progressive_sequence =
s->progressive_sequence = 1;
s->progressive_frame = 1;
s->picture_structure = PICT_FRAME;
@ -1900,15 +1910,31 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
s1->save_width = s->width;
s1->save_height = s->height;
s1->save_progressive_seq = s->progressive_sequence;
s1->save_codec_id = s->codec_id;
return 0;
}
static void mpeg_set_cc_format(AVCodecContext *avctx, enum Mpeg2ClosedCaptionsFormat format,
const char *label)
{
Mpeg1Context *s1 = avctx->priv_data;
av_assert2(format != CC_FORMAT_AUTO);
if (!s1->cc_format) {
s1->cc_format = format;
av_log(avctx, AV_LOG_DEBUG, "CC: first seen substream is %s format\n", label);
}
}
static int mpeg_decode_a53_cc(AVCodecContext *avctx,
const uint8_t *p, int buf_size)
{
Mpeg1Context *s1 = avctx->priv_data;
if (buf_size >= 6 &&
if ((!s1->cc_format || s1->cc_format == CC_FORMAT_A53_PART4) &&
buf_size >= 6 &&
p[0] == 'G' && p[1] == 'A' && p[2] == '9' && p[3] == '4' &&
p[4] == 3 && (p[5] & 0x40)) {
/* extract A53 Part 4 CC data */
@ -1927,9 +1953,11 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
memcpy(s1->a53_buf_ref->data + old_size, p + 7, cc_count * UINT64_C(3));
avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
mpeg_set_cc_format(avctx, CC_FORMAT_A53_PART4, "A/53 Part 4");
}
return 1;
} else if (buf_size >= 2 &&
} else if ((!s1->cc_format || s1->cc_format == CC_FORMAT_SCTE20) &&
buf_size >= 2 &&
p[0] == 0x03 && (p[1]&0x7f) == 0x01) {
/* extract SCTE-20 CC data */
GetBitContext gb;
@ -1973,10 +2001,13 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
cap += 3;
}
}
avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
mpeg_set_cc_format(avctx, CC_FORMAT_SCTE20, "SCTE-20");
}
return 1;
} else if (buf_size >= 11 &&
} else if ((!s1->cc_format || s1->cc_format == CC_FORMAT_DVD) &&
buf_size >= 11 &&
p[0] == 'C' && p[1] == 'C' && p[2] == 0x01 && p[3] == 0xf8) {
/* extract DVD CC data
*
@ -2033,7 +2064,9 @@ static int mpeg_decode_a53_cc(AVCodecContext *avctx,
p += 6;
}
}
avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
mpeg_set_cc_format(avctx, CC_FORMAT_DVD, "DVD");
}
return 1;
}
@ -2598,11 +2631,39 @@ const FFCodec ff_mpeg1video_decoder = {
},
};
#define M2V_OFFSET(x) offsetof(Mpeg1Context, x)
#define M2V_PARAM AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
static const AVOption mpeg2video_options[] = {
{ "cc_format", "extract a specific Closed Captions format",
M2V_OFFSET(cc_format), AV_OPT_TYPE_INT, { .i64 = CC_FORMAT_AUTO },
CC_FORMAT_AUTO, CC_FORMAT_DVD, M2V_PARAM, .unit = "cc_format" },
{ "auto", "pick first seen CC substream", 0, AV_OPT_TYPE_CONST,
{ .i64 = CC_FORMAT_AUTO }, .flags = M2V_PARAM, .unit = "cc_format" },
{ "a53", "pick A/53 Part 4 CC substream", 0, AV_OPT_TYPE_CONST,
{ .i64 = CC_FORMAT_A53_PART4 }, .flags = M2V_PARAM, .unit = "cc_format" },
{ "scte20", "pick SCTE-20 CC substream", 0, AV_OPT_TYPE_CONST,
{ .i64 = CC_FORMAT_SCTE20 }, .flags = M2V_PARAM, .unit = "cc_format" },
{ "dvd", "pick DVD CC substream", 0, AV_OPT_TYPE_CONST,
{ .i64 = CC_FORMAT_DVD }, .flags = M2V_PARAM, .unit = "cc_format" },
{ NULL }
};
static const AVClass mpeg2video_class = {
.class_name = "MPEG-2 video",
.item_name = av_default_item_name,
.option = mpeg2video_options,
.version = LIBAVUTIL_VERSION_INT,
.category = AV_CLASS_CATEGORY_DECODER,
};
const FFCodec ff_mpeg2video_decoder = {
.p.name = "mpeg2video",
CODEC_LONG_NAME("MPEG-2 video"),
.p.type = AVMEDIA_TYPE_VIDEO,
.p.id = AV_CODEC_ID_MPEG2VIDEO,
.p.priv_class = &mpeg2video_class,
.priv_data_size = sizeof(Mpeg1Context),
.init = mpeg_decode_init,
.close = mpeg_decode_end,

View File

@ -39,6 +39,7 @@ typedef struct MPEG4AudioConfig {
int channels;
int ps; ///< -1 implicit, 1 presence
int frame_length_short;
int pce;
} MPEG4AudioConfig;
extern const int ff_mpeg4audio_sample_rates[16];

View File

@ -344,8 +344,10 @@ int ff_mpv_frame_start(MpegEncContext *s, AVCodecContext *avctx)
pic->reference = 3;
}
if (alloc_picture(s, pic) < 0)
if (alloc_picture(s, pic) < 0) {
s->current_picture_ptr = NULL;
return -1;
}
s->current_picture_ptr = pic;
// FIXME use only the vars from current_pic

View File

@ -75,9 +75,15 @@ static int mpeg1_find_frame_end(ParseContext *pc, const uint8_t *buf,
pc->frame_start_found = 4;
}
if (state == SEQ_END_CODE) {
int idx = i + 1;
/* DVDs won't send the next frame start on still images */
/* SEQ_END_CODE will have to stay at the beginning of the next frame */
if (pc->frame_start_found && i != 3) {
idx = i - 3;
}
pc->frame_start_found = 0;
pc->state = -1;
return i + 1;
return idx;
}
if (pc->frame_start_found == 2 && state == SEQ_START_CODE)
pc->frame_start_found = 0;

View File

@ -176,6 +176,8 @@ void avcodec_free_context(AVCodecContext **pavctx)
av_freep(&avctx->inter_matrix);
av_freep(&avctx->rc_override);
av_channel_layout_uninit(&avctx->ch_layout);
av_frame_side_data_free(
&avctx->decoded_side_data, &avctx->nb_decoded_side_data);
av_freep(pavctx);
}

View File

@ -68,6 +68,9 @@ enum AVPacketSideDataType {
* if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS)
* s32le width
* s32le height
* if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_ASPECTRATIO)
* s32le num
* s32le den
* @endcode
*/
AV_PKT_DATA_PARAM_CHANGE,
@ -596,8 +599,11 @@ typedef struct AVPacketList {
#define AV_PKT_FLAG_DISPOSABLE 0x0010
enum AVSideDataParamChangeFlags {
AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT = 0x0001,
AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT = 0x0002,
AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE = 0x0004,
AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS = 0x0008,
AV_SIDE_DATA_PARAM_CHANGE_ASPECTRATIO = 0x8000,
};
/**

View File

@ -51,6 +51,7 @@ extern const AVCodecParser ff_gsm_parser;
extern const AVCodecParser ff_h261_parser;
extern const AVCodecParser ff_h263_parser;
extern const AVCodecParser ff_h264_parser;
extern const AVCodecParser ff_h264_mvc_parser;
extern const AVCodecParser ff_hevc_parser;
extern const AVCodecParser ff_hdr_parser;
extern const AVCodecParser ff_ipu_parser;

View File

@ -40,6 +40,8 @@ const AVProfile ff_dca_profiles[] = {
{ AV_PROFILE_DTS_ES, "DTS-ES" },
{ AV_PROFILE_DTS_96_24, "DTS 96/24" },
{ AV_PROFILE_DTS_HD_HRA, "DTS-HD HRA" },
{ AV_PROFILE_DTS_HD_HRA_X, "DTS-HD HRA + DTS:X" },
{ AV_PROFILE_DTS_HD_HRA_X_IMAX, "DTS-HD HRA + DTS:X IMAX"},
{ AV_PROFILE_DTS_HD_MA, "DTS-HD MA" },
{ AV_PROFILE_DTS_HD_MA_X, "DTS-HD MA + DTS:X" },
{ AV_PROFILE_DTS_HD_MA_X_IMAX, "DTS-HD MA + DTS:X IMAX" },
@ -83,6 +85,7 @@ const AVProfile ff_h264_profiles[] = {
{ AV_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" },
{ AV_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" },
{ AV_PROFILE_H264_STEREO_HIGH, "Stereo High" },
{ FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" },
{ AV_PROFILE_UNKNOWN },
};

View File

@ -178,6 +178,27 @@ static av_cold int decode_init(AVCodecContext *avctx)
return ret;
}
switch (avctx->codec_tag) {
case MKTAG('a', 'p', 'c', 'h'):
avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
break;
case MKTAG('a', 'p', 'c', 'n'):
avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
break;
case MKTAG('a', 'p', 'c', 's'):
avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
break;
case MKTAG('a', 'p', 'c', 'o'):
avctx->pix_fmt = AV_PIX_FMT_YUV422P10;
break;
case MKTAG('a', 'p', '4', 'h'):
avctx->pix_fmt = AV_PIX_FMT_YUV444P10;
break;
default:
av_log(avctx, AV_LOG_WARNING, "Unknown ProRes FOURCC provided (%08X)\n",
avctx->codec_tag);
}
ff_init_scantable_permutation(idct_permutation,
ctx->prodsp.idct_permutation_type);

View File

@ -315,6 +315,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (err < 0)
return err;
dst->progressive_sequence = src->progressive_sequence;
if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx ||
(dst->hw_frames_ctx && dst->hw_frames_ctx->data != src->hw_frames_ctx->data)) {
av_buffer_unref(&dst->hw_frames_ctx);

View File

@ -23,7 +23,7 @@
/* H.264 slice threading seems to be buggy with more than 16 threads,
* limit the number of threads to 16 for automatic detection */
#define MAX_AUTO_THREADS 16
#define MAX_AUTO_THREADS 1
int ff_slice_thread_init(AVCodecContext *avctx);
void ff_slice_thread_free(AVCodecContext *avctx);

View File

@ -203,6 +203,7 @@ static int s302m_decode_frame(AVCodecContext *avctx, AVFrame *frame,
}
avctx->sample_rate = 48000;
avctx->codec_tag = non_pcm_data_type;
*got_frame_ptr = 1;
@ -211,7 +212,7 @@ static int s302m_decode_frame(AVCodecContext *avctx, AVFrame *frame,
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_DECODING_PARAM
static const AVOption s302m_options[] = {
{"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 3}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
{"non_pcm_mode", "Chooses what to do with NON-PCM", offsetof(S302Context, non_pcm_mode), AV_OPT_TYPE_INT, {.i64 = 2}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
{"copy" , "Pass NON-PCM through unchanged" , 0, AV_OPT_TYPE_CONST, {.i64 = 0}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
{"drop" , "Drop NON-PCM" , 0, AV_OPT_TYPE_CONST, {.i64 = 1}, 0, 3, FLAGS, .unit = "non_pcm_mode"},
{"decode_copy" , "Decode if possible else passthrough", 0, AV_OPT_TYPE_CONST, {.i64 = 2}, 0, 3, FLAGS, .unit = "non_pcm_mode"},

View File

@ -27,7 +27,7 @@
#include <inttypes.h>
#include <stdlib.h>
#define CACHED_BITSTREAM_READER !ARCH_X86_32
#define CACHED_BITSTREAM_READER 0 /* cached reader is broken with get_bits_le used below */
#define UNCHECKED_BITSTREAM_READER 1
#include "libavutil/intreadwrite.h"

View File

@ -396,6 +396,8 @@ typedef struct VC1Context{
int parse_only; ///< Context is used within parser
int resync_marker; ///< could this stream contain resync markers
int recovered;
} VC1Context;
/**

View File

@ -1045,6 +1045,13 @@ static int vc1_decode_frame(AVCodecContext *avctx, AVFrame *pict,
goto err;
}
if (!v->recovered && !(avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL)) {
if (s->pict_type == AV_PICTURE_TYPE_I)
v->recovered = 1;
else
goto err;
}
/* skip B-frames if we don't have reference frames */
if (!s->last_picture_ptr && s->pict_type == AV_PICTURE_TYPE_B) {
av_log(v->s.avctx, AV_LOG_DEBUG, "Skipping B frame without reference frames\n");
@ -1381,6 +1388,14 @@ err:
return ret;
}
static void vc1_decode_flush(AVCodecContext *avctx)
{
VC1Context *v = avctx->priv_data;
ff_mpeg_flush(avctx);
v->recovered = 0;
}
const FFCodec ff_vc1_decoder = {
.p.name = "vc1",
@ -1391,7 +1406,7 @@ const FFCodec ff_vc1_decoder = {
.init = vc1_decode_init,
.close = ff_vc1_decode_end,
FF_CODEC_DECODE_CB(vc1_decode_frame),
.flush = ff_mpeg_flush,
.flush = vc1_decode_flush,
.p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
.hw_configs = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VC1_DXVA2_HWACCEL
@ -1430,7 +1445,7 @@ const FFCodec ff_wmv3_decoder = {
.init = vc1_decode_init,
.close = ff_vc1_decode_end,
FF_CODEC_DECODE_CB(vc1_decode_frame),
.flush = ff_mpeg_flush,
.flush = vc1_decode_flush,
.p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
.hw_configs = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_WMV3_DXVA2_HWACCEL

View File

@ -29,8 +29,8 @@
#include "version_major.h"
#define LIBAVCODEC_VERSION_MINOR 1
#define LIBAVCODEC_VERSION_MICRO 101
#define LIBAVCODEC_VERSION_MINOR 2
#define LIBAVCODEC_VERSION_MICRO 100
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
LIBAVCODEC_VERSION_MINOR, \

View File

@ -1341,9 +1341,6 @@ static int decode_tiles(AVCodecContext *avctx,
decode_sb_mem(td, row, col, lflvl_ptr,
yoff2, uvoff2, BL_64X64);
} else {
if (vpx_rac_is_end(td->c)) {
return AVERROR_INVALIDDATA;
}
decode_sb(td, row, col, lflvl_ptr,
yoff2, uvoff2, BL_64X64);
}

View File

@ -250,10 +250,10 @@ static void set_sps(const HEVCSPS *sps, int sps_idx,
*vksps_vui_header = (StdVideoH265HrdParameters) {
.flags = (StdVideoH265HrdFlags) {
.nal_hrd_parameters_present_flag = sps->hdr.flags.nal_hrd_parameters_present_flag,
.vcl_hrd_parameters_present_flag = sps->hdr.flags.vcl_hrd_parameters_present_flag,
.sub_pic_hrd_params_present_flag = sps->hdr.flags.sub_pic_hrd_params_present_flag,
.sub_pic_cpb_params_in_pic_timing_sei_flag = sps->hdr.flags.sub_pic_cpb_params_in_pic_timing_sei_flag,
.nal_hrd_parameters_present_flag = sps->hdr.nal_hrd_parameters_present_flag,
.vcl_hrd_parameters_present_flag = sps->hdr.vcl_hrd_parameters_present_flag,
.sub_pic_hrd_params_present_flag = sps->hdr.sub_pic_hrd_params_present_flag,
.sub_pic_cpb_params_in_pic_timing_sei_flag = sps->hdr.sub_pic_cpb_params_in_pic_timing_sei_flag,
.fixed_pic_rate_general_flag = sps->hdr.flags.fixed_pic_rate_general_flag,
.fixed_pic_rate_within_cvs_flag = sps->hdr.flags.fixed_pic_rate_within_cvs_flag,
.low_delay_hrd_flag = sps->hdr.flags.low_delay_hrd_flag,
@ -567,10 +567,10 @@ static void set_vps(const HEVCVPS *vps,
sls_hdr[i] = (StdVideoH265HrdParameters) {
.flags = (StdVideoH265HrdFlags) {
.nal_hrd_parameters_present_flag = src->flags.nal_hrd_parameters_present_flag,
.vcl_hrd_parameters_present_flag = src->flags.vcl_hrd_parameters_present_flag,
.sub_pic_hrd_params_present_flag = src->flags.sub_pic_hrd_params_present_flag,
.sub_pic_cpb_params_in_pic_timing_sei_flag = src->flags.sub_pic_cpb_params_in_pic_timing_sei_flag,
.nal_hrd_parameters_present_flag = src->nal_hrd_parameters_present_flag,
.vcl_hrd_parameters_present_flag = src->vcl_hrd_parameters_present_flag,
.sub_pic_hrd_params_present_flag = src->sub_pic_hrd_params_present_flag,
.sub_pic_cpb_params_in_pic_timing_sei_flag = src->sub_pic_cpb_params_in_pic_timing_sei_flag,
.fixed_pic_rate_general_flag = src->flags.fixed_pic_rate_general_flag,
.fixed_pic_rate_within_cvs_flag = src->flags.fixed_pic_rate_within_cvs_flag,
.low_delay_hrd_flag = src->flags.low_delay_hrd_flag,

View File

@ -96,7 +96,7 @@ static int get_qp_y_pred(const VVCLocalContext *lc)
if (lc->na.cand_up) {
const int first_qg_in_ctu = !(xQg & ctb_size_mask) && !(yQg & ctb_size_mask);
const int qPy_up = fc->tab.qp[LUMA][x_cb + (y_cb - 1) * min_cb_width];
if (first_qg_in_ctu && pps->ctb_to_col_bd[xQg >> ctb_log2_size] == xQg)
if (first_qg_in_ctu && pps->ctb_to_col_bd[xQg >> ctb_log2_size] == xQg >> ctb_log2_size)
return qPy_up;
}

View File

@ -1040,8 +1040,7 @@ const FFCodec ff_vvc_decoder = {
.close = vvc_decode_free,
FF_CODEC_DECODE_CB(vvc_decode_frame),
.flush = vvc_decode_flush,
.p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_OTHER_THREADS |
AV_CODEC_CAP_EXPERIMENTAL,
.p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_OTHER_THREADS,
.caps_internal = FF_CODEC_CAP_EXPORTS_CROPPING | FF_CODEC_CAP_INIT_CLEANUP |
FF_CODEC_CAP_AUTO_THREADS,
.p.profiles = NULL_IF_CONFIG_SMALL(ff_vvc_profiles),

View File

@ -173,7 +173,7 @@ static void set_parser_ctx(AVCodecParserContext *s, AVCodecContext *avctx,
h266_sub_width_c[sps->sps_chroma_format_idc];
s->height = pps->pps_pic_height_in_luma_samples -
(pps->pps_conf_win_top_offset + pps->pps_conf_win_bottom_offset) *
h266_sub_height_c[sps->sps_chroma_format_idc];;
h266_sub_height_c[sps->sps_chroma_format_idc];
avctx->profile = sps->profile_tier_level.general_profile_idc;
avctx->level = sps->profile_tier_level.general_level_idc;
@ -317,7 +317,7 @@ static int get_pu_info(PuInfo *info, const CodedBitstreamH266Context *h266,
}
info->pic_type = get_pict_type(pu);
return 0;
error:
error:
memset(info, 0, sizeof(*info));
return ret;
}
@ -329,7 +329,7 @@ static int append_au(AVPacket *pkt, const uint8_t *buf, int buf_size)
if ((ret = av_grow_packet(pkt, buf_size)) < 0)
goto end;
memcpy(pkt->data + offset, buf, buf_size);
end:
end:
return ret;
}
@ -376,7 +376,7 @@ static int parse_nal_units(AVCodecParserContext *s, const uint8_t *buf,
} else {
ret = 1; //not a completed au
}
end:
end:
ff_cbs_fragment_reset(pu);
return ret;
}

View File

@ -169,7 +169,9 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_mc.o \
x86/h26x/h2656_inter.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
x86/hevc_sao_10bit.o \
x86/hevc_idct_intrinsic.o \
x86/hevc_intra_intrinsic.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
X86ASM-OBJS-$(CONFIG_LSCR_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o

View File

@ -0,0 +1,716 @@
#include "config.h"
#include "libavutil/mem_internal.h"
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
#include "libavcodec/hevc.h"
#include "libavcodec/x86/hevcdsp.h"
#ifdef __GNUC__
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#if HAVE_SSE2
#include <emmintrin.h>
#endif
/* Coefficient table for the 4x4 luma inverse transform (consumed by
 * COMPUTE_LUMA via _mm_madd_epi16): each row holds one coefficient pair
 * replicated four times so a single madd processes four output samples.
 * NOTE(review): values look like the HEVC 4x4 DST constants (29/55/74/84)
 * — confirm against the spec before relying on that reading. */
DECLARE_ALIGNED(16, static const int16_t, transform4x4_luma[8][8] )=
{
{ 29, +84, 29, +84, 29, +84, 29, +84 },
{ +74, +55, +74, +55, +74, +55, +74, +55 },
{ 55, -29, 55, -29, 55, -29, 55, -29 },
{ +74, -84, +74, -84, +74, -84, +74, -84 },
{ 74, -74, 74, -74, 74, -74, 74, -74 },
{ 0, +74, 0, +74, 0, +74, 0, +74 },
{ 84, +55, 84, +55, 84, +55, 84, +55 },
{ -74, -29, -74, -29, -74, -29, -74, -29 }
};
/* 4-point inverse transform coefficients used by COMPUTE4x4: rows 0-1 are
 * the even-part pairs, rows 2-3 the odd-part pairs, each pair replicated
 * four times across the 128-bit vector for _mm_madd_epi16. */
DECLARE_ALIGNED( 16, static const int16_t, transform4x4[4][8] ) = {
{ 64, 64, 64, 64, 64, 64, 64, 64 },
{ 64, -64, 64, -64, 64, -64, 64, -64 },
{ 83, 36, 83, 36, 83, 36, 83, 36 },
{ 36, -83, 36, -83, 36, -83, 36, -83 }
};
DECLARE_ALIGNED(16, static const int16_t, transform8x8[12][1][8] )=
{
{{ 89, 75, 89, 75, 89, 75, 89, 75 }},
{{ 50, 18, 50, 18, 50, 18, 50, 18 }},
{{ 75, -18, 75, -18, 75, -18, 75, -18 }},
{{ -89, -50, -89, -50,-89, -50,-89, -50 }},
{{ 50, -89, 50, -89, 50, -89, 50, -89 }},
{{ 18, 75, 18, 75, 18, 75, 18, 75 }},
{{ 18, -50, 18, -50, 18, -50, 18, -50 }},
{{ 75, -89, 75, -89, 75, -89, 75, -89 }},
{{ 64, 64, 64, 64, 64, 64, 64, 64 }},
{{ 64, -64, 64, -64, 64, -64, 64, -64 }},
{{ 83, 36, 83, 36, 83, 36, 83, 36 }},
{{ 36, -83, 36, -83, 36, -83, 36, -83 }}
};
DECLARE_ALIGNED(16, static const int16_t, transform16x16_1[4][8][8] )=
{
{/*1-3*/ /*2-6*/
{ 90, 87, 90, 87, 90, 87, 90, 87 },
{ 87, 57, 87, 57, 87, 57, 87, 57 },
{ 80, 9, 80, 9, 80, 9, 80, 9 },
{ 70, -43, 70, -43, 70, -43, 70, -43 },
{ 57, -80, 57, -80, 57, -80, 57, -80 },
{ 43, -90, 43, -90, 43, -90, 43, -90 },
{ 25, -70, 25, -70, 25, -70, 25, -70 },
{ 9, -25, 9, -25, 9, -25, 9, -25 },
},{ /*5-7*/ /*10-14*/
{ 80, 70, 80, 70, 80, 70, 80, 70 },
{ 9, -43, 9, -43, 9, -43, 9, -43 },
{ -70, -87, -70, -87, -70, -87, -70, -87 },
{ -87, 9, -87, 9, -87, 9, -87, 9 },
{ -25, 90, -25, 90, -25, 90, -25, 90 },
{ 57, 25, 57, 25, 57, 25, 57, 25 },
{ 90, -80, 90, -80, 90, -80, 90, -80 },
{ 43, -57, 43, -57, 43, -57, 43, -57 },
},{ /*9-11*/ /*18-22*/
{ 57, 43, 57, 43, 57, 43, 57, 43 },
{ -80, -90, -80, -90, -80, -90, -80, -90 },
{ -25, 57, -25, 57, -25, 57, -25, 57 },
{ 90, 25, 90, 25, 90, 25, 90, 25 },
{ -9, -87, -9, -87, -9, -87, -9, -87 },
{ -87, 70, -87, 70, -87, 70, -87, 70 },
{ 43, 9, 43, 9, 43, 9, 43, 9 },
{ 70, -80, 70, -80, 70, -80, 70, -80 },
},{/*13-15*/ /* 26-30 */
{ 25, 9, 25, 9, 25, 9, 25, 9 },
{ -70, -25, -70, -25, -70, -25, -70, -25 },
{ 90, 43, 90, 43, 90, 43, 90, 43 },
{ -80, -57, -80, -57, -80, -57, -80, -57 },
{ 43, 70, 43, 70, 43, 70, 43, 70 },
{ 9, -80, 9, -80, 9, -80, 9, -80 },
{ -57, 87, -57, 87, -57, 87, -57, 87 },
{ 87, -90, 87, -90, 87, -90, 87, -90 },
}
};
DECLARE_ALIGNED(16, static const int16_t, transform32x32[8][16][8] )=
{
{ /* 1-3 */
{ 90, 90, 90, 90, 90, 90, 90, 90 },
{ 90, 82, 90, 82, 90, 82, 90, 82 },
{ 88, 67, 88, 67, 88, 67, 88, 67 },
{ 85, 46, 85, 46, 85, 46, 85, 46 },
{ 82, 22, 82, 22, 82, 22, 82, 22 },
{ 78, -4, 78, -4, 78, -4, 78, -4 },
{ 73, -31, 73, -31, 73, -31, 73, -31 },
{ 67, -54, 67, -54, 67, -54, 67, -54 },
{ 61, -73, 61, -73, 61, -73, 61, -73 },
{ 54, -85, 54, -85, 54, -85, 54, -85 },
{ 46, -90, 46, -90, 46, -90, 46, -90 },
{ 38, -88, 38, -88, 38, -88, 38, -88 },
{ 31, -78, 31, -78, 31, -78, 31, -78 },
{ 22, -61, 22, -61, 22, -61, 22, -61 },
{ 13, -38, 13, -38, 13, -38, 13, -38 },
{ 4, -13, 4, -13, 4, -13, 4, -13 },
},{/* 5-7 */
{ 88, 85, 88, 85, 88, 85, 88, 85 },
{ 67, 46, 67, 46, 67, 46, 67, 46 },
{ 31, -13, 31, -13, 31, -13, 31, -13 },
{ -13, -67, -13, -67, -13, -67, -13, -67 },
{ -54, -90, -54, -90, -54, -90, -54, -90 },
{ -82, -73, -82, -73, -82, -73, -82, -73 },
{ -90, -22, -90, -22, -90, -22, -90, -22 },
{ -78, 38, -78, 38, -78, 38, -78, 38 },
{ -46, 82, -46, 82, -46, 82, -46, 82 },
{ -4, 88, -4, 88, -4, 88, -4, 88 },
{ 38, 54, 38, 54, 38, 54, 38, 54 },
{ 73, -4, 73, -4, 73, -4, 73, -4 },
{ 90, -61, 90, -61, 90, -61, 90, -61 },
{ 85, -90, 85, -90, 85, -90, 85, -90 },
{ 61, -78, 61, -78, 61, -78, 61, -78 },
{ 22, -31, 22, -31, 22, -31, 22, -31 },
},{/* 9-11 */
{ 82, 78, 82, 78, 82, 78, 82, 78 },
{ 22, -4, 22, -4, 22, -4, 22, -4 },
{ -54, -82, -54, -82, -54, -82, -54, -82 },
{ -90, -73, -90, -73, -90, -73, -90, -73 },
{ -61, 13, -61, 13, -61, 13, -61, 13 },
{ 13, 85, 13, 85, 13, 85, 13, 85 },
{ 78, 67, 78, 67, 78, 67, 78, 67 },
{ 85, -22, 85, -22, 85, -22, 85, -22 },
{ 31, -88, 31, -88, 31, -88, 31, -88 },
{ -46, -61, -46, -61, -46, -61, -46, -61 },
{ -90, 31, -90, 31, -90, 31, -90, 31 },
{ -67, 90, -67, 90, -67, 90, -67, 90 },
{ 4, 54, 4, 54, 4, 54, 4, 54 },
{ 73, -38, 73, -38, 73, -38, 73, -38 },
{ 88, -90, 88, -90, 88, -90, 88, -90 },
{ 38, -46, 38, -46, 38, -46, 38, -46 },
},{/* 13-15 */
{ 73, 67, 73, 67, 73, 67, 73, 67 },
{ -31, -54, -31, -54, -31, -54, -31, -54 },
{ -90, -78, -90, -78, -90, -78, -90, -78 },
{ -22, 38, -22, 38, -22, 38, -22, 38 },
{ 78, 85, 78, 85, 78, 85, 78, 85 },
{ 67, -22, 67, -22, 67, -22, 67, -22 },
{ -38, -90, -38, -90, -38, -90, -38, -90 },
{ -90, 4, -90, 4, -90, 4, -90, 4 },
{ -13, 90, -13, 90, -13, 90, -13, 90 },
{ 82, 13, 82, 13, 82, 13, 82, 13 },
{ 61, -88, 61, -88, 61, -88, 61, -88 },
{ -46, -31, -46, -31, -46, -31, -46, -31 },
{ -88, 82, -88, 82, -88, 82, -88, 82 },
{ -4, 46, -4, 46, -4, 46, -4, 46 },
{ 85, -73, 85, -73, 85, -73, 85, -73 },
{ 54, -61, 54, -61, 54, -61, 54, -61 },
},{/* 17-19 */
{ 61, 54, 61, 54, 61, 54, 61, 54 },
{ -73, -85, -73, -85, -73, -85, -73, -85 },
{ -46, -4, -46, -4, -46, -4, -46, -4 },
{ 82, 88, 82, 88, 82, 88, 82, 88 },
{ 31, -46, 31, -46, 31, -46, 31, -46 },
{ -88, -61, -88, -61, -88, -61, -88, -61 },
{ -13, 82, -13, 82, -13, 82, -13, 82 },
{ 90, 13, 90, 13, 90, 13, 90, 13 },
{ -4, -90, -4, -90, -4, -90, -4, -90 },
{ -90, 38, -90, 38, -90, 38, -90, 38 },
{ 22, 67, 22, 67, 22, 67, 22, 67 },
{ 85, -78, 85, -78, 85, -78, 85, -78 },
{ -38, -22, -38, -22, -38, -22, -38, -22 },
{ -78, 90, -78, 90, -78, 90, -78, 90 },
{ 54, -31, 54, -31, 54, -31, 54, -31 },
{ 67, -73, 67, -73, 67, -73, 67, -73 },
},{ /* 21-23 */
{ 46, 38, 46, 38, 46, 38, 46, 38 },
{ -90, -88, -90, -88, -90, -88, -90, -88 },
{ 38, 73, 38, 73, 38, 73, 38, 73 },
{ 54, -4, 54, -4, 54, -4, 54, -4 },
{ -90, -67, -90, -67, -90, -67, -90, -67 },
{ 31, 90, 31, 90, 31, 90, 31, 90 },
{ 61, -46, 61, -46, 61, -46, 61, -46 },
{ -88, -31, -88, -31, -88, -31, -88, -31 },
{ 22, 85, 22, 85, 22, 85, 22, 85 },
{ 67, -78, 67, -78, 67, -78, 67, -78 },
{ -85, 13, -85, 13, -85, 13, -85, 13 },
{ 13, 61, 13, 61, 13, 61, 13, 61 },
{ 73, -90, 73, -90, 73, -90, 73, -90 },
{ -82, 54, -82, 54, -82, 54, -82, 54 },
{ 4, 22, 4, 22, 4, 22, 4, 22 },
{ 78, -82, 78, -82, 78, -82, 78, -82 },
},{ /* 25-27 */
{ 31, 22, 31, 22, 31, 22, 31, 22 },
{ -78, -61, -78, -61, -78, -61, -78, -61 },
{ 90, 85, 90, 85, 90, 85, 90, 85 },
{ -61, -90, -61, -90, -61, -90, -61, -90 },
{ 4, 73, 4, 73, 4, 73, 4, 73 },
{ 54, -38, 54, -38, 54, -38, 54, -38 },
{ -88, -4, -88, -4, -88, -4, -88, -4 },
{ 82, 46, 82, 46, 82, 46, 82, 46 },
{ -38, -78, -38, -78, -38, -78, -38, -78 },
{ -22, 90, -22, 90, -22, 90, -22, 90 },
{ 73, -82, 73, -82, 73, -82, 73, -82 },
{ -90, 54, -90, 54, -90, 54, -90, 54 },
{ 67, -13, 67, -13, 67, -13, 67, -13 },
{ -13, -31, -13, -31, -13, -31, -13, -31 },
{ -46, 67, -46, 67, -46, 67, -46, 67 },
{ 85, -88, 85, -88, 85, -88, 85, -88 },
},{/* 29-31 */
{ 13, 4, 13, 4, 13, 4, 13, 4 },
{ -38, -13, -38, -13, -38, -13, -38, -13 },
{ 61, 22, 61, 22, 61, 22, 61, 22 },
{ -78, -31, -78, -31, -78, -31, -78, -31 },
{ 88, 38, 88, 38, 88, 38, 88, 38 },
{ -90, -46, -90, -46, -90, -46, -90, -46 },
{ 85, 54, 85, 54, 85, 54, 85, 54 },
{ -73, -61, -73, -61, -73, -61, -73, -61 },
{ 54, 67, 54, 67, 54, 67, 54, 67 },
{ -31, -73, -31, -73, -31, -73, -31, -73 },
{ 4, 78, 4, 78, 4, 78, 4, 78 },
{ 22, -82, 22, -82, 22, -82, 22, -82 },
{ -46, 85, -46, 85, -46, 85, -46, 85 },
{ 67, -88, 67, -88, 67, -88, 67, -88 },
{ -82, 90, -82, 90, -82, 90, -82, 90 },
{ 90, -90, 90, -90, 90, -90, 90, -90 },
}
};
#define shift_1st 7
#define add_1st (1 << (shift_1st - 1))
#define CLIP_PIXEL_MAX_10 0x03FF
#define CLIP_PIXEL_MAX_12 0x0FFF
#if HAVE_SSE2
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#define INIT_8() \
uint8_t *dst = (uint8_t*) _dst; \
ptrdiff_t stride = _stride
#define INIT_10() \
uint16_t *dst = (uint16_t*) _dst; \
ptrdiff_t stride = _stride>>1
#define INIT_12() INIT_10()
#define INIT8_12() INIT8_10()
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#define LOAD_EMPTY(dst, src)
#define LOAD4x4(dst, src) \
dst ## 0 = _mm_load_si128((__m128i *) &src[0]); \
dst ## 1 = _mm_load_si128((__m128i *) &src[8])
#define LOAD4x4_STEP(dst, src, sstep) \
tmp0 = _mm_loadl_epi64((__m128i *) &src[0 * sstep]); \
tmp1 = _mm_loadl_epi64((__m128i *) &src[1 * sstep]); \
tmp2 = _mm_loadl_epi64((__m128i *) &src[2 * sstep]); \
tmp3 = _mm_loadl_epi64((__m128i *) &src[3 * sstep]); \
dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp2); \
dst ## 1 = _mm_unpacklo_epi16(tmp1, tmp3)
#define LOAD8x8_E(dst, src, sstep) \
dst ## 0 = _mm_load_si128((__m128i *) &src[0 * sstep]); \
dst ## 1 = _mm_load_si128((__m128i *) &src[1 * sstep]); \
dst ## 2 = _mm_load_si128((__m128i *) &src[2 * sstep]); \
dst ## 3 = _mm_load_si128((__m128i *) &src[3 * sstep])
#define LOAD8x8_O(dst, src, sstep) \
tmp0 = _mm_load_si128((__m128i *) &src[1 * sstep]); \
tmp1 = _mm_load_si128((__m128i *) &src[3 * sstep]); \
tmp2 = _mm_load_si128((__m128i *) &src[5 * sstep]); \
tmp3 = _mm_load_si128((__m128i *) &src[7 * sstep]); \
dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1); \
dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1); \
dst ## 2 = _mm_unpacklo_epi16(tmp2, tmp3); \
dst ## 3 = _mm_unpackhi_epi16(tmp2, tmp3)
#define LOAD16x16_O(dst, src, sstep) \
LOAD8x8_O(dst, src, sstep); \
tmp0 = _mm_load_si128((__m128i *) &src[ 9 * sstep]); \
tmp1 = _mm_load_si128((__m128i *) &src[11 * sstep]); \
tmp2 = _mm_load_si128((__m128i *) &src[13 * sstep]); \
tmp3 = _mm_load_si128((__m128i *) &src[15 * sstep]); \
dst ## 4 = _mm_unpacklo_epi16(tmp0, tmp1); \
dst ## 5 = _mm_unpackhi_epi16(tmp0, tmp1); \
dst ## 6 = _mm_unpacklo_epi16(tmp2, tmp3); \
dst ## 7 = _mm_unpackhi_epi16(tmp2, tmp3)
#define LOAD_8x32(dst, dst_stride, src0, src1, idx) \
src0 = _mm_load_si128((__m128i *) &dst[idx*dst_stride]); \
src1 = _mm_load_si128((__m128i *) &dst[idx*dst_stride+4])
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#define ASSIGN_EMPTY(dst, dst_stride, src)
#define SAVE_8x16(dst, dst_stride, src) \
_mm_store_si128((__m128i *) dst, src); \
dst += dst_stride
#define SAVE_8x32(dst, dst_stride, src0, src1, idx) \
_mm_store_si128((__m128i *) &dst[idx*dst_stride] , src0); \
_mm_store_si128((__m128i *) &dst[idx*dst_stride+4], src1)
#define ASSIGN2(dst, dst_stride, src0, src1, assign) \
assign(dst, dst_stride, src0); \
assign(dst, dst_stride, _mm_srli_si128(src0, 8)); \
assign(dst, dst_stride, src1); \
assign(dst, dst_stride, _mm_srli_si128(src1, 8))
#define ASSIGN4(dst, dst_stride, src0, src1, src2, src3, assign) \
assign(dst, dst_stride, src0); \
assign(dst, dst_stride, src1); \
assign(dst, dst_stride, src2); \
assign(dst, dst_stride, src3)
#define ASSIGN4_LO(dst, dst_stride, src, assign) \
ASSIGN4(dst, dst_stride, src ## 0, src ## 1, src ## 2, src ## 3, assign)
#define ASSIGN4_HI(dst, dst_stride, src, assign) \
ASSIGN4(dst, dst_stride, src ## 4, src ## 5, src ## 6, src ## 7, assign)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#define TRANSPOSE4X4_16(dst) \
tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1); \
tmp1 = _mm_unpackhi_epi16(dst ## 0, dst ## 1); \
dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1); \
dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1)
#define TRANSPOSE4X4_16_S(dst, dst_stride, src, assign) \
TRANSPOSE4X4_16(src); \
ASSIGN2(dst, dst_stride, src ## 0, src ## 1, assign)
#define TRANSPOSE8X8_16(dst) \
tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1); \
tmp1 = _mm_unpacklo_epi16(dst ## 2, dst ## 3); \
tmp2 = _mm_unpacklo_epi16(dst ## 4, dst ## 5); \
tmp3 = _mm_unpacklo_epi16(dst ## 6, dst ## 7); \
src0 = _mm_unpacklo_epi32(tmp0, tmp1); \
src1 = _mm_unpacklo_epi32(tmp2, tmp3); \
src2 = _mm_unpackhi_epi32(tmp0, tmp1); \
src3 = _mm_unpackhi_epi32(tmp2, tmp3); \
tmp0 = _mm_unpackhi_epi16(dst ## 0, dst ## 1); \
tmp1 = _mm_unpackhi_epi16(dst ## 2, dst ## 3); \
tmp2 = _mm_unpackhi_epi16(dst ## 4, dst ## 5); \
tmp3 = _mm_unpackhi_epi16(dst ## 6, dst ## 7); \
dst ## 0 = _mm_unpacklo_epi64(src0 , src1); \
dst ## 1 = _mm_unpackhi_epi64(src0 , src1); \
dst ## 2 = _mm_unpacklo_epi64(src2 , src3); \
dst ## 3 = _mm_unpackhi_epi64(src2 , src3); \
src0 = _mm_unpacklo_epi32(tmp0, tmp1); \
src1 = _mm_unpacklo_epi32(tmp2, tmp3); \
src2 = _mm_unpackhi_epi32(tmp0, tmp1); \
src3 = _mm_unpackhi_epi32(tmp2, tmp3); \
dst ## 4 = _mm_unpacklo_epi64(src0 , src1); \
dst ## 5 = _mm_unpackhi_epi64(src0 , src1); \
dst ## 6 = _mm_unpacklo_epi64(src2 , src3); \
dst ## 7 = _mm_unpackhi_epi64(src2 , src3)
#define TRANSPOSE8x8_16_S(out, sstep_out, src, assign) \
TRANSPOSE8X8_16(src); \
p_dst = out; \
ASSIGN4_LO(p_dst, sstep_out, src, assign); \
ASSIGN4_HI(p_dst, sstep_out, src, assign)
#define TRANSPOSE8x8_16_LS(out, sstep_out, in, sstep_in, assign) \
e0 = _mm_load_si128((__m128i *) &in[0*sstep_in]); \
e1 = _mm_load_si128((__m128i *) &in[1*sstep_in]); \
e2 = _mm_load_si128((__m128i *) &in[2*sstep_in]); \
e3 = _mm_load_si128((__m128i *) &in[3*sstep_in]); \
e4 = _mm_load_si128((__m128i *) &in[4*sstep_in]); \
e5 = _mm_load_si128((__m128i *) &in[5*sstep_in]); \
e6 = _mm_load_si128((__m128i *) &in[6*sstep_in]); \
e7 = _mm_load_si128((__m128i *) &in[7*sstep_in]); \
TRANSPOSE8x8_16_S(out, sstep_out, e, assign)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#define TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform)\
tmp1 = _mm_load_si128((__m128i *) transform[i ][j]); \
tmp3 = _mm_load_si128((__m128i *) transform[i+1][j]); \
tmp0 = _mm_madd_epi16(src0, tmp1); \
tmp1 = _mm_madd_epi16(src1, tmp1); \
tmp2 = _mm_madd_epi16(src2, tmp3); \
tmp3 = _mm_madd_epi16(src3, tmp3); \
dst1 = _mm_add_epi32(tmp0, tmp2); \
dst2 = _mm_add_epi32(tmp1, tmp3)
#define SCALE8x8_2x32(dst0, src0, src1) \
src0 = _mm_srai_epi32(src0, shift); \
src1 = _mm_srai_epi32(src1, shift); \
dst0 = _mm_packs_epi32(src0, src1)
#define SCALE_4x32(dst0, dst1, src0, src1, src2, src3) \
SCALE8x8_2x32(dst0, src0, src1); \
SCALE8x8_2x32(dst1, src2, src3)
#define SCALE16x16_2x32(dst, dst_stride, src0, src1, j) \
e0 = _mm_load_si128((__m128i *) &o16[j*8+0]); \
e7 = _mm_load_si128((__m128i *) &o16[j*8+4]); \
tmp4 = _mm_add_epi32(src0, e0); \
src0 = _mm_sub_epi32(src0, e0); \
e0 = _mm_add_epi32(src1, e7); \
src1 = _mm_sub_epi32(src1, e7); \
SCALE_4x32(e0, e7, tmp4, e0, src0, src1); \
_mm_store_si128((__m128i *) &dst[dst_stride*( j)] , e0); \
_mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)] , e7)
#define SCALE32x32_2x32(dst, dst_stride, j) \
e0 = _mm_load_si128((__m128i *) &e32[j*16+0]); \
e1 = _mm_load_si128((__m128i *) &e32[j*16+4]); \
e4 = _mm_load_si128((__m128i *) &o32[j*16+0]); \
e5 = _mm_load_si128((__m128i *) &o32[j*16+4]); \
tmp0 = _mm_add_epi32(e0, e4); \
tmp1 = _mm_add_epi32(e1, e5); \
tmp2 = _mm_sub_epi32(e1, e5); \
tmp3 = _mm_sub_epi32(e0, e4); \
SCALE_4x32(tmp0, tmp1, tmp0, tmp1, tmp3, tmp2); \
_mm_store_si128((__m128i *) &dst[dst_stride*i+0] , tmp0); \
_mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-i)+0] , tmp1)
#define SAVE16x16_2x32(dst, dst_stride, src0, src1, j) \
e0 = _mm_load_si128((__m128i *) &o16[j*8+0]); \
e7 = _mm_load_si128((__m128i *) &o16[j*8+4]); \
tmp4 = _mm_add_epi32(src0, e0); \
src0 = _mm_sub_epi32(src0, e0); \
e0 = _mm_add_epi32(src1, e7); \
src1 = _mm_sub_epi32(src1, e7); \
_mm_store_si128((__m128i *) &dst[dst_stride*( j)] , tmp4); \
_mm_store_si128((__m128i *) &dst[dst_stride*( j)+4], e0); \
_mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)] , src0); \
_mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)+4], src1)
#define SCALE8x8_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \
SCALE8x8_2x32(dst0, src0, src1)
#define SCALE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \
SCALE16x16_2x32(dst, dst_stride, src0, src1, idx)
#define SAVE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \
SAVE16x16_2x32(dst, dst_stride, src0, src1, idx)
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_4x4_luma_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define COMPUTE_LUMA(dst , idx) \
tmp0 = _mm_load_si128((__m128i *) (transform4x4_luma[idx ])); \
tmp1 = _mm_load_si128((__m128i *) (transform4x4_luma[idx+1])); \
tmp0 = _mm_madd_epi16(src0, tmp0); \
tmp1 = _mm_madd_epi16(src1, tmp1); \
dst = _mm_add_epi32(tmp0, tmp1); \
dst = _mm_add_epi32(dst, add); \
dst = _mm_srai_epi32(dst, shift)
#define COMPUTE_LUMA_ALL() \
add = _mm_set1_epi32(1 << (shift - 1)); \
src0 = _mm_unpacklo_epi16(tmp0, tmp1); \
src1 = _mm_unpackhi_epi16(tmp0, tmp1); \
COMPUTE_LUMA(res2 , 0); \
COMPUTE_LUMA(res3 , 2); \
res0 = _mm_packs_epi32(res2, res3); \
COMPUTE_LUMA(res2 , 4); \
COMPUTE_LUMA(res3 , 6); \
res1 = _mm_packs_epi32(res2, res3)
/* Generates ff_hevc_transform_4x4_luma_<D>_sse2 for bit depth D: performs
 * the 4x4 luma inverse transform in place on _coeffs. Two passes of
 * COMPUTE_LUMA_ALL(): pass 1 with shift = 7, then the intermediate is
 * re-interleaved, pass 2 runs with shift = 20 - D, and the transposed
 * result is stored back over the input coefficients. */
#define TRANSFORM_LUMA(D) \
void ff_hevc_transform_4x4_luma ## _ ## D ## _sse2(int16_t *_coeffs) { \
uint8_t shift = 7; \
int16_t *src = _coeffs; \
int16_t *coeffs = _coeffs; \
__m128i res0, res1, res2, res3; \
__m128i tmp0, tmp1, src0, src1, add; \
LOAD4x4(tmp, src); \
COMPUTE_LUMA_ALL(); \
shift = 20 - D; \
res2 = _mm_unpacklo_epi16(res0, res1); \
res3 = _mm_unpackhi_epi16(res0, res1); \
tmp0 = _mm_unpacklo_epi16(res2, res3); \
tmp1 = _mm_unpackhi_epi16(res2, res3); \
COMPUTE_LUMA_ALL(); \
TRANSPOSE4X4_16(res); \
_mm_store_si128((__m128i *) coeffs , res0); \
_mm_store_si128((__m128i *) (coeffs + 8), res1); \
}
TRANSFORM_LUMA( 8);
TRANSFORM_LUMA( 10);
TRANSFORM_LUMA( 12);
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_4x4_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define COMPUTE4x4(dst0, dst1, dst2, dst3) \
tmp0 = _mm_load_si128((__m128i *) transform4x4[0]); \
tmp1 = _mm_load_si128((__m128i *) transform4x4[1]); \
tmp2 = _mm_load_si128((__m128i *) transform4x4[2]); \
tmp3 = _mm_load_si128((__m128i *) transform4x4[3]); \
tmp0 = _mm_madd_epi16(e6, tmp0); \
tmp1 = _mm_madd_epi16(e6, tmp1); \
tmp2 = _mm_madd_epi16(e7, tmp2); \
tmp3 = _mm_madd_epi16(e7, tmp3); \
e6 = _mm_set1_epi32(add); \
tmp0 = _mm_add_epi32(tmp0, e6); \
tmp1 = _mm_add_epi32(tmp1, e6); \
dst0 = _mm_add_epi32(tmp0, tmp2); \
dst1 = _mm_add_epi32(tmp1, tmp3); \
dst2 = _mm_sub_epi32(tmp1, tmp3); \
dst3 = _mm_sub_epi32(tmp0, tmp2)
#define COMPUTE4x4_LO() \
COMPUTE4x4(e0, e1, e2, e3)
/* Odd-half 4-point butterfly, writing into e7..e4.
 * Fix: the macro declared a parameter `dst` it never used, while its only
 * call site (TR_4_set8x4) invokes COMPUTE4x4_HI() with no argument; drop
 * the phantom parameter so declaration and use agree. */
#define COMPUTE4x4_HI() \
    COMPUTE4x4(e7, e6, e5, e4)
/* 4-point inverse transform core: load 4x4 coefficients (via `load`),
 * interleave, run the even/odd butterfly, scale, then transpose and hand
 * the rows to `assign`.
 * Fix: the body previously ended with a dangling line-continuation `\`,
 * which would splice the following #define into this macro's replacement
 * list; the trailing backslash is removed. */
#define TR_4(dst, dst_stride, in, sstep, load, assign)                         \
    load(e, in);                                                               \
    e6 = _mm_unpacklo_epi16(e0, e1);                                           \
    e7 = _mm_unpackhi_epi16(e0, e1);                                           \
    COMPUTE4x4_LO();                                                           \
    SCALE_4x32(e0, e1, e0, e1, e2, e3);                                        \
    TRANSPOSE4X4_16_S(dst, dst_stride, e, assign)
#define TR_4_1( dst, dst_stride, src) TR_4( dst, dst_stride, src, 4, LOAD4x4, ASSIGN_EMPTY)
#define TR_4_2( dst, dst_stride, src, D) TR_4( dst, dst_stride, src, 4, LOAD_EMPTY, ASSIGN_EMPTY)
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_8x8_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define TR_4_set8x4(in, sstep) \
LOAD8x8_E(src, in, sstep); \
e6 = _mm_unpacklo_epi16(src0, src2); \
e7 = _mm_unpacklo_epi16(src1, src3); \
COMPUTE4x4_LO(); \
e6 = _mm_unpackhi_epi16(src0, src2); \
e7 = _mm_unpackhi_epi16(src1, src3); \
COMPUTE4x4_HI()
#define TR_COMPUTE8x8(e0, e1, i) \
TR_COMPUTE_TRANFORM(tmp2, tmp3, src0, src1, src2, src3, i, 0, transform8x8);\
tmp0 = _mm_add_epi32(e0, tmp2); \
tmp1 = _mm_add_epi32(e1, tmp3); \
tmp3 = _mm_sub_epi32(e1, tmp3); \
tmp2 = _mm_sub_epi32(e0, tmp2)
/* 8-point inverse transform: even part from TR_4_set8x4 (stride doubled),
 * odd part from the transform8x8 table via TR_COMPUTE8x8, then each
 * add/sub pair is handed to `assign` with its output row index.
 * Fix: the body previously ended with a dangling line-continuation `\`,
 * which would splice the following #define into this macro's replacement
 * list; the trailing backslash is removed. */
#define TR_8(dst, dst_stride, in, sstep, assign)                               \
    TR_4_set8x4(in, 2 * sstep);                                                \
    LOAD8x8_O(src, in, sstep);                                                 \
    TR_COMPUTE8x8(e0, e7, 0);                                                  \
    assign(dst, dst_stride, e0, tmp0, tmp1, 0);                                \
    assign(dst, dst_stride, e7, tmp2, tmp3, 7);                                \
    TR_COMPUTE8x8(e1, e6, 2);                                                  \
    assign(dst, dst_stride, e1, tmp0, tmp1, 1);                                \
    assign(dst, dst_stride, e6, tmp2, tmp3, 6);                                \
    TR_COMPUTE8x8(e2, e5, 4);                                                  \
    assign(dst, dst_stride, e2, tmp0, tmp1, 2);                                \
    assign(dst, dst_stride, e5, tmp2, tmp3, 5);                                \
    TR_COMPUTE8x8(e3, e4, 6);                                                  \
    assign(dst, dst_stride, e3, tmp0, tmp1, 3);                                \
    assign(dst, dst_stride, e4, tmp2, tmp3, 4);
#define TR_8_1( dst, dst_stride, src) \
TR_8( dst, dst_stride, src, 8, SCALE8x8_2x32_WRAPPER); \
TRANSPOSE8x8_16_S(dst, dst_stride, e, SAVE_8x16)
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_XxX_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define TRANSFORM_4x4(D) \
void ff_hevc_transform_4x4_ ## D ## _sse2 (int16_t *_coeffs, int col_limit) { \
int16_t *src = _coeffs; \
int16_t *coeffs = _coeffs; \
int shift = 7; \
int add = 1 << (shift - 1); \
__m128i tmp0, tmp1, tmp2, tmp3; \
__m128i e0, e1, e2, e3, e6, e7; \
TR_4_1(p_dst1, 4, src); \
shift = 20 - D; \
add = 1 << (shift - 1); \
TR_4_2(coeffs, 8, tmp, D); \
_mm_store_si128((__m128i *) coeffs , e0); \
_mm_store_si128((__m128i *) (coeffs + 8), e1); \
}
#define TRANSFORM_8x8(D) \
void ff_hevc_transform_8x8_ ## D ## _sse2 (int16_t *coeffs, int col_limit) { \
DECLARE_ALIGNED(16, int16_t, tmp[8*8]); \
int16_t *src = coeffs; \
int16_t *p_dst1 = tmp; \
int16_t *p_dst; \
int shift = 7; \
int add = 1 << (shift - 1); \
__m128i src0, src1, src2, src3; \
__m128i tmp0, tmp1, tmp2, tmp3; \
__m128i e0, e1, e2, e3, e4, e5, e6, e7; \
TR_8_1(p_dst1, 8, src); \
shift = 20 - D; \
add = 1 << (shift - 1); \
TR_8_1(coeffs, 8, tmp); \
}
TRANSFORM_4x4(12)
TRANSFORM_8x8(12)
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_16x16_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define TR_COMPUTE16x16(dst1, dst2,src0, src1, src2, src3, i, j) \
TR_COMPUTE_TRANFORM(dst1, dst2,src0, src1, src2, src3, i, j, transform16x16_1)
#define TR_COMPUTE16x16_FIRST(j) \
TR_COMPUTE16x16(src0, src1, e0, e1, e2, e3, 0, j)
#define TR_COMPUTE16x16_NEXT(i, j) \
TR_COMPUTE16x16(tmp0, tmp1, e4, e5, e6, e7, i, j); \
src0 = _mm_add_epi32(src0, tmp0); \
src1 = _mm_add_epi32(src1, tmp1)
#define TR_16(dst, dst_stride, in, sstep, assign) \
{ \
int i; \
int o16[8*8]; \
LOAD16x16_O(e, in, sstep); \
for (i = 0; i < 8; i++) { \
TR_COMPUTE16x16_FIRST(i); \
TR_COMPUTE16x16_NEXT(2, i); \
SAVE_8x32(o16, 8, src0, src1, i); \
} \
TR_8(dst, dst_stride, in, 2 * sstep, assign); \
}
#define TR_16_1( dst, dst_stride, src) TR_16( dst, dst_stride, src, 16, SCALE16x16_2x32_WRAPPER)
#define TR_16_2( dst, dst_stride, src, sstep) TR_16( dst, dst_stride, src, sstep, SAVE16x16_2x32_WRAPPER )
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_32x32_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define TR_COMPUTE32x32(dst1, dst2,src0, src1, src2, src3, i, j) \
TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform32x32)
#define TR_COMPUTE32x32_FIRST(i, j) \
TR_COMPUTE32x32(tmp0, tmp1, e0, e1, e2, e3, i, j); \
src0 = _mm_add_epi32(src0, tmp0); \
src1 = _mm_add_epi32(src1, tmp1)
#define TR_COMPUTE32x32_NEXT(i, j) \
TR_COMPUTE32x32(tmp0, tmp1, e4, e5, e6, e7, i, j); \
src0 = _mm_add_epi32(src0, tmp0); \
src1 = _mm_add_epi32(src1, tmp1)
#define TR_32(dst, dst_stride, in, sstep) \
{ \
int i; \
DECLARE_ALIGNED(16, int, e32[16*16]); \
DECLARE_ALIGNED(16, int, o32[16*16]); \
LOAD16x16_O(e, in, sstep); \
for (i = 0; i < 16; i++) { \
src0 = _mm_setzero_si128(); \
src1 = _mm_setzero_si128(); \
TR_COMPUTE32x32_FIRST(0, i); \
TR_COMPUTE32x32_NEXT(2, i); \
SAVE_8x32(o32, 16, src0, src1, i); \
} \
LOAD16x16_O(e, (&in[16*sstep]), sstep); \
for (i = 0; i < 16; i++) { \
LOAD_8x32(o32, 16, src0, src1, i); \
TR_COMPUTE32x32_FIRST(4, i); \
TR_COMPUTE32x32_NEXT(6, i); \
SAVE_8x32(o32, 16, src0, src1, i); \
} \
TR_16_2(e32, 16, in, 2 * sstep); \
for (i = 0; i < 16; i++) { \
SCALE32x32_2x32(dst, dst_stride, i); \
} \
}
#define TR_32_1( dst, dst_stride, src) TR_32( dst, dst_stride, src, 32)
////////////////////////////////////////////////////////////////////////////////
// ff_hevc_transform_XxX_X_sse2
////////////////////////////////////////////////////////////////////////////////
#define TRANSFORM2(H, D) \
void ff_hevc_transform_ ## H ## x ## H ## _ ## D ## _sse2 ( \
int16_t *coeffs, int col_limit) { \
int i, j, k, add; \
int shift = 7; \
int16_t *src = coeffs; \
DECLARE_ALIGNED(16, int16_t, tmp[H*H]); \
DECLARE_ALIGNED(16, int16_t, tmp_2[H*H]); \
int16_t *p_dst, *p_tra = tmp_2; \
__m128i src0, src1, src2, src3; \
__m128i tmp0, tmp1, tmp2, tmp3, tmp4; \
__m128i e0, e1, e2, e3, e4, e5, e6, e7; \
for (k = 0; k < 2; k++) { \
add = 1 << (shift - 1); \
for (i = 0; i < H; i+=8) { \
p_dst = tmp + i; \
TR_ ## H ## _1(p_dst, H, src); \
src += 8; \
for (j = 0; j < H; j+=8) { \
TRANSPOSE8x8_16_LS((&p_tra[i*H+j]), H, (&tmp[j*H+i]), H, SAVE_8x16);\
} \
} \
src = tmp_2; \
p_tra = coeffs; \
shift = 20 - D; \
} \
}
#if !ARCH_X86_64
TRANSFORM2(16, 8);
TRANSFORM2(16, 10);
#endif
TRANSFORM2(16, 12);
#if !ARCH_X86_64
TRANSFORM2(32, 8);
TRANSFORM2(32, 10);
#endif
TRANSFORM2(32, 12);
#endif
#ifdef __GNUC__
#pragma GCC pop_options
#endif

View File

@ -0,0 +1,922 @@
#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
#include "libavcodec/get_bits.h"
#include "libavcodec/hevc.h"
#include "libavcodec/x86/hevcpred.h"
#ifdef __GNUC__
#pragma GCC push_options
#pragma GCC target("sse4.1")
#endif
#if HAVE_SSE2
#include <emmintrin.h>
#endif
#if HAVE_SSSE3
#include <tmmintrin.h>
#endif
#if HAVE_SSE4
#include <smmintrin.h>
#endif
#if HAVE_SSE4
#define _MM_PACKUS_EPI32 _mm_packus_epi32
#else
/* SSE2 fallback for _mm_packus_epi32: sign-extend the low 16 bits of every
 * 32-bit lane (logical shift up, arithmetic shift down), then signed-pack
 * both inputs into one vector of 16-bit values.
 * NOTE(review): this matches the real SSE4.1 intrinsic only when the low
 * 16 bits of each lane already carry the intended value — presumably the
 * callers in this file guarantee that; confirm before reusing elsewhere. */
static av_always_inline __m128i _MM_PACKUS_EPI32( __m128i a, __m128i b )
{
    __m128i lo = _mm_srai_epi32(_mm_slli_epi32(a, 16), 16);
    __m128i hi = _mm_srai_epi32(_mm_slli_epi32(b, 16), 16);
    return _mm_packs_epi32(lo, hi);
}
#endif
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#if HAVE_SSE4
#define PLANAR_INIT_8() \
uint8_t *src = (uint8_t*)_src; \
const uint8_t *top = (const uint8_t*)_top; \
const uint8_t *left = (const uint8_t*)_left
#define PLANAR_INIT_10() \
uint16_t *src = (uint16_t*)_src; \
const uint16_t *top = (const uint16_t*)_top; \
const uint16_t *left = (const uint16_t*)_left
#define PLANAR_COMPUTE(val, shift) \
add = _mm_mullo_epi16(_mm_set1_epi16(1+y), l0); \
ly1 = _mm_unpacklo_epi16(ly , ly ); \
ly1 = _mm_unpacklo_epi32(ly1, ly1); \
ly1 = _mm_unpacklo_epi64(ly1, ly1); \
c0 = _mm_mullo_epi16(tmp1, ly1); \
x0 = _mm_mullo_epi16(_mm_set1_epi16(val - y), tx); \
c0 = _mm_add_epi16(c0, c1); \
x0 = _mm_add_epi16(x0, c0); \
x0 = _mm_add_epi16(x0, add); \
c0 = _mm_srli_epi16(x0, shift)
#define PLANAR_COMPUTE_HI(val, shift) \
C0 = _mm_mullo_epi16(tmp2, ly1); \
x0 = _mm_mullo_epi16(_mm_set1_epi16(val - y), th); \
C0 = _mm_add_epi16(C0, C1); \
x0 = _mm_add_epi16(x0, C0); \
x0 = _mm_add_epi16(x0, add); \
C0 = _mm_srli_epi16(x0, shift)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Load the 4 left and 4 top reference pixels for the 4x4 case and widen
 * them to 16-bit lanes; the left pixels are additionally duplicated so
 * two rows can be processed per register. */
#define PLANAR_LOAD_0_8() \
ly = _mm_loadl_epi64((__m128i*) left); \
tx = _mm_loadl_epi64((__m128i*) top); \
ly = _mm_unpacklo_epi8(ly, _mm_setzero_si128()); \
tx = _mm_unpacklo_epi8(tx, _mm_setzero_si128()); \
ly = _mm_unpacklo_epi16(ly, ly); \
tx = _mm_unpacklo_epi64(tx, tx)
#define PLANAR_LOAD_0_10() \
ly = _mm_loadl_epi64((__m128i*) left); \
tx = _mm_loadl_epi64((__m128i*) top); \
ly = _mm_unpacklo_epi16(ly, ly); \
tx = _mm_unpacklo_epi64(tx, tx)
/* 4x4 planar kernel: computes two output rows at once into `dst`.
 * v1/v2 are the (3 - y) top-right weights for the two rows, v3/v4 the
 * (1 + y) bottom-left weights; final >> 3 is the 4x4 rounding shift. */
#define PLANAR_COMPUTE_0(dst , v1, v2, v3, v4) \
dst = _mm_mullo_epi16(tmp1, ly1); \
x0 = _mm_mullo_epi16(_mm_set_epi16(v1,v1,v1,v1,v2,v2,v2,v2), tx); \
add = _mm_mullo_epi16(_mm_set_epi16(v3,v3,v3,v3,v4,v4,v4,v4), l0); \
dst = _mm_add_epi16(dst, c1); \
x0 = _mm_add_epi16(x0, add); \
dst = _mm_add_epi16(dst, x0); \
dst = _mm_srli_epi16(dst, 3)
/* Scatter the four 4-pixel rows held in c0/C0 to the strided destination. */
#define PLANAR_STORE_0_8() \
c0 = _mm_packus_epi16(c0,C0); \
*((uint32_t *) src ) = _mm_cvtsi128_si32(c0 ); \
*((uint32_t *)(src + stride)) = _mm_extract_epi32(c0, 1); \
*((uint32_t *)(src + 2 * stride)) = _mm_extract_epi32(c0, 2); \
*((uint32_t *)(src + 3 * stride)) = _mm_extract_epi32(c0, 3)
#define PLANAR_STORE_0_10() \
_mm_storel_epi64((__m128i*)(src ), c0); \
_mm_storel_epi64((__m128i*)(src + stride), _mm_unpackhi_epi64(c0, c0));\
_mm_storel_epi64((__m128i*)(src + 2 * stride), C0); \
_mm_storel_epi64((__m128i*)(src + 3 * stride), _mm_unpackhi_epi64(C0, C0))
/* Generate pred_planar_0_{8,10}_sse(): fully unrolled 4x4 planar intra
 * prediction.  top[4] / left[4] are the HEVC top-right / bottom-left
 * corner references used by the planar equation. */
#define PRED_PLANAR_0(D) \
void pred_planar_0_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \
const uint8_t *_left, ptrdiff_t stride) { \
__m128i ly, l0, tx, ly1; \
__m128i tmp1, add, x0, c0, c1, C0; \
PLANAR_INIT_ ## D(); \
tx = _mm_set1_epi16(top[4]); \
l0 = _mm_set1_epi16(left[4]); \
add = _mm_set1_epi16(4); \
tmp1 = _mm_set_epi16(0,1,2,3,0,1,2,3); \
c1 = _mm_mullo_epi16(_mm_set_epi16(4,3,2,1,4,3,2,1), tx); \
c1 = _mm_add_epi16(c1, add); \
PLANAR_LOAD_0_ ##D(); \
\
ly1 = _mm_unpacklo_epi32(ly, ly); \
PLANAR_COMPUTE_0(c0, 2, 3, 2, 1); \
ly1 = _mm_unpackhi_epi32(ly, ly); \
PLANAR_COMPUTE_0(C0, 0, 1, 4, 3); \
PLANAR_STORE_0_ ## D(); \
}
PRED_PLANAR_0( 8)
PRED_PLANAR_0(10)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Load the 8 left / 8 top reference pixels for the 8x8 case, widened to
 * 16-bit lanes (the 10-bit variant is already 16-bit). */
#define PLANAR_LOAD_1_8() \
ly = _mm_loadl_epi64((__m128i*)left); \
tx = _mm_loadl_epi64((__m128i*)top); \
ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \
tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128())
#define PLANAR_LOAD_1_10() \
ly = _mm_loadu_si128((__m128i*)left); \
tx = _mm_loadu_si128((__m128i*)top)
/* 8x8: weight (7 - y), rounding shift 4. */
#define PLANAR_COMPUTE_1() \
PLANAR_COMPUTE(7, 4)
/* Store one row, advance to the next row and shift the next left pixel
 * into lane 0 of `ly`. */
#define PLANAR_STORE_1_8() \
c0 = _mm_packus_epi16(c0,_mm_setzero_si128()); \
_mm_storel_epi64((__m128i*)(src), c0); \
src+= stride; \
ly = _mm_srli_si128(ly,2)
#define PLANAR_STORE_1_10() \
_mm_storeu_si128((__m128i*)(src), c0); \
src+= stride; \
ly = _mm_srli_si128(ly,2)
/* Generate pred_planar_1_{8,10}_sse(): 8x8 planar intra prediction,
 * one 8-pixel row per loop iteration. */
#define PRED_PLANAR_1(D) \
void pred_planar_1_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \
const uint8_t *_left, ptrdiff_t stride) { \
int y; \
__m128i ly, l0, tx, ly1; \
__m128i tmp1, add, x0, c0, c1; \
PLANAR_INIT_ ## D(); \
tx = _mm_set1_epi16(top[8]); \
l0 = _mm_set1_epi16(left[8]); \
add = _mm_set1_epi16(8); \
tmp1 = _mm_set_epi16(0,1,2,3,4,5,6,7); \
c1 = _mm_mullo_epi16(_mm_set_epi16(8,7,6,5,4,3,2,1), tx); \
c1 = _mm_add_epi16(c1,add); \
PLANAR_LOAD_1_ ## D(); \
for (y = 0; y < 8; y++) { \
PLANAR_COMPUTE_1(); \
PLANAR_STORE_1_ ## D(); \
} \
}
PRED_PLANAR_1( 8)
PRED_PLANAR_1(10)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Load 16 left / 16 top reference pixels for the 16x16 case; low half in
 * ly/tx, high half in lh/th, all widened to 16-bit lanes. */
#define PLANAR_LOAD_2_8() \
ly = _mm_loadu_si128((__m128i*) left); \
tx = _mm_loadu_si128((__m128i*) top); \
lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \
ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \
th = _mm_unpackhi_epi8(tx,_mm_setzero_si128()); \
tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128())
#define PLANAR_LOAD_2_10() \
ly = _mm_loadu_si128((__m128i*) left); \
lh = _mm_loadu_si128((__m128i*)&left[8]); \
tx = _mm_loadu_si128((__m128i*) top); \
th = _mm_loadu_si128((__m128i*)&top[8])
/* 16x16: weight (15 - y), rounding shift 5; _HI handles pixels 8..15. */
#define PLANAR_COMPUTE_2() \
PLANAR_COMPUTE(15, 5)
#define PLANAR_COMPUTE_HI_2() \
PLANAR_COMPUTE_HI(15, 5)
/* Store one 16-pixel row, advance, and rotate the next left pixel in. */
#define PLANAR_STORE_2_8() \
c0 = _mm_packus_epi16(c0, C0); \
_mm_storeu_si128((__m128i*) src, c0); \
src+= stride; \
ly = _mm_srli_si128(ly,2)
#define PLANAR_STORE_2_10() \
_mm_storeu_si128((__m128i*) src , c0); \
_mm_storeu_si128((__m128i*)&src[8], C0); \
src+= stride; \
ly = _mm_srli_si128(ly,2)
/* Generate pred_planar_2_{8,10}_sse(): 16x16 planar intra prediction.
 * The outer i-loop switches from the low 8 left pixels (ly) to the high
 * 8 (lh) after the first 8 rows. */
#define PRED_PLANAR_2(D) \
void pred_planar_2_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \
const uint8_t *_left, ptrdiff_t stride) { \
int y, i; \
__m128i ly, lh, l0, tx, th, ly1; \
__m128i tmp1, tmp2, add, x0, c0, c1, C0, C1; \
PLANAR_INIT_ ## D(); \
tx = _mm_set1_epi16(top[16]); \
l0 = _mm_set1_epi16(left[16]); \
add = _mm_set1_epi16(16); \
tmp1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15); \
tmp2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7); \
c1 = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx); \
C1 = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx); \
c1 = _mm_add_epi16(c1, add); \
C1 = _mm_add_epi16(C1, add); \
PLANAR_LOAD_2_ ## D(); \
for (i = 0; i < 2; i++) { \
for (y = i*8; y < i*8+8; y++) { \
PLANAR_COMPUTE_2(); \
PLANAR_COMPUTE_HI_2(); \
PLANAR_STORE_2_ ## D(); \
} \
ly = lh; \
} \
}
PRED_PLANAR_2( 8)
PRED_PLANAR_2(10)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Load the first 16 left and all 32 top reference pixels for the 32x32
 * case (TX/TH hold top[16..31]); everything widened to 16-bit lanes. */
#define PLANAR_LOAD_3_8() \
ly = _mm_loadu_si128((__m128i*) left); \
lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \
ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \
tx = _mm_loadu_si128((__m128i*) top); \
th = _mm_unpackhi_epi8(tx,_mm_setzero_si128()); \
tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128()); \
TX = _mm_loadu_si128((__m128i*)(top + 16)); \
TH = _mm_unpackhi_epi8(TX,_mm_setzero_si128()); \
TX = _mm_unpacklo_epi8(TX,_mm_setzero_si128())
#define PLANAR_LOAD_3_10() \
ly = _mm_loadu_si128((__m128i*) left ); \
lh = _mm_loadu_si128((__m128i*)&left[8]); \
tx = _mm_loadu_si128((__m128i*) top ); \
th = _mm_loadu_si128((__m128i*)&top[ 8]); \
TX = _mm_loadu_si128((__m128i*)&top[16]); \
TH = _mm_loadu_si128((__m128i*)&top[24])
/* Refill ly/lh with left[16..31] once the first 16 rows are done. */
#define PLANAR_RELOAD_3_8() \
ly = _mm_loadu_si128((__m128i*)(left+16)); \
lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \
ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128())
#define PLANAR_RELOAD_3_10() \
ly = _mm_loadu_si128((__m128i*)&left[16]); \
lh = _mm_loadu_si128((__m128i*)&left[24])
/* 32x32: weight (31 - y), rounding shift 6.  The four _COMPUTE macros
 * cover pixel columns 0-7, 8-15, 16-23 and 24-31 of one row. */
#define PLANAR_COMPUTE_3() \
PLANAR_COMPUTE(31, 6)
#define PLANAR_COMPUTE_HI_3() \
PLANAR_COMPUTE_HI(31, 6)
#define PLANAR_COMPUTE_HI2_3() \
c0 = _mm_mullo_epi16(TMP1, ly1); \
x0 = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TX); \
c0 = _mm_add_epi16(c0, c2); \
x0 = _mm_add_epi16(x0, c0); \
x0 = _mm_add_epi16(x0, add); \
c0 = _mm_srli_epi16(x0, 6)
#define PLANAR_COMPUTE_HI3_3() \
C0 = _mm_mullo_epi16(TMP2, ly1); \
x0 = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TH); \
C0 = _mm_add_epi16(C0, C2); \
x0 = _mm_add_epi16(x0, C0); \
x0 = _mm_add_epi16(x0, add); \
C0 = _mm_srli_epi16(x0, 6)
/* STORE1 writes columns 0-15 of a row, STORE2 writes 16-31 and advances. */
#define PLANAR_STORE1_3_8() \
c0 = _mm_packus_epi16(c0, C0); \
_mm_storeu_si128((__m128i*) src, c0)
#define PLANAR_STORE2_3_8() \
c0 = _mm_packus_epi16(c0, C0); \
_mm_storeu_si128((__m128i*) (src + 16), c0); \
src+= stride; \
ly = _mm_srli_si128(ly, 2)
#define PLANAR_STORE1_3_10() \
_mm_storeu_si128((__m128i*) src , c0); \
_mm_storeu_si128((__m128i*)&src[ 8], C0)
#define PLANAR_STORE2_3_10() \
_mm_storeu_si128((__m128i*)&src[16], c0); \
_mm_storeu_si128((__m128i*)&src[24], C0); \
src+= stride; \
ly = _mm_srli_si128(ly, 2)
/* Generate pred_planar_3_{8,10}_sse(): 32x32 planar intra prediction.
 * Four batches of 8 rows; after batch 0 the high left half (lh) takes
 * over, after batch 1 left[16..31] is reloaded. */
#define PRED_PLANAR_3(D) \
void pred_planar_3_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \
const uint8_t *_left, ptrdiff_t stride) { \
int y, i; \
__m128i l0, ly, lh, ly1, tx, th, TX, TH, tmp1, tmp2, TMP1, TMP2; \
__m128i x0, c0, c1, c2, C0, C1, C2, add; \
PLANAR_INIT_ ## D(); \
tx = _mm_set1_epi16(top[32]); \
l0 = _mm_set1_epi16(left[32]); \
add = _mm_set1_epi16(32); \
tmp1 = _mm_set_epi16(24,25,26,27,28,29,30,31); \
tmp2 = _mm_set_epi16(16,17,18,19,20,21,22,23); \
TMP1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15); \
TMP2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7); \
c1 = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx); \
C1 = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx); \
c2 = _mm_mullo_epi16(_mm_set_epi16(24,23,22,21,20,19,18,17), tx); \
C2 = _mm_mullo_epi16(_mm_set_epi16(32,31,30,29,28,27,26,25), tx); \
c1 = _mm_add_epi16(c1, add); \
C1 = _mm_add_epi16(C1, add); \
c2 = _mm_add_epi16(c2, add); \
C2 = _mm_add_epi16(C2, add); \
PLANAR_LOAD_3_ ## D(); \
for (i = 0; i < 4; i++) { \
for (y = 0+i*8; y < 8+i*8; y++) { \
PLANAR_COMPUTE_3(); \
PLANAR_COMPUTE_HI_3(); \
PLANAR_STORE1_3_ ## D(); \
PLANAR_COMPUTE_HI2_3(); \
PLANAR_COMPUTE_HI3_3(); \
PLANAR_STORE2_3_ ## D(); \
} \
if (i == 0 || i == 2) { \
ly = lh; \
} else { \
PLANAR_RELOAD_3_ ## D(); \
} \
} \
}
PRED_PLANAR_3( 8)
PRED_PLANAR_3(10)
#endif
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Scatter the transposed 8x8 byte tile held in m10..m13 to strided rows.
 * Even rows come from the low 64 bits, odd rows from the high 64 bits. */
#define STORE8(out, sstep_out) \
_mm_storel_epi64((__m128i*)&out[0*sstep_out], m10); \
_mm_storel_epi64((__m128i*)&out[2*sstep_out], m12); \
_mm_storel_epi64((__m128i*)&out[4*sstep_out], m11); \
_mm_storel_epi64((__m128i*)&out[6*sstep_out], m13); \
m10 = _mm_unpackhi_epi64(m10, m10); \
m12 = _mm_unpackhi_epi64(m12, m12); \
m11 = _mm_unpackhi_epi64(m11, m11); \
m13 = _mm_unpackhi_epi64(m13, m13); \
_mm_storel_epi64((__m128i*)&out[1*sstep_out], m10); \
_mm_storel_epi64((__m128i*)&out[3*sstep_out], m12); \
_mm_storel_epi64((__m128i*)&out[5*sstep_out], m11); \
_mm_storel_epi64((__m128i*)&out[7*sstep_out], m13)
/* Write 8 full 16-byte rows (used by the 16-bit 8x8 transpose). */
#define STORE16(out, sstep_out) \
_mm_storeu_si128((__m128i *) &out[0*sstep_out], m0); \
_mm_storeu_si128((__m128i *) &out[1*sstep_out], m1); \
_mm_storeu_si128((__m128i *) &out[2*sstep_out], m2); \
_mm_storeu_si128((__m128i *) &out[3*sstep_out], m3); \
_mm_storeu_si128((__m128i *) &out[4*sstep_out], m4); \
_mm_storeu_si128((__m128i *) &out[5*sstep_out], m5); \
_mm_storeu_si128((__m128i *) &out[6*sstep_out], m6); \
_mm_storeu_si128((__m128i *) &out[7*sstep_out], m7)
/* Transpose a 4x4 tile of 8-bit pixels via byte/word interleaves. */
#define TRANSPOSE4x4_8(in, sstep_in, out, sstep_out) \
{ \
__m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \
__m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \
__m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \
__m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \
\
__m128i m10 = _mm_unpacklo_epi8(m0, m1); \
__m128i m11 = _mm_unpacklo_epi8(m2, m3); \
\
m0 = _mm_unpacklo_epi16(m10, m11); \
\
*((uint32_t *) (out+0*sstep_out)) =_mm_cvtsi128_si32(m0); \
*((uint32_t *) (out+1*sstep_out)) =_mm_extract_epi32(m0, 1); \
*((uint32_t *) (out+2*sstep_out)) =_mm_extract_epi32(m0, 2); \
*((uint32_t *) (out+3*sstep_out)) =_mm_extract_epi32(m0, 3); \
}
/* Transpose an 8x8 tile of 8-bit pixels (classic unpack ladder). */
#define TRANSPOSE8x8_8(in, sstep_in, out, sstep_out) \
{ \
__m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \
__m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \
__m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \
__m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \
__m128i m4 = _mm_loadl_epi64((__m128i *) &in[4*sstep_in]); \
__m128i m5 = _mm_loadl_epi64((__m128i *) &in[5*sstep_in]); \
__m128i m6 = _mm_loadl_epi64((__m128i *) &in[6*sstep_in]); \
__m128i m7 = _mm_loadl_epi64((__m128i *) &in[7*sstep_in]); \
\
__m128i m10 = _mm_unpacklo_epi8(m0, m1); \
__m128i m11 = _mm_unpacklo_epi8(m2, m3); \
__m128i m12 = _mm_unpacklo_epi8(m4, m5); \
__m128i m13 = _mm_unpacklo_epi8(m6, m7); \
\
m0 = _mm_unpacklo_epi16(m10, m11); \
m1 = _mm_unpacklo_epi16(m12, m13); \
m2 = _mm_unpackhi_epi16(m10, m11); \
m3 = _mm_unpackhi_epi16(m12, m13); \
\
m10 = _mm_unpacklo_epi32(m0 , m1 ); \
m11 = _mm_unpacklo_epi32(m2 , m3 ); \
m12 = _mm_unpackhi_epi32(m0 , m1 ); \
m13 = _mm_unpackhi_epi32(m2 , m3 ); \
\
STORE8(out, sstep_out); \
}
/* Larger transposes iterate 8x8 tiles; both macros have identical bodies
 * because the size is carried entirely by sstep_in (16 or 32).
 * They require `int x, y;` in the expanding scope. */
#define TRANSPOSE16x16_8(in, sstep_in, out, sstep_out) \
for (y = 0; y < sstep_in; y+=8) \
for (x = 0; x < sstep_in; x+=8) \
TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
#define TRANSPOSE32x32_8(in, sstep_in, out, sstep_out) \
for (y = 0; y < sstep_in; y+=8) \
for (x = 0; x < sstep_in; x+=8) \
TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Transpose a 4x4 tile of 16-bit pixels via word/dword interleaves. */
#define TRANSPOSE4x4_10(in, sstep_in, out, sstep_out) \
{ \
__m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \
__m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \
__m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \
__m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \
\
__m128i m10 = _mm_unpacklo_epi16(m0, m1); \
__m128i m11 = _mm_unpacklo_epi16(m2, m3); \
\
m0 = _mm_unpacklo_epi32(m10, m11); \
m1 = _mm_unpackhi_epi32(m10, m11); \
\
_mm_storel_epi64((__m128i *) (out+0*sstep_out) , m0); \
_mm_storel_epi64((__m128i *) (out+1*sstep_out) , _mm_unpackhi_epi64(m0, m0));\
_mm_storel_epi64((__m128i *) (out+2*sstep_out) , m1); \
_mm_storel_epi64((__m128i *) (out+3*sstep_out) , _mm_unpackhi_epi64(m1, m1));\
}
/* Transpose an 8x8 tile of 16-bit pixels (word -> dword -> qword ladder). */
#define TRANSPOSE8x8_10(in, sstep_in, out, sstep_out) \
{ \
__m128i tmp0, tmp1, tmp2, tmp3, src0, src1, src2, src3; \
__m128i m0 = _mm_loadu_si128((__m128i *) &in[0*sstep_in]); \
__m128i m1 = _mm_loadu_si128((__m128i *) &in[1*sstep_in]); \
__m128i m2 = _mm_loadu_si128((__m128i *) &in[2*sstep_in]); \
__m128i m3 = _mm_loadu_si128((__m128i *) &in[3*sstep_in]); \
__m128i m4 = _mm_loadu_si128((__m128i *) &in[4*sstep_in]); \
__m128i m5 = _mm_loadu_si128((__m128i *) &in[5*sstep_in]); \
__m128i m6 = _mm_loadu_si128((__m128i *) &in[6*sstep_in]); \
__m128i m7 = _mm_loadu_si128((__m128i *) &in[7*sstep_in]); \
\
tmp0 = _mm_unpacklo_epi16(m0, m1); \
tmp1 = _mm_unpacklo_epi16(m2, m3); \
tmp2 = _mm_unpacklo_epi16(m4, m5); \
tmp3 = _mm_unpacklo_epi16(m6, m7); \
src0 = _mm_unpacklo_epi32(tmp0, tmp1); \
src1 = _mm_unpacklo_epi32(tmp2, tmp3); \
src2 = _mm_unpackhi_epi32(tmp0, tmp1); \
src3 = _mm_unpackhi_epi32(tmp2, tmp3); \
tmp0 = _mm_unpackhi_epi16(m0, m1); \
tmp1 = _mm_unpackhi_epi16(m2, m3); \
tmp2 = _mm_unpackhi_epi16(m4, m5); \
tmp3 = _mm_unpackhi_epi16(m6, m7); \
m0 = _mm_unpacklo_epi64(src0 , src1); \
m1 = _mm_unpackhi_epi64(src0 , src1); \
m2 = _mm_unpacklo_epi64(src2 , src3); \
m3 = _mm_unpackhi_epi64(src2 , src3); \
src0 = _mm_unpacklo_epi32(tmp0, tmp1); \
src1 = _mm_unpacklo_epi32(tmp2, tmp3); \
src2 = _mm_unpackhi_epi32(tmp0, tmp1); \
src3 = _mm_unpackhi_epi32(tmp2, tmp3); \
m4 = _mm_unpacklo_epi64(src0 , src1); \
m5 = _mm_unpackhi_epi64(src0 , src1); \
m6 = _mm_unpacklo_epi64(src2 , src3); \
m7 = _mm_unpackhi_epi64(src2 , src3); \
STORE16(out, sstep_out); \
}
/* Tile-wise transposes for the 16-bit path; bodies identical, sstep_in
 * carries the size.  Require `int x, y;` in the expanding scope. */
#define TRANSPOSE16x16_10(in, sstep_in, out, sstep_out) \
for (y = 0; y < sstep_in; y+=8) \
for (x = 0; x < sstep_in; x+=8) \
TRANSPOSE8x8_10((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
#define TRANSPOSE32x32_10(in, sstep_in, out, sstep_out) \
for (y = 0; y < sstep_in; y+=8) \
for (x = 0; x < sstep_in; x+=8) \
TRANSPOSE8x8_10((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* One row of angular prediction, 8-bit path: interpolates between
 * ref[idx+1+x] and ref[idx+2+x] with weight fact/32.  maddubs computes
 * (32-fact)*a + fact*b per pixel; mulhrs by 1024 is a rounding >> 5. */
#define ANGULAR_COMPUTE_8(W) \
for (x = 0; x < W; x += 8) { \
r3 = _mm_set1_epi16((fact << 8) + (32 - fact)); \
r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \
r0 = _mm_srli_si128(r1, 1); \
r1 = _mm_unpacklo_epi8(r1, r0); \
r1 = _mm_maddubs_epi16(r1, r3); \
r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \
r1 = _mm_packus_epi16(r1, r1); \
_mm_storel_epi64((__m128i *) &p_src[x], r1); \
}
/* 4-wide variant: same interpolation, stores a single dword. */
#define ANGULAR_COMPUTE4_8() \
r3 = _mm_set1_epi16((fact << 8) + (32 - fact)); \
r1 = _mm_loadu_si128((__m128i*)(&ref[idx+1])); \
r0 = _mm_srli_si128(r1, 1); \
r1 = _mm_unpacklo_epi8(r1, r0); \
r1 = _mm_maddubs_epi16(r1, r3); \
r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \
r1 = _mm_packus_epi16(r1, r1); \
*((uint32_t *)p_src) = _mm_cvtsi128_si32(r1)
#define ANGULAR_COMPUTE8_8() ANGULAR_COMPUTE_8( 8)
#define ANGULAR_COMPUTE16_8() ANGULAR_COMPUTE_8(16)
#define ANGULAR_COMPUTE32_8() ANGULAR_COMPUTE_8(32)
/* fact == 0 fast path: the prediction is a straight copy of the refs. */
#define ANGULAR_COMPUTE_ELSE4_8() \
r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]); \
*((uint32_t *)p_src) = _mm_cvtsi128_si32(r1)
#define ANGULAR_COMPUTE_ELSE8_8() \
r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]); \
_mm_storel_epi64((__m128i *) p_src, r1)
#define ANGULAR_COMPUTE_ELSE16_8() \
r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]); \
_mm_storeu_si128((__m128i *) p_src, r1)
#define ANGULAR_COMPUTE_ELSE32_8() \
r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]); \
_mm_storeu_si128((__m128i *) p_src ,r1); \
r1 = _mm_loadu_si128((__m128i*) &ref[idx+17]); \
_mm_storeu_si128((__m128i *)&p_src[16] ,r1)
/* Edge smoothing used for pure-vertical/horizontal luma modes:
 * out = src2[0] + (src1[i] - src1[-1]) >> 1, computed on 8 pixels. */
#define CLIP_PIXEL(src1, src2) \
r3 = _mm_loadu_si128((__m128i*)src1); \
r1 = _mm_set1_epi16(src1[-1]); \
r2 = _mm_set1_epi16(src2[0]); \
r0 = _mm_unpacklo_epi8(r3,_mm_setzero_si128()); \
r0 = _mm_subs_epi16(r0, r1); \
r0 = _mm_srai_epi16(r0, 1); \
r0 = _mm_add_epi16(r0, r2)
#define CLIP_PIXEL_HI() \
r3 = _mm_unpackhi_epi8(r3,_mm_setzero_si128()); \
r3 = _mm_subs_epi16(r3, r1); \
r3 = _mm_srai_epi16(r3, 1); \
r3 = _mm_add_epi16(r3, r2)
/* CLIP_PIXEL1_*: rewrite the first output column (vertical mode 26). */
#define CLIP_PIXEL1_4_8() \
p_src = src; \
CLIP_PIXEL(src2, src1); \
r0 = _mm_packus_epi16(r0, r0); \
*((char *) p_src) = _mm_extract_epi8(r0, 0); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 1); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 2); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 3)
#define CLIP_PIXEL1_8_8() \
CLIP_PIXEL1_4_8(); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 4); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 5); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 6); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 7)
#define CLIP_PIXEL1_16_8() \
p_src = src; \
CLIP_PIXEL(src2, src1); \
CLIP_PIXEL_HI(); \
r0 = _mm_packus_epi16(r0, r3); \
*((char *) p_src) = _mm_extract_epi8(r0, 0); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 1); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 2); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 3); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 4); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 5); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 6); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 7); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 8); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0, 9); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,10); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,11); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,12); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,13); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,14); \
p_src += stride; \
*((char *) p_src) = _mm_extract_epi8(r0,15)
/* Intentionally empty: no edge filtering is applied at 32x32. */
#define CLIP_PIXEL1_32_8()
/* CLIP_PIXEL2_*: rewrite the first output row (horizontal mode 10). */
#define CLIP_PIXEL2_4_8() \
CLIP_PIXEL(src2, src1); \
r0 = _mm_packus_epi16(r0, r0); \
*((uint32_t *)_src) = _mm_cvtsi128_si32(r0)
#define CLIP_PIXEL2_8_8() \
CLIP_PIXEL(src2, src1); \
r0 = _mm_packus_epi16(r0, r0); \
_mm_storel_epi64((__m128i*)_src, r0)
#define CLIP_PIXEL2_16_8() \
CLIP_PIXEL(src2, src1); \
CLIP_PIXEL_HI(); \
r0 = _mm_packus_epi16(r0, r3); \
_mm_storeu_si128((__m128i*) _src , r0)
#define CLIP_PIXEL2_32_8()
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
#if HAVE_SSE4
/* One row of angular prediction, 10-bit path: 16-bit lanes, madd_epi16
 * does (32-fact)*a + fact*b; mulhrs by 1024 is a rounding >> 5; pairs are
 * repacked with the SSE4/fallback _MM_PACKUS_EPI32. */
#define ANGULAR_COMPUTE_10(W) \
for (x = 0; x < W; x += 4) { \
r3 = _mm_set1_epi32((fact << 16) + (32 - fact)); \
r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \
r0 = _mm_srli_si128(r1, 2); \
r1 = _mm_unpacklo_epi16(r1, r0); \
r1 = _mm_madd_epi16(r1, r3); \
r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \
r1 = _MM_PACKUS_EPI32(r1, r1); \
_mm_storel_epi64((__m128i *) &p_src[x], r1); \
}
#define ANGULAR_COMPUTE4_10() ANGULAR_COMPUTE_10( 4)
#define ANGULAR_COMPUTE8_10() ANGULAR_COMPUTE_10( 8)
#define ANGULAR_COMPUTE16_10() ANGULAR_COMPUTE_10(16)
#define ANGULAR_COMPUTE32_10() ANGULAR_COMPUTE_10(32)
/* fact == 0 fast path: straight copy of the reference samples. */
#define ANGULAR_COMPUTE_ELSE_10(W) \
for (x = 0; x < W; x += 8) { \
r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \
_mm_storeu_si128((__m128i *) &p_src[x], r1); \
}
#define ANGULAR_COMPUTE_ELSE4_10() \
r1 = _mm_loadl_epi64((__m128i*)(&ref[idx+1])); \
_mm_storel_epi64((__m128i *) p_src, r1)
#define ANGULAR_COMPUTE_ELSE8_10() ANGULAR_COMPUTE_ELSE_10(8)
#define ANGULAR_COMPUTE_ELSE16_10() ANGULAR_COMPUTE_ELSE_10(16)
#define ANGULAR_COMPUTE_ELSE32_10() ANGULAR_COMPUTE_ELSE_10(32)
/* Edge smoothing, 16-bit lanes: out = src1[0] + (src2[i] - src2[-1]) >> 1.
 * Results are clamped to [0, 1023] by the CLIP_PIXEL1/2 users below. */
#define CLIP_PIXEL_10() \
r0 = _mm_loadu_si128((__m128i*)src2); \
r1 = _mm_set1_epi16(src2[-1]); \
r2 = _mm_set1_epi16(src1[0]); \
r0 = _mm_subs_epi16(r0, r1); \
r0 = _mm_srai_epi16(r0, 1); \
r0 = _mm_add_epi16(r0, r2)
#define CLIP_PIXEL_HI_10() \
r3 = _mm_loadu_si128((__m128i*)&src2[8]); \
r3 = _mm_subs_epi16(r3, r1); \
r3 = _mm_srai_epi16(r3, 1); \
r3 = _mm_add_epi16(r3, r2)
/* CLIP_PIXEL1_*: rewrite the first output column (vertical mode 26). */
#define CLIP_PIXEL1_4_10() \
p_src = src; \
CLIP_PIXEL_10(); \
r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \
r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 0); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 1); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 2); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 3)
#define CLIP_PIXEL1_8_10() \
CLIP_PIXEL1_4_10(); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 4); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 5); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 6); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 7)
#define CLIP_PIXEL1_16_10() \
p_src = src; \
CLIP_PIXEL_10(); \
CLIP_PIXEL_HI_10(); \
r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \
r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \
r3 = _mm_max_epi16(r3, _mm_setzero_si128()); \
r3 = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff)); \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 0); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 1); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 2); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 3); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 4); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 5); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 6); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r0, 7); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 0); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 1); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 2); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 3); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 4); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 5); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 6); \
p_src += stride; \
*((uint16_t *) p_src) = _mm_extract_epi16(r3, 7)
/* Intentionally empty: no edge filtering is applied at 32x32. */
#define CLIP_PIXEL1_32_10()
/* CLIP_PIXEL2_*: rewrite the first output row (horizontal mode 10). */
#define CLIP_PIXEL2_4_10() \
CLIP_PIXEL_10(); \
r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \
r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \
_mm_storel_epi64((__m128i*) _src , r0)
#define CLIP_PIXEL2_8_10() \
CLIP_PIXEL_10(); \
r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \
r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \
_mm_storeu_si128((__m128i*) _src , r0)
#define CLIP_PIXEL2_16_10() \
CLIP_PIXEL_10(); \
CLIP_PIXEL_HI_10(); \
r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \
r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \
r3 = _mm_max_epi16(r3, _mm_setzero_si128()); \
r3 = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff)); \
_mm_storeu_si128((__m128i*) p_out , r0); \
_mm_storeu_si128((__m128i*) &p_out[8], r3);
#define CLIP_PIXEL2_32_10()
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
/* Set up pointers for one angular prediction:
 * modes >= 18 (near-vertical) predict directly into _src from the top row;
 * modes < 18 (near-horizontal) predict into the stack buffer src_tmp with
 * top/left swapped and transpose into _src afterwards.
 * `ref` points one element before the primary reference array. */
#define PRED_ANGULAR_INIT_8(W) \
const uint8_t *src1; \
const uint8_t *src2; \
uint8_t *ref, *p_src, *src, *p_out; \
uint8_t src_tmp[W*W]; \
if (mode >= 18) { \
src1 = (const uint8_t*) _top; \
src2 = (const uint8_t*) _left; \
src = (uint8_t*) _src; \
stride = _stride; \
p_src = src; \
} else { \
src1 = (const uint8_t*) _left; \
src2 = (const uint8_t*) _top; \
src = &src_tmp[0]; \
stride = W; \
p_src = src; \
} \
p_out = (uint8_t*) _src; \
ref = (uint8_t*) (src1 - 1)
#define PRED_ANGULAR_INIT_10(W) \
const uint16_t *src1; \
const uint16_t *src2; \
uint16_t *ref, *p_src, *src, *p_out; \
uint16_t src_tmp[W*W]; \
if (mode >= 18) { \
src1 = (const uint16_t*) _top; \
src2 = (const uint16_t*) _left; \
src = (uint16_t*) _src; \
stride = _stride; \
p_src = src; \
} else { \
src1 = (const uint16_t*) _left; \
src2 = (const uint16_t*) _top; \
src = &src_tmp[0]; \
stride = W; \
p_src = src; \
} \
p_out = (uint16_t*) _src; \
ref = (uint16_t*) (src1 - 1)
/* Per-size scratch variable declarations ("WAR" = workaround) so each
 * generated function only declares what its compute macros use. */
#define PRED_ANGULAR_WAR() \
int y; \
__m128i r0, r1, r3
#define PRED_ANGULAR_WAR4_8() \
PRED_ANGULAR_WAR(); \
__m128i r2
#define PRED_ANGULAR_WAR8_8() \
PRED_ANGULAR_WAR4_8(); \
int x
#define PRED_ANGULAR_WAR16_8() \
PRED_ANGULAR_WAR8_8()
#define PRED_ANGULAR_WAR32_8() \
PRED_ANGULAR_WAR(); \
int x
#define PRED_ANGULAR_WAR4_10() PRED_ANGULAR_WAR8_8()
#define PRED_ANGULAR_WAR8_10() PRED_ANGULAR_WAR8_8()
#define PRED_ANGULAR_WAR16_10() PRED_ANGULAR_WAR16_8()
#define PRED_ANGULAR_WAR32_10() PRED_ANGULAR_WAR32_8()
/* Generate pred_angular_{W}_{D}_sse(): HEVC angular intra prediction for
 * a WxW block.  For negative angles the reference is extended to the left
 * using the inverse-angle projection of the secondary reference.
 * NOTE(review): that extension writes through `ref`, i.e. into memory
 * before src1 with const cast away — relies on the caller providing
 * writable headroom in front of the reference arrays; confirm callers.
 * NOTE(review): the two lookup tables could be `static const` to avoid
 * re-initialisation on every call. */
#define PRED_ANGULAR(W, D) \
static av_always_inline void pred_angular_ ## W ##_ ## D ## _sse(uint8_t *_src,\
const uint8_t *_top, const uint8_t *_left, ptrdiff_t _stride, int c_idx, int mode) {\
const int intra_pred_angle[] = { \
32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9,-13,-17,-21,-26, \
-32,-26,-21,-17,-13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 \
}; \
const int inv_angle[] = { \
-4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, \
-630, -910, -1638, -4096 \
}; \
PRED_ANGULAR_WAR ## W ## _ ## D(); \
int angle = intra_pred_angle[mode-2]; \
int angle_i = angle; \
int last = (W * angle) >> 5; \
int stride; \
PRED_ANGULAR_INIT_ ## D(W); \
if (angle < 0 && last < -1) { \
for (y = last; y <= -1; y++) \
ref[y] = src2[-1 + ((y * inv_angle[mode-11] + 128) >> 8)]; \
} \
for (y = 0; y < W; y++) { \
int idx = (angle_i) >> 5; \
int fact = (angle_i) & 31; \
if (fact) { \
ANGULAR_COMPUTE ## W ## _ ## D(); \
} else { \
ANGULAR_COMPUTE_ELSE ## W ## _ ## D(); \
} \
angle_i += angle; \
p_src += stride; \
} \
if (mode >= 18) { \
if (mode == 26 && c_idx == 0) { \
CLIP_PIXEL1_ ## W ## _ ## D(); \
} \
} else { \
TRANSPOSE ## W ## x ## W ## _ ## D(src_tmp, W, p_out, _stride); \
if (mode == 10 && c_idx == 0) { \
CLIP_PIXEL2_ ## W ## _ ## D(); \
} \
} \
}
/* Instantiate the angular predictors for all four block sizes at both
 * supported bit depths (8-bit and 10-bit). */
PRED_ANGULAR( 4, 8)
PRED_ANGULAR( 8, 8)
PRED_ANGULAR(16, 8)
PRED_ANGULAR(32, 8)
PRED_ANGULAR( 4,10)
PRED_ANGULAR( 8,10)
PRED_ANGULAR(16,10)
PRED_ANGULAR(32,10)
/*
 * Public entry points matching the HEVCPredContext pred_angular[] slots.
 * The slot index encodes log2(size) - 2, so index 0..3 forwards to the
 * macro-generated 4x4 .. 32x32 kernels of the matching bit depth.
 */
void pred_angular_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                          ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_4_8_sse(_src, _top, _left, _stride, c_idx, mode);   /* 4x4  */
}
void pred_angular_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                          ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_8_8_sse(_src, _top, _left, _stride, c_idx, mode);   /* 8x8  */
}
void pred_angular_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                          ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_16_8_sse(_src, _top, _left, _stride, c_idx, mode);  /* 16x16 */
}
void pred_angular_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                          ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_32_8_sse(_src, _top, _left, _stride, c_idx, mode);  /* 32x32 */
}
void pred_angular_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                           ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_4_10_sse(_src, _top, _left, _stride, c_idx, mode);  /* 4x4  */
}
void pred_angular_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                           ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_8_10_sse(_src, _top, _left, _stride, c_idx, mode);  /* 8x8  */
}
void pred_angular_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                           ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_16_10_sse(_src, _top, _left, _stride, c_idx, mode); /* 16x16 */
}
void pred_angular_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
                           ptrdiff_t _stride, int c_idx, int mode)
{
    pred_angular_32_10_sse(_src, _top, _left, _stride, c_idx, mode); /* 32x32 */
}
#endif
#ifdef __GNUC__
#pragma GCC pop_options
#endif

View File

@ -263,4 +263,24 @@ void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
/* In-place 4x4 luma DST-style transforms for each supported bit depth. */
void ff_hevc_transform_4x4_luma_8_sse2(int16_t *coeffs);
void ff_hevc_transform_4x4_luma_10_sse2(int16_t *coeffs);
void ff_hevc_transform_4x4_luma_12_sse2(int16_t *coeffs);
/* Declare the ff_hevc_transform_<s>x<s>_<b>_sse2() inverse transform
 * prototypes for size s and bit depth b. */
#define IDCT_FUNC(s, b) void ff_hevc_transform_ ## s ## x ## s ##_## b ##_sse2\
(int16_t *coeffs, int col_limit);
IDCT_FUNC(4, 8)
IDCT_FUNC(4, 10)
IDCT_FUNC(4, 12)
IDCT_FUNC(8, 8)
IDCT_FUNC(8, 10)
IDCT_FUNC(8, 12)
IDCT_FUNC(16, 8)
IDCT_FUNC(16, 10)
IDCT_FUNC(16, 12)
IDCT_FUNC(32, 8)
IDCT_FUNC(32, 10)
IDCT_FUNC(32, 12)
#endif // AVCODEC_X86_HEVCDSP_H

View File

@ -835,6 +835,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
/* intrinsics */
c->transform_4x4_luma = ff_hevc_transform_4x4_luma_8_sse2;
if (!ARCH_X86_64) {
c->idct[2] = ff_hevc_transform_16x16_8_sse2;
c->idct[3] = ff_hevc_transform_32x32_8_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if(ARCH_X86_64) {
@ -1010,6 +1017,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
/* intrinsics */
c->transform_4x4_luma = ff_hevc_transform_4x4_luma_10_sse2;
if (!ARCH_X86_64) {
c->idct[2] = ff_hevc_transform_16x16_10_sse2;
c->idct[3] = ff_hevc_transform_32x32_10_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@ -1215,6 +1229,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
/* intrinsics */
c->transform_4x4_luma = ff_hevc_transform_4x4_luma_12_sse2;
c->idct[0] = ff_hevc_transform_4x4_12_sse2;
c->idct[1] = ff_hevc_transform_8x8_12_sse2;
c->idct[2] = ff_hevc_transform_16x16_12_sse2;
c->idct[3] = ff_hevc_transform_32x32_12_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
@ -1252,3 +1273,37 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
}
}
}
#include "libavcodec/hevcpred.h"
#include "libavcodec/x86/hevcpred.h"
#undef FUNC
/* Map a predictor base name + bit depth to the _sse symbol name. */
#define FUNC(a, depth) a ## _ ## depth ## _sse
/* Install all four planar and all four angular predictors for one depth. */
#define HEVC_PRED(depth) \
hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
hpc->pred_angular[3] = FUNC(pred_angular_3, depth)
/*
 * Hook the SSE4 intra predictors into the HEVC prediction context.
 * Only 8- and 10-bit depths have SIMD implementations, and only when the
 * runtime CPU reports SSE4 support.  The whole body is compiled out under
 * MSVC (#ifndef _MSC_VER), so MSVC builds keep the C predictors —
 * presumably a compiler-compatibility workaround; confirm before removing.
 */
void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth)
{
int mm_flags = av_get_cpu_flags();
#ifndef _MSC_VER
if (bit_depth == 8) {
if (EXTERNAL_SSE4(mm_flags)) {
HEVC_PRED(8);
}
}
if (bit_depth == 10) {
if (EXTERNAL_SSE4(mm_flags)) {
HEVC_PRED(10);
}
}
#endif
}

24
libavcodec/x86/hevcpred.h Normal file
View File

@ -0,0 +1,24 @@
#ifndef AVCODEC_X86_HEVCPRED_H
#define AVCODEC_X86_HEVCPRED_H

/* Fix: make the header self-contained — it uses uint8_t and ptrdiff_t,
 * which require <stdint.h> and <stddef.h> respectively. */
#include <stddef.h>
#include <stdint.h>

/*
 * SSE4-optimized HEVC intra predictors.
 * The numeric infix 0..3 selects the block size (4x4, 8x8, 16x16, 32x32);
 * the _8/_10 suffix selects the bit depth.  All pixel pointers are passed
 * as uint8_t* regardless of depth and cast internally.
 */
void pred_planar_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_angular_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_planar_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_planar_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
void pred_angular_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
void pred_angular_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
#endif // AVCODEC_X86_HEVCPRED_H

View File

@ -801,5 +801,5 @@ const FFOutputFormat ff_pulse_muxer = {
.p.flags = AVFMT_NOFILE,
#endif
.p.priv_class = &pulse_muxer_class,
.flags_internal = FF_FMT_ALLOW_FLUSH,
.flags_internal = FF_OFMT_FLAG_ALLOW_FLUSH,
};

View File

@ -6,5 +6,6 @@ OBJS-$(CONFIG_DNN) += dnn/dnn_backend_common.o
DNN-OBJS-$(CONFIG_LIBTENSORFLOW) += dnn/dnn_backend_tf.o
DNN-OBJS-$(CONFIG_LIBOPENVINO) += dnn/dnn_backend_openvino.o
DNN-OBJS-$(CONFIG_LIBTORCH) += dnn/dnn_backend_torch.o
OBJS-$(CONFIG_DNN) += $(DNN-OBJS-yes)

View File

@ -0,0 +1,597 @@
/*
* Copyright (c) 2024
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* DNN Torch backend implementation.
*/
#include <torch/torch.h>
#include <torch/script.h>
extern "C" {
#include "../internal.h"
#include "dnn_io_proc.h"
#include "dnn_backend_common.h"
#include "libavutil/opt.h"
#include "queue.h"
#include "safe_queue.h"
}
/* Per-model user options parsed from the filter's option string. */
typedef struct THOptions{
    char *device_name;   /* requested execution device; only "cpu" is accepted */
    int optimize;        /* non-zero: enable the torch::jit graph-executor optimizer */
} THOptions;

/* Option/logging context embedded in every THModel. */
typedef struct THContext {
    const AVClass *c_class;
    THOptions options;
} THContext;

/* Backend-private model state stored behind DNNModel.model. */
typedef struct THModel {
    THContext ctx;
    DNNModel *model;                 /* back-pointer to the public model */
    torch::jit::Module *jit_model;   /* loaded TorchScript module */
    SafeQueue *request_queue;        /* pool of reusable THRequestItems */
    Queue *task_queue;               /* pending/finished TaskItems */
    Queue *lltask_queue;             /* last-level tasks awaiting inference */
} THModel;

/* Input/output tensors owned by one inference request. */
typedef struct THInferRequest {
    torch::Tensor *output;
    torch::Tensor *input_tensor;
} THInferRequest;

/* One schedulable request: the tensors, the lltask being served, and the
 * async-exec plumbing (execution is synchronous in this backend). */
typedef struct THRequestItem {
    THInferRequest *infer_request;
    LastLevelTaskItem *lltask;
    DNNAsyncExecModule exec_module;
} THRequestItem;

#define OFFSET(x) offsetof(THContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM
/* AVOption table backing the dnn_th AVClass. */
static const AVOption dnn_th_options[] = {
    { "device", "device to run model", OFFSET(options.device_name), AV_OPT_TYPE_STRING, { .str = "cpu" }, 0, 0, FLAGS },
    { "optimize", "turn on graph executor optimization", OFFSET(options.optimize), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS},
    { NULL }
};

AVFILTER_DEFINE_CLASS(dnn_th);
/* Wrap @task in a LastLevelTaskItem and append it to @lltask_queue.
 * Torch models run as a single inference, so todo/done start at 1/0.
 * Returns 0 on success, AVERROR(ENOMEM) on allocation/queue failure. */
static int extract_lltask_from_task(TaskItem *task, Queue *lltask_queue)
{
    THModel *priv = (THModel *)task->model;
    THContext *log_ctx = &priv->ctx;
    LastLevelTaskItem *wrapper = (LastLevelTaskItem *)av_malloc(sizeof(*wrapper));

    if (!wrapper) {
        av_log(log_ctx, AV_LOG_ERROR, "Failed to allocate memory for LastLevelTaskItem\n");
        return AVERROR(ENOMEM);
    }

    task->inference_todo = 1;
    task->inference_done = 0;
    wrapper->task = task;

    if (ff_queue_push_back(lltask_queue, wrapper) >= 0)
        return 0;

    av_log(log_ctx, AV_LOG_ERROR, "Failed to push back lltask_queue.\n");
    av_freep(&wrapper);
    return AVERROR(ENOMEM);
}
/* Release the tensors held by @request (NULL-safe); the request struct
 * itself is not freed. */
static void th_free_request(THInferRequest *request)
{
    if (!request)
        return;
    /* deleting a null pointer is a no-op in C++, so no guards needed */
    delete request->output;
    request->output = NULL;
    delete request->input_tensor;
    request->input_tensor = NULL;
}
/* Fully destroy a request item: tensors, lltask wrapper, async module,
 * then the item itself. *arg is reset to NULL; NULL-safe. */
static inline void destroy_request_item(THRequestItem **arg)
{
    THRequestItem *item = arg ? *arg : NULL;

    if (!item)
        return;

    th_free_request(item->infer_request);
    av_freep(&item->infer_request);
    av_freep(&item->lltask);
    ff_dnn_async_module_cleanup(&item->exec_module);
    av_freep(arg);
}
/**
 * Free a Torch model and everything it owns: pooled inference requests,
 * queued last-level task wrappers, queued tasks (with their frames), the
 * jit module and the option storage. *model is reset to NULL via av_freep().
 */
static void dnn_free_model_th(DNNModel **model)
{
    THModel *th_model;
    if (!model || !*model)
        return;
    th_model = (THModel *) (*model)->model;
    /* drain and destroy the pooled inference requests */
    while (ff_safe_queue_size(th_model->request_queue) != 0) {
        THRequestItem *item = (THRequestItem *)ff_safe_queue_pop_front(th_model->request_queue);
        destroy_request_item(&item);
    }
    ff_safe_queue_destroy(th_model->request_queue);
    /* lltask wrappers only reference TaskItems; free just the wrappers */
    while (ff_queue_size(th_model->lltask_queue) != 0) {
        LastLevelTaskItem *item = (LastLevelTaskItem *)ff_queue_pop_front(th_model->lltask_queue);
        av_freep(&item);
    }
    ff_queue_destroy(th_model->lltask_queue);
    /* task items own their input and output frames */
    while (ff_queue_size(th_model->task_queue) != 0) {
        TaskItem *item = (TaskItem *)ff_queue_pop_front(th_model->task_queue);
        av_frame_free(&item->in_frame);
        av_frame_free(&item->out_frame);
        av_freep(&item);
    }
    ff_queue_destroy(th_model->task_queue);
    delete th_model->jit_model;
    av_opt_free(&th_model->ctx);
    av_freep(&th_model);
    av_freep(model);
}
/* Describe the input the Torch backend expects. TorchScript modules carry
 * no static shape information, so report a 1x3 NCHW float layout with
 * unknown (-1) spatial dimensions; model and input_name are ignored. */
static int get_input_th(void *model, DNNData *input, const char *input_name)
{
    input->dt = DNN_FLOAT;
    input->order = DCO_RGB;
    input->layout = DL_NCHW;
    input->dims[0] = 1;            /* batch */
    input->dims[1] = 3;            /* channels */
    input->dims[2] = input->dims[3] = -1;  /* height/width unknown */
    return 0;
}
/* Deleter handed to torch::from_blob(): frees the av_malloc'ed input
 * buffer once the tensor releases it. */
static void deleter(void *arg)
{
    av_freep(&arg);
}
/**
 * Pop the next last-level task and build the request's input tensor from
 * the task's input frame.
 *
 * The float buffer allocated here is passed to torch::from_blob() with
 * deleter() as custom deallocator, so ownership moves to the tensor.
 *
 * @return 0 on success, a negative AVERROR code on failure (the request's
 *         tensors are released via the err path).
 */
static int fill_model_input_th(THModel *th_model, THRequestItem *request)
{
    LastLevelTaskItem *lltask = NULL;
    TaskItem *task = NULL;
    THInferRequest *infer_request = NULL;
    DNNData input = { 0 };
    THContext *ctx = &th_model->ctx;
    int ret, width_idx, height_idx, channel_idx;

    lltask = (LastLevelTaskItem *)ff_queue_pop_front(th_model->lltask_queue);
    if (!lltask) {
        ret = AVERROR(EINVAL);
        goto err;
    }
    request->lltask = lltask;
    task = lltask->task;
    infer_request = request->infer_request;

    ret = get_input_th(th_model, &input, NULL);
    if ( ret != 0) {
        goto err;
    }
    width_idx = dnn_get_width_idx_by_layout(input.layout);
    height_idx = dnn_get_height_idx_by_layout(input.layout);
    channel_idx = dnn_get_channel_idx_by_layout(input.layout);
    input.dims[height_idx] = task->in_frame->height;
    input.dims[width_idx] = task->in_frame->width;
    input.data = av_malloc(input.dims[height_idx] * input.dims[width_idx] *
                           input.dims[channel_idx] * sizeof(float));
    if (!input.data) {
        /* fix: previously returned directly, bypassing the err cleanup
         * every other failure path uses */
        ret = AVERROR(ENOMEM);
        goto err;
    }
    infer_request->input_tensor = new torch::Tensor();
    infer_request->output = new torch::Tensor();

    switch (th_model->model->func_type) {
    case DFT_PROCESS_FRAME:
        input.scale = 255;
        if (task->do_ioproc) {
            if (th_model->model->frame_pre_proc != NULL) {
                th_model->model->frame_pre_proc(task->in_frame, &input, th_model->model->filter_ctx);
            } else {
                ff_proc_from_frame_to_dnn(task->in_frame, &input, ctx);
            }
        }
        break;
    default:
        avpriv_report_missing_feature(NULL, "model function type %d", th_model->model->func_type);
        break;
    }
    /* the tensor takes ownership of input.data (freed by deleter()) */
    *infer_request->input_tensor = torch::from_blob(input.data,
        {1, input.dims[channel_idx], input.dims[height_idx], input.dims[width_idx]},
        deleter, torch::kFloat32);
    return 0;

err:
    th_free_request(infer_request);
    return ret;
}
/**
 * Run one synchronous forward pass for the request in @p args
 * (a THRequestItem). Autograd is disabled for the whole call.
 *
 * @return 0 on success, AVERROR(EINVAL) on a NULL request,
 *         DNN_GENERIC_ERROR on missing tensors or a Torch failure.
 */
static int th_start_inference(void *args)
{
    THRequestItem *request = (THRequestItem *)args;
    THInferRequest *infer_request = NULL;
    LastLevelTaskItem *lltask = NULL;
    TaskItem *task = NULL;
    THModel *th_model = NULL;
    THContext *ctx = NULL;
    std::vector<torch::jit::IValue> inputs;
    torch::NoGradGuard no_grad;  /* inference only: no autograd bookkeeping */

    if (!request) {
        av_log(NULL, AV_LOG_ERROR, "THRequestItem is NULL\n");
        return AVERROR(EINVAL);
    }
    infer_request = request->infer_request;
    lltask = request->lltask;
    task = lltask->task;
    th_model = (THModel *)task->model;
    ctx = &th_model->ctx;

    if (ctx->options.optimize)
        torch::jit::setGraphExecutorOptimize(true);
    else
        torch::jit::setGraphExecutorOptimize(false);

    if (!infer_request->input_tensor || !infer_request->output) {
        av_log(ctx, AV_LOG_ERROR, "input or output tensor is NULL\n");
        return DNN_GENERIC_ERROR;
    }
    inputs.push_back(*infer_request->input_tensor);

    /* fix: forward()/toTensor() throw C++ exceptions on scripting or
     * runtime errors; do not let them escape into the C callers. */
    try {
        *infer_request->output = th_model->jit_model->forward(inputs).toTensor();
    } catch (const std::exception& e) {
        av_log(ctx, AV_LOG_ERROR, "Torch model forward() failed: %s\n", e.what());
        return DNN_GENERIC_ERROR;
    }

    return 0;
}
/**
 * Post-inference callback: map the output tensor back onto the task's
 * output frame (or just record its dimensions when do_ioproc is off),
 * then recycle the request into the model's request pool.
 * Only 4-D [N, C, H, W] outputs are supported.
 */
static void infer_completion_callback(void *args) {
    THRequestItem *request = (THRequestItem*)args;
    LastLevelTaskItem *lltask = request->lltask;
    TaskItem *task = lltask->task;
    DNNData outputs = { 0 };
    THInferRequest *infer_request = request->infer_request;
    THModel *th_model = (THModel *)task->model;
    torch::Tensor *output = infer_request->output;

    c10::IntArrayRef sizes = output->sizes();
    outputs.order = DCO_RGB;
    outputs.layout = DL_NCHW;
    outputs.dt = DNN_FLOAT;
    if (sizes.size() == 4) {
        // 4 dimensions: [batch_size, channel, height, width]
        // this format of data is normally used for video frame SR
        outputs.dims[0] = sizes.at(0); // N
        outputs.dims[1] = sizes.at(1); // C
        outputs.dims[2] = sizes.at(2); // H
        outputs.dims[3] = sizes.at(3); // W
    } else {
        avpriv_report_missing_feature(&th_model->ctx, "Support of this kind of model");
        goto err;
    }

    switch (th_model->model->func_type) {
    case DFT_PROCESS_FRAME:
        if (task->do_ioproc) {
            outputs.scale = 255;
            outputs.data = output->data_ptr();
            if (th_model->model->frame_post_proc != NULL) {
                th_model->model->frame_post_proc(task->out_frame, &outputs, th_model->model->filter_ctx);
            } else {
                ff_proc_from_dnn_to_frame(task->out_frame, &outputs, &th_model->ctx);
            }
        } else {
            /* caller only wanted the output geometry (see get_output_th) */
            task->out_frame->width = outputs.dims[dnn_get_width_idx_by_layout(outputs.layout)];
            task->out_frame->height = outputs.dims[dnn_get_height_idx_by_layout(outputs.layout)];
        }
        break;
    default:
        avpriv_report_missing_feature(&th_model->ctx, "model function type %d", th_model->model->func_type);
        goto err;
    }
    /* mark this lltask complete; its wrapper is no longer needed */
    task->inference_done++;
    av_freep(&request->lltask);
err:
    th_free_request(infer_request);
    /* NOTE(review): this message also fires when the push fails on the
     * success path, where "failed to start inference" is misleading. */
    if (ff_safe_queue_push_back(th_model->request_queue, request) < 0) {
        destroy_request_item(&request);
        av_log(&th_model->ctx, AV_LOG_ERROR, "Unable to push back request_queue when failed to start inference.\n");
    }
}
/**
 * Execute the next queued last-level task on @p request. Execution is
 * synchronous; async mode only reports a missing feature.
 * On failure the request is recycled into the model's request pool, or
 * destroyed when no pool is reachable.
 *
 * @return 0 on success or empty queue, a negative AVERROR / DNN error
 *         code on failure.
 */
static int execute_model_th(THRequestItem *request, Queue *lltask_queue)
{
    THModel *th_model = NULL;
    LastLevelTaskItem *lltask;
    TaskItem *task = NULL;
    int ret = 0;

    if (ff_queue_size(lltask_queue) == 0) {
        destroy_request_item(&request);
        return 0;
    }

    lltask = (LastLevelTaskItem *)ff_queue_peek_front(lltask_queue);
    if (lltask == NULL) {
        av_log(NULL, AV_LOG_ERROR, "Failed to get LastLevelTaskItem\n");
        ret = AVERROR(EINVAL);
        goto err;
    }
    task = lltask->task;
    th_model = (THModel *)task->model;

    ret = fill_model_input_th(th_model, request);
    if ( ret != 0) {
        goto err;
    }
    if (task->async) {
        avpriv_report_missing_feature(&th_model->ctx, "LibTorch async");
    } else {
        ret = th_start_inference((void *)(request));
        if (ret != 0) {
            goto err;
        }
        infer_completion_callback(request);
        return (task->inference_done == task->inference_todo) ? 0 : DNN_GENERIC_ERROR;
    }

err:
    th_free_request(request->infer_request);
    /* fix: th_model is still NULL when peeking the lltask failed; the old
     * code dereferenced th_model->request_queue here and crashed. Destroy
     * the request when it cannot be returned to the pool. */
    if (!th_model || ff_safe_queue_push_back(th_model->request_queue, request) < 0) {
        destroy_request_item(&request);
    }
    return ret;
}
/**
 * Determine the model's output dimensions for a given input size by
 * running one dummy inference (ff_dnn_fill_gettingoutput_task allocates
 * the in/out frames). input_name and output_name are unused by the
 * Torch backend.
 */
static int get_output_th(void *model, const char *input_name, int input_width, int input_height,
                         const char *output_name, int *output_width, int *output_height)
{
    int ret = 0;
    THModel *th_model = (THModel*) model;
    THContext *ctx = &th_model->ctx;
    TaskItem task = { 0 };
    THRequestItem *request = NULL;
    DNNExecBaseParams exec_params = {
        .input_name = input_name,
        .output_names = &output_name,
        .nb_output = 1,
        .in_frame = NULL,
        .out_frame = NULL,
    };
    ret = ff_dnn_fill_gettingoutput_task(&task, &exec_params, th_model, input_height, input_width, ctx);
    if ( ret != 0) {
        goto err;
    }
    ret = extract_lltask_from_task(&task, th_model->lltask_queue);
    if ( ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "unable to extract last level task from task.\n");
        goto err;
    }
    request = (THRequestItem*) ff_safe_queue_pop_front(th_model->request_queue);
    if (!request) {
        av_log(ctx, AV_LOG_ERROR, "unable to get infer request.\n");
        ret = AVERROR(EINVAL);
        goto err;
    }
    ret = execute_model_th(request, th_model->lltask_queue);
    /* NOTE(review): out_frame dimensions are read even when
     * execute_model_th() failed; ret still propagates the error. */
    *output_width = task.out_frame->width;
    *output_height = task.out_frame->height;

err:
    av_frame_free(&task.out_frame);
    av_frame_free(&task.in_frame);
    return ret;
}
/* Allocate an empty inference request (tensors created lazily later).
 * Returns NULL on allocation failure. */
static THInferRequest *th_create_inference_request(void)
{
    THInferRequest *req = (THInferRequest *)av_malloc(sizeof(*req));

    if (req) {
        req->input_tensor = NULL;
        req->output = NULL;
    }
    return req;
}
/**
 * Load a TorchScript model from @p model_filename and set up the backend
 * state: option parsing, jit module load, request pool with one request,
 * and the task/lltask queues. Only the "cpu" device is supported.
 *
 * @return the initialized DNNModel, or NULL on failure (all partial
 *         allocations are released).
 */
static DNNModel *dnn_load_model_th(const char *model_filename, DNNFunctionType func_type, const char *options, AVFilterContext *filter_ctx)
{
    DNNModel *model = NULL;
    THModel *th_model = NULL;
    THRequestItem *item = NULL;
    THContext *ctx;

    model = (DNNModel *)av_mallocz(sizeof(DNNModel));
    if (!model) {
        return NULL;
    }

    th_model = (THModel *)av_mallocz(sizeof(THModel));
    if (!th_model) {
        av_freep(&model);
        return NULL;
    }
    th_model->model = model;
    model->model = th_model;
    th_model->ctx.c_class = &dnn_th_class;
    ctx = &th_model->ctx;

    //parse options
    av_opt_set_defaults(ctx);
    if (av_opt_set_from_string(ctx, options, NULL, "=", "&") < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to parse options \"%s\"\n", options);
        /* fix: previously returned NULL here, leaking model and th_model.
         * A "goto fail" is not possible: it would jump over the
         * initialization of the c10::Device below (ill-formed C++),
         * so free the partial state inline. */
        av_opt_free(ctx);
        av_freep(&th_model);
        av_freep(&model);
        return NULL;
    }

    c10::Device device = c10::Device(ctx->options.device_name);
    if (!device.is_cpu()) {
        av_log(ctx, AV_LOG_ERROR, "Not supported device:\"%s\"\n", ctx->options.device_name);
        goto fail;
    }

    try {
        th_model->jit_model = new torch::jit::Module;
        (*th_model->jit_model) = torch::jit::load(model_filename);
    } catch (const c10::Error& e) {
        av_log(ctx, AV_LOG_ERROR, "Failed to load torch model\n");
        goto fail;
    }

    th_model->request_queue = ff_safe_queue_create();
    if (!th_model->request_queue) {
        goto fail;
    }

    item = (THRequestItem *)av_mallocz(sizeof(THRequestItem));
    if (!item) {
        goto fail;
    }
    item->lltask = NULL;
    item->infer_request = th_create_inference_request();
    if (!item->infer_request) {
        av_log(NULL, AV_LOG_ERROR, "Failed to allocate memory for Torch inference request\n");
        goto fail;
    }
    item->exec_module.start_inference = &th_start_inference;
    item->exec_module.callback = &infer_completion_callback;
    item->exec_module.args = item;

    if (ff_safe_queue_push_back(th_model->request_queue, item) < 0) {
        goto fail;
    }
    item = NULL;  /* now owned by request_queue */

    th_model->task_queue = ff_queue_create();
    if (!th_model->task_queue) {
        goto fail;
    }

    th_model->lltask_queue = ff_queue_create();
    if (!th_model->lltask_queue) {
        goto fail;
    }

    model->get_input = &get_input_th;
    model->get_output = &get_output_th;
    model->options = NULL;
    model->filter_ctx = filter_ctx;
    model->func_type = func_type;
    return model;

fail:
    if (item) {
        /* destroy_request_item() already NULLs item; the old extra
         * av_freep(&item) afterwards was redundant */
        destroy_request_item(&item);
    }
    dnn_free_model_th(&model);
    return NULL;
}
/**
 * Queue one execution task for the model and run it synchronously.
 * The TaskItem is pushed onto task_queue (freed later in
 * dnn_free_model_th / consumed by dnn_get_result_th) and wrapped into an
 * lltask before a pooled request executes it.
 *
 * @return 0 on success, a negative AVERROR / DNN error code on failure.
 */
static int dnn_execute_model_th(const DNNModel *model, DNNExecBaseParams *exec_params)
{
    THModel *th_model = (THModel *)model->model;
    THContext *ctx = &th_model->ctx;
    TaskItem *task;
    THRequestItem *request;
    int ret = 0;

    ret = ff_check_exec_params(ctx, DNN_TH, model->func_type, exec_params);
    if (ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "exec parameter checking fail.\n");
        return ret;
    }

    task = (TaskItem *)av_malloc(sizeof(TaskItem));
    if (!task) {
        av_log(ctx, AV_LOG_ERROR, "unable to alloc memory for task item.\n");
        return AVERROR(ENOMEM);
    }

    /* async=0, do_ioproc=1: this backend only runs synchronously */
    ret = ff_dnn_fill_task(task, exec_params, th_model, 0, 1);
    if (ret != 0) {
        av_freep(&task);
        av_log(ctx, AV_LOG_ERROR, "unable to fill task.\n");
        return ret;
    }

    ret = ff_queue_push_back(th_model->task_queue, task);
    if (ret < 0) {
        av_freep(&task);
        av_log(ctx, AV_LOG_ERROR, "unable to push back task_queue.\n");
        return ret;
    }

    /* from here on the task is owned by task_queue and is cleaned up
     * with the model even when the steps below fail */
    ret = extract_lltask_from_task(task, th_model->lltask_queue);
    if (ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "unable to extract last level task from task.\n");
        return ret;
    }

    request = (THRequestItem *)ff_safe_queue_pop_front(th_model->request_queue);
    if (!request) {
        av_log(ctx, AV_LOG_ERROR, "unable to get infer request.\n");
        return AVERROR(EINVAL);
    }

    return execute_model_th(request, th_model->lltask_queue);
}
/* Hand back the next finished task's frames via the generic helper. */
static DNNAsyncStatusType dnn_get_result_th(const DNNModel *model, AVFrame **in, AVFrame **out)
{
    THModel *priv = (THModel *)model->model;

    return ff_dnn_get_result_common(priv->task_queue, in, out);
}
/* Run any still-queued last-level task to completion (synchronously). */
static int dnn_flush_th(const DNNModel *model)
{
    THModel *th_model = (THModel *)model->model;
    THRequestItem *request;

    if (!ff_queue_size(th_model->lltask_queue))
        return 0;  /* no pending task need to flush */

    request = (THRequestItem *)ff_safe_queue_pop_front(th_model->request_queue);
    if (!request) {
        av_log(&th_model->ctx, AV_LOG_ERROR, "unable to get infer request.\n");
        return AVERROR(EINVAL);
    }

    return execute_model_th(request, th_model->lltask_queue);
}
/* Backend vtable exposed to ff_get_dnn_module() for DNN_TH; extern with
 * an initializer gives the constant external linkage in C++. */
extern const DNNModule ff_dnn_backend_torch = {
    .load_model = dnn_load_model_th,
    .execute_model = dnn_execute_model_th,
    .get_result = dnn_get_result_th,
    .flush = dnn_flush_th,
    .free_model = dnn_free_model_th,
};

View File

@ -28,6 +28,7 @@
extern const DNNModule ff_dnn_backend_openvino;
extern const DNNModule ff_dnn_backend_tf;
extern const DNNModule ff_dnn_backend_torch;
const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx)
{
@ -40,6 +41,10 @@ const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx)
case DNN_OV:
return &ff_dnn_backend_openvino;
#endif
#if (CONFIG_LIBTORCH == 1)
case DNN_TH:
return &ff_dnn_backend_torch;
#endif
default:
av_log(log_ctx, AV_LOG_ERROR,
"Module backend_type %d is not supported or enabled.\n",

View File

@ -53,12 +53,22 @@ static char **separate_output_names(const char *expr, const char *val_sep, int *
int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx)
{
DNNBackendType backend = ctx->backend_type;
if (!ctx->model_filename) {
av_log(filter_ctx, AV_LOG_ERROR, "model file for network is not specified\n");
return AVERROR(EINVAL);
}
if (ctx->backend_type == DNN_TF) {
if (backend == DNN_TH) {
if (ctx->model_inputname)
av_log(filter_ctx, AV_LOG_WARNING, "LibTorch backend do not require inputname, "\
"inputname will be ignored.\n");
if (ctx->model_outputnames)
av_log(filter_ctx, AV_LOG_WARNING, "LibTorch backend do not require outputname(s), "\
"all outputname(s) will be ignored.\n");
ctx->nb_outputs = 1;
} else if (backend == DNN_TF) {
if (!ctx->model_inputname) {
av_log(filter_ctx, AV_LOG_ERROR, "input name of the model network is not specified\n");
return AVERROR(EINVAL);
@ -115,7 +125,8 @@ int ff_dnn_get_input(DnnContext *ctx, DNNData *input)
int ff_dnn_get_output(DnnContext *ctx, int input_width, int input_height, int *output_width, int *output_height)
{
char * output_name = ctx->model_outputnames ? ctx->model_outputnames[0] : NULL;
char * output_name = ctx->model_outputnames && ctx->backend_type != DNN_TH ?
ctx->model_outputnames[0] : NULL;
return ctx->model->get_output(ctx->model->model, ctx->model_inputname, input_width, input_height,
(const char *)output_name, output_width, output_height);
}

Some files were not shown because too many files have changed in this diff Show More